34 files changed, 8341 insertions, 0 deletions
diff --git a/circuitpython/extmod/ulab/code/numpy/approx.c b/circuitpython/extmod/ulab/code/numpy/approx.c
new file mode 100644
index 0000000..85cdbf7
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/approx.c
@@ -0,0 +1,227 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020-2021 Zoltán Vörös
+ *               2020 Diego Elio Pettenò
+ *               2020 Taku Fukada
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+#include "py/misc.h"
+
+#include "../ulab.h"
+#include "../ulab_tools.h"
+#include "carray/carray_tools.h"
+#include "approx.h"
+
+//| """Numerical approximation methods"""
+//|
+
+// The float 1.0 wrapped in a constant float object. It is referenced from the argument
+// table of trapz() via MP_ROM_PTR as the default value of the `dx` keyword.
+const mp_obj_float_t approx_trapz_dx = {{&mp_type_float}, MICROPY_FLOAT_CONST(1.0)};
+
+#if ULAB_NUMPY_HAS_INTERP
+//| def interp(
+//|     x: ulab.numpy.ndarray,
+//|     xp: ulab.numpy.ndarray,
+//|     fp: ulab.numpy.ndarray,
+//|     *,
+//|     left: Optional[_float] = None,
+//|     right: Optional[_float] = None
+//| ) -> ulab.numpy.ndarray:
+//|     """
+//|     :param ulab.numpy.ndarray x: The x-coordinates at which to evaluate the interpolated values.
+//|     :param ulab.numpy.ndarray xp: The x-coordinates of the data points, must be increasing
+//|     :param ulab.numpy.ndarray fp: The y-coordinates of the data points, same length as xp
+//|     :param left: Value to return for ``x < xp[0]``, default is ``fp[0]``.
+//|     :param right: Value to return for ``x > xp[-1]``, default is ``fp[-1]``.
+//|
+//|     Returns the one-dimensional piecewise linear interpolant to a function with given discrete data points (xp, fp), evaluated at x."""
+//|     ...
+//|
+
+// interp(x, xp, fp, *, left=None, right=None)
+//
+// Evaluates the piecewise linear interpolant through the points (xp, fp) at every
+// element of x, and returns the values in a new linear float array of x->len elements.
+//   - elements of x below xp[0] are mapped to `left` (fp[0], if `left` is None)
+//   - elements of x above xp[-1] are mapped to `right` (fp[-1], if `right` is None)
+// Raises ValueError, unless xp and fp are 1D, of equal length, and hold at least two elements.
+// Complex dtypes are rejected by COMPLEX_DTYPE_NOT_IMPLEMENTED.
+STATIC mp_obj_t approx_interp(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    enum { ARG_x, ARG_xp, ARG_fp, ARG_left, ARG_right };
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        { MP_QSTR_left, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} },
+        { MP_QSTR_right, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} },
+    };
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    ndarray_obj_t *x = ndarray_from_mp_obj(args[ARG_x].u_obj, 0);
+    // xp must hold an increasing sequence of independent values
+    ndarray_obj_t *xp = ndarray_from_mp_obj(args[ARG_xp].u_obj, 0);
+    ndarray_obj_t *fp = ndarray_from_mp_obj(args[ARG_fp].u_obj, 0);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(xp->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(fp->dtype)
+
+    if(!((xp->ndim == 1) && (fp->ndim == 1) && (xp->len >= 2) && (fp->len >= 2) && (xp->len == fp->len))) {
+        mp_raise_ValueError(translate("interp is defined for 1D iterables of equal length"));
+    }
+
+    ndarray_obj_t *result = ndarray_new_linear_array(x->len, NDARRAY_FLOAT);
+
+    uint8_t *xp_base = (uint8_t *)xp->array;
+    uint8_t *fp_base = (uint8_t *)fp->array;
+    // index of the last data point; valid for both xp and fp, because their lengths are equal
+    size_t last = xp->len - 1;
+
+    // the range that is covered by the data points
+    mp_float_t xp_first = ndarray_get_float_value(xp_base, xp->dtype);
+    mp_float_t xp_last = ndarray_get_float_value(xp_base + last * xp->strides[ULAB_MAX_DIMS - 1], xp->dtype);
+
+    // the values that are returned outside of the covered range
+    mp_float_t below = (args[ARG_left].u_obj == mp_const_none)
+        ? ndarray_get_float_value(fp_base, fp->dtype)
+        : mp_obj_get_float(args[ARG_left].u_obj);
+    mp_float_t above = (args[ARG_right].u_obj == mp_const_none)
+        ? ndarray_get_float_value(fp_base + last * fp->strides[ULAB_MAX_DIMS - 1], fp->dtype)
+        : mp_obj_get_float(args[ARG_right].u_obj);
+
+    uint8_t *xptr = (uint8_t *)x->array;
+    mp_float_t *out = (mp_float_t *)result->array;
+
+    for(size_t n = 0; n < x->len; n++) {
+        mp_float_t xv = ndarray_get_float_value(xptr, x->dtype);
+        xptr += x->strides[ULAB_MAX_DIMS - 1];
+
+        if(xv < xp_first) {
+            out[n] = below;
+            continue;
+        }
+        if(xv > xp_last) {
+            out[n] = above;
+            continue;
+        }
+
+        // bisection: shrink [lo, hi], until the two indices are neighbours
+        size_t lo = 0, hi = last;
+        while(hi - lo > 1) {
+            size_t mid = lo + (hi - lo) / 2;
+            mp_float_t xp_mid = ndarray_get_float_value(xp_base + mid * xp->strides[ULAB_MAX_DIMS - 1], xp->dtype);
+            if(xv <= xp_mid) {
+                hi = mid;
+            } else {
+                lo = mid;
+            }
+        }
+
+        // the end points of the segment that brackets xv
+        mp_float_t x0 = ndarray_get_float_value(xp_base + lo * xp->strides[ULAB_MAX_DIMS - 1], xp->dtype);
+        mp_float_t x1 = ndarray_get_float_value(xp_base + hi * xp->strides[ULAB_MAX_DIMS - 1], xp->dtype);
+        mp_float_t f0 = ndarray_get_float_value(fp_base + lo * fp->strides[ULAB_MAX_DIMS - 1], fp->dtype);
+        mp_float_t f1 = ndarray_get_float_value(fp_base + hi * fp->strides[ULAB_MAX_DIMS - 1], fp->dtype);
+
+        out[n] = f0 + (xv - x0) * (f1 - f0) / (x1 - x0);
+    }
+    return MP_OBJ_FROM_PTR(result);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(approx_interp_obj, 2, approx_interp);
+#endif
+
+#if ULAB_NUMPY_HAS_TRAPZ
+//| def trapz(y: ulab.numpy.ndarray, x: Optional[ulab.numpy.ndarray] = None, dx: _float = 1.0) -> _float:
+//|     """
+//|     :param 1D ulab.numpy.ndarray y: the values of the dependent variable
+//|     :param 1D ulab.numpy.ndarray x: optional, the coordinates of the independent variable. Defaults to uniformly spaced values.
+//|     :param float dx: the spacing between sample points, if x=None
+//|
+//|     Returns the integral of y(x) using the trapezoidal rule.
+//|     """
+//|     ...
+//|
+
+// trapz(y, x=None, dx=1.0)
+//
+// Integrates the 1D array y with the trapezoidal rule, and returns the result as a float.
+//   y:  samples of the dependent variable; complex dtypes are rejected
+//   x:  optional sample positions; must be 1D, and of the same length as y
+//   dx: uniform spacing of the samples; used only, if x is None
+// Inputs with fewer than two elements integrate to 0.0.
+// Raises ValueError, if y is not 1D, or if x is not a 1D array of the same length as y.
+//
+// The terms (x2 - x1) * (y2 + y1) are accumulated as a running mean, and the mean is
+// converted to the integral at the very end as 0.5 * mean * (number of intervals).
+STATIC mp_obj_t approx_trapz(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        { MP_QSTR_x, MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        { MP_QSTR_dx, MP_ARG_OBJ, {.u_rom_obj = MP_ROM_PTR(&approx_trapz_dx)} },
+    };
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    ndarray_obj_t *y = ndarray_from_mp_obj(args[0].u_obj, 0);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(y->dtype)
+    mp_float_t mean = MICROPY_FLOAT_CONST(0.0);
+    if(y->len < 2) {
+        // there is no interval to integrate over
+        return mp_obj_new_float(mean);
+    }
+    if((y->ndim != 1)) {
+        mp_raise_ValueError(translate("trapz is defined for 1D iterables"));
+    }
+
+    mp_float_t (*funcy)(void *) = ndarray_get_float_function(y->dtype);
+    uint8_t *yarray = (uint8_t *)y->array;
+
+    size_t count = 1;
+    mp_float_t y1, y2;
+
+    if(args[1].u_obj != mp_const_none) {
+        // x must hold an increasing sequence of independent values
+        ndarray_obj_t *x = ndarray_from_mp_obj(args[1].u_obj, 0);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype)
+        if((x->ndim != 1) || (y->len != x->len)) {
+            mp_raise_ValueError(translate("trapz is defined for 1D arrays of equal length"));
+        }
+
+        mp_float_t (*funcx)(void *) = ndarray_get_float_function(x->dtype);
+        uint8_t *xarray = (uint8_t *)x->array;
+        mp_float_t x1, x2;
+
+        y1 = funcy(yarray);
+        yarray += y->strides[ULAB_MAX_DIMS - 1];
+        x1 = funcx(xarray);
+        xarray += x->strides[ULAB_MAX_DIMS - 1];
+
+        for(size_t i=1; i < y->len; i++) {
+            y2 = funcy(yarray);
+            yarray += y->strides[ULAB_MAX_DIMS - 1];
+            x2 = funcx(xarray);
+            xarray += x->strides[ULAB_MAX_DIMS - 1];
+            mp_float_t value = (x2 - x1) * (y2 + y1);
+            mean += (value - mean) / (mp_float_t)count;
+            x1 = x2;
+            y1 = y2;
+            count++;
+        }
+    } else {
+        mp_float_t dx = mp_obj_get_float(args[2].u_obj);
+        y1 = funcy(yarray);
+        yarray += y->strides[ULAB_MAX_DIMS - 1];
+
+        for(size_t i=1; i < y->len; i++) {
+            // The samples have to be read through the stride of y, exactly as in the
+            // branch above: indexing y->array by i is correct for dense arrays only,
+            // and picks up the wrong elements for views such as y[::2].
+            y2 = funcy(yarray);
+            yarray += y->strides[ULAB_MAX_DIMS - 1];
+            mp_float_t value = (y2 + y1);
+            mean += (value - mean) / (mp_float_t)count;
+            y1 = y2;
+            count++;
+        }
+        // the spacing is uniform, so it can be factored out of the sum
+        mean *= dx;
+    }
+    return mp_obj_new_float(MICROPY_FLOAT_CONST(0.5)*mean*(y->len-1));
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(approx_trapz_obj, 1, approx_trapz);
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/approx.h b/circuitpython/extmod/ulab/code/numpy/approx.h
new file mode 100644
index 0000000..487a98b
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/approx.h
@@ -0,0 +1,29 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020-2021 Zoltán Vörös
+*/
+
+// Include guard of approx.h
+#ifndef _APPROX_
+#define _APPROX_
+
+#include "../ulab.h"
+#include "../ndarray.h"
+
+// Floating point tuning constants. None of them is referenced in approx.c.
+// NOTE(review): presumably they are tolerances and step coefficients that are consumed
+// by other modules including this header - confirm, before changing or removing them.
+#define     APPROX_EPS          MICROPY_FLOAT_CONST(1.0e-4)
+#define     APPROX_NONZDELTA    MICROPY_FLOAT_CONST(0.05)
+#define     APPROX_ZDELTA       MICROPY_FLOAT_CONST(0.00025)
+#define     APPROX_ALPHA        MICROPY_FLOAT_CONST(1.0)
+#define     APPROX_BETA         MICROPY_FLOAT_CONST(2.0)
+#define     APPROX_GAMMA        MICROPY_FLOAT_CONST(0.5)
+#define     APPROX_DELTA        MICROPY_FLOAT_CONST(0.5)
+
+// Function objects of interp() and trapz(). The definitions in approx.c are compiled only,
+// if ULAB_NUMPY_HAS_INTERP, and ULAB_NUMPY_HAS_TRAPZ, respectively, are set, while the
+// declarations here are unconditional.
+MP_DECLARE_CONST_FUN_OBJ_KW(approx_interp_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(approx_trapz_obj);
+
+#endif  /* _APPROX_ */
diff --git a/circuitpython/extmod/ulab/code/numpy/carray/carray.c b/circuitpython/extmod/ulab/code/numpy/carray/carray.c
new file mode 100644
index 0000000..a5f8a2b
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/carray/carray.c
@@ -0,0 +1,826 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2021-2022 Zoltán Vörös
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/objint.h"
+#include "py/runtime.h"
+#include "py/builtin.h"
+#include "py/misc.h"
+
+#include "../../ulab.h"
+#include "../../ndarray.h"
+#include "../../ulab_tools.h"
+#include "carray.h"
+
+#if ULAB_SUPPORTS_COMPLEX
+
+//| import ulab.numpy
+
+//| def real(val):
+//|     """
+//|     Return the real part of the argument, which must be an ndarray
+//|     (any other type raises NotImplementedError)."""
+//|     ...
+//|
+
+// real(val): returns the real part of an ndarray in a new dense array.
+// The result keeps the dtype of the input, except for complex input, for which it is a
+// float array. In both cases the data are transferred by ndarray_copy_array with offset 0.
+// Raises NotImplementedError, if the argument is not an ndarray.
+mp_obj_t carray_real(mp_obj_t _source) {
+    if(!mp_obj_is_type(_source, &ulab_ndarray_type)) {
+        mp_raise_NotImplementedError(translate("function is implemented for ndarrays only"));
+    }
+
+    ndarray_obj_t *source = MP_OBJ_TO_PTR(_source);
+    // only the dtype of the result depends on whether the input is complex
+    uint8_t dtype = (source->dtype == NDARRAY_COMPLEX) ? NDARRAY_FLOAT : source->dtype;
+    ndarray_obj_t *target = ndarray_new_dense_ndarray(source->ndim, source->shape, dtype);
+    ndarray_copy_array(source, target, 0);
+    return MP_OBJ_FROM_PTR(target);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(carray_real_obj, carray_real);
+
+//| def imag(val):
+//|     """
+//|     Return the imaginary part of the argument, which must be an ndarray
+//|     (any other type raises NotImplementedError)."""
+//|     ...
+//|
+
+// imag(val): returns the imaginary part of an ndarray in a new dense array.
+// Complex input produces a float array that is filled by ndarray_copy_array, starting at
+// the byte offset itemsize / 2 of each element. For any other dtype, a freshly created
+// array with the dtype and shape of the input is returned without copying anything.
+// Raises NotImplementedError, if the argument is not an ndarray.
+mp_obj_t carray_imag(mp_obj_t _source) {
+    if(!mp_obj_is_type(_source, &ulab_ndarray_type)) {
+        mp_raise_NotImplementedError(translate("function is implemented for ndarrays only"));
+    }
+
+    ndarray_obj_t *source = MP_OBJ_TO_PTR(_source);
+    if(source->dtype == NDARRAY_COMPLEX) {
+        ndarray_obj_t *imaginary = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT);
+        ndarray_copy_array(source, imaginary, source->itemsize / 2);
+        return MP_OBJ_FROM_PTR(imaginary);
+    }
+
+    // not complex, hence the imaginary part is zero
+    // NOTE(review): this relies on ndarray_new_dense_ndarray returning a zeroed array - confirm
+    ndarray_obj_t *zeros = ndarray_new_dense_ndarray(source->ndim, source->shape, source->dtype);
+    return MP_OBJ_FROM_PTR(zeros);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(carray_imag_obj, carray_imag);
+
+#if ULAB_NUMPY_HAS_CONJUGATE
+
+//| def conjugate(val):
+//|     """
+//|     Return the conjugate of the complex argument, which can be
+//|     either an ndarray, or a scalar."""
+//|     ...
+//|
+// conjugate(val): returns the complex conjugate of the argument.
+//   - ndarray: a dense copy with the dtype of the input; for complex arrays, the
+//     imaginary part of each element of the copy is multiplied by -1
+//   - complex scalar: a new complex number with the imaginary part multiplied by -1
+//   - int or float scalar: the argument itself
+// Anything else raises TypeError.
+mp_obj_t carray_conjugate(mp_obj_t _source) {
+    if(mp_obj_is_type(_source, &ulab_ndarray_type)) {
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(_source);
+        ndarray_obj_t *result = ndarray_new_dense_ndarray(source->ndim, source->shape, source->dtype);
+        ndarray_copy_array(source, result, 0);
+        if(source->dtype == NDARRAY_COMPLEX) {
+            // the copy is dense, and the imaginary parts sit at the odd float positions
+            mp_float_t *parts = (mp_float_t *)result->array;
+            for(size_t n = 0; n < result->len; n++) {
+                parts[2 * n + 1] *= MICROPY_FLOAT_CONST(-1.0);
+            }
+        }
+        return MP_OBJ_FROM_PTR(result);
+    }
+
+    if(mp_obj_is_type(_source, &mp_type_complex)) {
+        mp_float_t re, im;
+        mp_obj_get_complex(_source, &re, &im);
+        return mp_obj_new_complex(re, im * MICROPY_FLOAT_CONST(-1.0));
+    }
+
+    if(mp_obj_is_int(_source) || mp_obj_is_float(_source)) {
+        // real scalars are their own conjugates
+        return _source;
+    }
+
+    mp_raise_TypeError(translate("input must be an ndarray, or a scalar"));
+    // this should never happen
+    return mp_const_none;
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(carray_conjugate_obj, carray_conjugate);
+#endif
+
+#if ULAB_NUMPY_HAS_SORT_COMPLEX
+//| def sort_complex(a: ulab.numpy.ndarray) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: a
+//|       a one-dimensional ndarray
+//|
+//|     Sort a complex array using the real part first, then the imaginary part.
+//|     Always returns a sorted complex array, even if the input was real."""
+//|     ...
+//|
+
+// In-place heap sort of `len` complex numbers.
+//
+// array is assumed to be a floating vector containing the real and imaginary parts
+// of a complex array at alternating positions as
+// array[0] = real[0]
+// array[1] = imag[0]
+// array[2] = real[1]
+// array[3] = imag[1]
+//
+// The elements are ordered by ascending real part; ties are resolved by the imaginary part.
+static void carray_sort_complex_(mp_float_t *array, size_t len) {
+    // Zero or one element is sorted by definition. The guard is essential for len == 0:
+    // without it, the `q--` below wraps around to SIZE_MAX, and the loop then reads and
+    // writes memory far outside of the array.
+    if(len < 2) {
+        return;
+    }
+
+    // (real, imag) is the element that is sifted down in the current iteration
+    mp_float_t real, imag;
+    // r: start of the heap-building phase, q: size of the still unsorted heap,
+    // p: parent position, c: child position
+    size_t c, q = len, p, r = len >> 1;
+    for (;;) {
+        if (r > 0) {
+            // first phase: build the max-heap by sifting down the elements r-1, ..., 0
+            r--;
+            real = array[2 * r];
+            imag = array[2 * r + 1];
+        } else {
+            // second phase: move the maximum to the end of the heap, and
+            // re-insert the element that it displaced
+            q--;
+            if(q == 0) {
+                break;
+            }
+            real = array[2 * q];
+            imag = array[2 * q + 1];
+            array[2 * q] = array[0];
+            array[2 * q + 1] = array[1];
+        }
+        p = r;
+        c = r + r + 1;
+        while (c < q) {
+            // pick the larger of the two children
+            if(c + 1 < q) {
+                if((array[2 * (c+1)] > array[2 * c]) ||
+                    ((array[2 * (c+1)] == array[2 * c]) && (array[2 * (c+1) + 1] > array[2 * c + 1]))) {
+                    c++;
+                }
+            }
+            // move the child up, as long as it is larger than the element being sifted
+            if((array[2 * c] > real) ||
+                ((array[2 * c] == real) && (array[2 * c + 1] > imag))) {
+                array[2 * p] = array[2 * c]; // real part
+                array[2 * p + 1] = array[2 * c + 1]; // imag part
+                p = c;
+                c = p + p + 1;
+            } else {
+                break;
+            }
+        }
+        array[2 * p] = real;
+        array[2 * p + 1] = imag;
+    }
+}
+
+// sort_complex(a): returns a sorted complex copy of the 1D ndarray `a`.
+// The input is first converted to a complex array by ndarray_copy_view_convert_type,
+// and the copy is then sorted in place by carray_sort_complex_.
+// Raises TypeError, if the argument is not an ndarray, or if it is not one-dimensional.
+mp_obj_t carray_sort_complex(mp_obj_t _source) {
+    // the pointer is dereferenced only, if the type check has passed (short-circuit ||);
+    // both failure modes are reported with the same message
+    if(!mp_obj_is_type(_source, &ulab_ndarray_type) ||
+        (((ndarray_obj_t *)MP_OBJ_TO_PTR(_source))->ndim != 1)) {
+        mp_raise_TypeError(translate("input must be a 1D ndarray"));
+    }
+
+    ndarray_obj_t *sorted = ndarray_copy_view_convert_type(MP_OBJ_TO_PTR(_source), NDARRAY_COMPLEX);
+    carray_sort_complex_((mp_float_t *)sorted->array, sorted->len);
+    return MP_OBJ_FROM_PTR(sorted);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(carray_sort_complex_obj, carray_sort_complex);
+#endif
+
+//| def abs(a: ulab.numpy.ndarray) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: a
+//|       a one-dimensional ndarray
+//|
+//|     Return the absolute value of complex ndarray."""
+//|     ...
+//|
+
+// Writes the modulus sqrt(re^2 + im^2) of every element of `source` into `target`, and
+// returns `target` as a MicroPython object.
+//
+// Each element of `source` is read as two consecutive float components (real part first);
+// nothing in this function checks that `source` is actually complex. `source` is traversed
+// through its strides, while `target` is filled contiguously from its first element.
+// NOTE(review): `target` is presumably a dense float array allocated by the caller with
+// the shape of `source` - confirm against the call site.
+mp_obj_t carray_abs(ndarray_obj_t *source, ndarray_obj_t *target) {
+    // calculates the absolute value of a complex array and returns a dense array
+    uint8_t *sarray = (uint8_t *)source->array;
+    mp_float_t *tarray = (mp_float_t *)target->array;
+    // byte distance between the real and the imaginary component of an element
+    uint8_t itemsize = mp_binary_get_size('@', NDARRAY_FLOAT, NULL);
+
+    // one do/while loop per compiled-in dimension; the innermost loop runs over the last axis
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                do {
+                    mp_float_t rvalue = *(mp_float_t *)sarray;
+                    mp_float_t ivalue = *(mp_float_t *)(sarray + itemsize);
+                    *tarray++ = MICROPY_FLOAT_C_FUN(sqrt)(rvalue * rvalue + ivalue * ivalue);
+                    sarray += source->strides[ULAB_MAX_DIMS - 1];
+                    l++;
+                } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+            #if ULAB_MAX_DIMS > 1
+                // rewind the axis that has just been finished, then step along the next outer axis
+                sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                sarray += source->strides[ULAB_MAX_DIMS - 2];
+                k++;
+            } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 2
+            sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+            sarray += source->strides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+        sarray += source->strides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+    #endif
+    return MP_OBJ_FROM_PTR(target);
+}
+
+// Copies one float component per element from a strided source into a dense complex buffer.
+//
+//   tarray:  destination; it is advanced by 2 * sizeof(mp_float_t) after each element, so
+//            only every second float slot is written. Whether that is the real or the
+//            imaginary slot is determined by where the caller points tarray initially.
+//   sarray:  source; must point to the component to be copied of the first element
+//   shape:   number of elements along each of the ULAB_MAX_DIMS axes
+//   strides: strides of the source in bytes (they are added to a uint8_t pointer)
+static void carray_copy_part(uint8_t *tarray, uint8_t *sarray, size_t *shape, int32_t *strides) {
+    // copies the real or imaginary part of an array
+    // into the respective part of a dense complex array
+    uint8_t sz = sizeof(mp_float_t);
+
+    // one do/while loop per compiled-in dimension; the innermost loop runs over the last axis
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                do {
+                    memcpy(tarray, sarray, sz);
+                    tarray += 2 * sz; // skip the other component of the complex target
+                    sarray += strides[ULAB_MAX_DIMS - 1];
+                    l++;
+                } while(l < shape[ULAB_MAX_DIMS - 1]);
+            #if ULAB_MAX_DIMS > 1
+                // rewind the axis that has just been finished, then step along the next outer axis
+                sarray -= strides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS-1];
+                sarray += strides[ULAB_MAX_DIMS - 2];
+                k++;
+            } while(k < shape[ULAB_MAX_DIMS - 2]);
+            #endif /* ULAB_MAX_DIMS > 1 */
+        #if ULAB_MAX_DIMS > 2
+            sarray -= strides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS-2];
+            sarray += strides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j < shape[ULAB_MAX_DIMS - 3]);
+        #endif /* ULAB_MAX_DIMS > 2 */
+    #if ULAB_MAX_DIMS > 3
+        sarray -= strides[ULAB_MAX_DIMS - 3] * shape[ULAB_MAX_DIMS-3];
+        sarray += strides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i < shape[ULAB_MAX_DIMS - 4]);
+    #endif /* ULAB_MAX_DIMS > 3 */
+}
+
+// Element-wise == / != for a pair of operands, of which at least one is complex.
+//
+//   lhs, rhs:           the operands
+//   ndim, shape:        dimensions of the result
+//   lstrides, rstrides: strides that are used for walking lhs and rhs
+//                       NOTE: the arrays are modified in place by ulab_rescale_float_strides
+//   op:                 MP_BINARY_OP_NOT_EQUAL selects !=; any other value is treated as ==
+//
+// Returns a new dense uint8 array with the boolean flag set. For !=, the result is
+// pre-filled with 1, and the entry of each matching pair is toggled; for ==, the entries
+// are toggled starting from the state in which ndarray_new_dense_ndarray leaves the array
+// (NOTE(review): presumably zeros - confirm).
+mp_obj_t carray_binary_equal_not_equal(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
+                            uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides, mp_binary_op_t op) {
+
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT8);
+    results->boolean = 1;
+    uint8_t *array = (uint8_t *)results->array;
+
+    if(op == MP_BINARY_OP_NOT_EQUAL) {
+        memset(array, 1, results->len);
+    }
+
+    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        mp_float_t *rarray = (mp_float_t *)rhs->array;
+
+        // from here on, the strides are added to mp_float_t pointers
+        // NOTE(review): presumably the call converts byte strides to units of mp_float_t - confirm
+        ulab_rescale_float_strides(lstrides);
+        ulab_rescale_float_strides(rstrides);
+
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        // equal, if both the real and the imaginary parts agree; the XOR turns
+                        // a pre-filled 1 (!=) into 0, and everything else into the opposite bit
+                        if((larray[0] == rarray[0]) && (larray[1] == rarray[1])) {
+                            *array ^= 0x01;
+                        }
+                        array++;
+                        larray += lstrides[ULAB_MAX_DIMS - 1];
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    // rewind the axis that has just been finished, then step along the next outer axis
+                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    larray += lstrides[ULAB_MAX_DIMS - 2];
+                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    rarray += rstrides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
+                #endif /* ULAB_MAX_DIMS > 1 */
+            #if ULAB_MAX_DIMS > 2
+                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                larray += lstrides[ULAB_MAX_DIMS - 3];
+                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                rarray += rstrides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
+            #endif /* ULAB_MAX_DIMS > 2 */
+        #if ULAB_MAX_DIMS > 3
+            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            larray += lstrides[ULAB_MAX_DIMS - 4];
+            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            rarray += rstrides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
+        #endif /* ULAB_MAX_DIMS > 3 */
+    } else { // only one of the operands is complex
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        uint8_t *rarray = (uint8_t *)rhs->array;
+
+        // align the complex array to the left
+        uint8_t rdtype = rhs->dtype;
+        int32_t *lstrides_ = lstrides;
+        int32_t *rstrides_ = rstrides;
+
+        // the comparison is symmetric, so the operands can simply be swapped
+        if(rhs->dtype == NDARRAY_COMPLEX) {
+            larray = (mp_float_t *)rhs->array;
+            rarray = (uint8_t *)lhs->array;
+            lstrides_ = rstrides;
+            rstrides_ = lstrides;
+            rdtype = lhs->dtype;
+        }
+
+        // only the strides of the complex operand are rescaled; the real operand is
+        // still walked through a uint8_t pointer
+        ulab_rescale_float_strides(lstrides_);
+
+        // the loop macro is instantiated with the C type of the real operand; a dtype that
+        // is not listed here leaves the results untouched
+        if(rdtype == NDARRAY_UINT8) {
+            BINARY_LOOP_COMPLEX_EQUAL(results, array, uint8_t, larray, lstrides_, rarray, rstrides_);
+        } else if(rdtype == NDARRAY_INT8) {
+            BINARY_LOOP_COMPLEX_EQUAL(results, array, int8_t, larray, lstrides_, rarray, rstrides_);
+        } else if(rdtype == NDARRAY_UINT16) {
+            BINARY_LOOP_COMPLEX_EQUAL(results, array, uint16_t, larray, lstrides_, rarray, rstrides_);
+        } else if(rdtype == NDARRAY_INT16) {
+            BINARY_LOOP_COMPLEX_EQUAL(results, array, int16_t, larray, lstrides_, rarray, rstrides_);
+        } else if(rdtype == NDARRAY_FLOAT) {
+            BINARY_LOOP_COMPLEX_EQUAL(results, array, mp_float_t, larray, lstrides_, rarray, rstrides_);
+        }
+    }
+    return MP_OBJ_FROM_PTR(results);
+}
+
+// Element-wise addition for a pair of operands, of which at least one is complex.
+//
+//   lhs, rhs:           the operands
+//   ndim, shape:        dimensions of the result
+//   lstrides, rstrides: strides that are used for walking lhs and rhs
+//                       NOTE: if both operands are complex, the arrays are modified in
+//                       place by ulab_rescale_float_strides
+//
+// Returns a new dense complex array. If only one operand is complex, the real parts are
+// produced by the BINARY_LOOP_COMPLEX macro, and the imaginary parts are copied from the
+// complex operand afterwards.
+mp_obj_t carray_binary_add(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
+                            uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
+
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX);
+    mp_float_t *resarray = (mp_float_t *)results->array;
+
+    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        mp_float_t *rarray = (mp_float_t *)rhs->array;
+
+        // from here on, the strides are added to mp_float_t pointers
+        // NOTE(review): presumably the call converts byte strides to units of mp_float_t - confirm
+        ulab_rescale_float_strides(lstrides);
+        ulab_rescale_float_strides(rstrides);
+
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        // real part
+                        *resarray++ = larray[0] + rarray[0];
+                        // imaginary part
+                        *resarray++ = larray[1] + rarray[1];
+                        larray += lstrides[ULAB_MAX_DIMS - 1];
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    // rewind the axis that has just been finished, then step along the next outer axis
+                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    larray += lstrides[ULAB_MAX_DIMS - 2];
+                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    rarray += rstrides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
+                #endif /* ULAB_MAX_DIMS > 1 */
+            #if ULAB_MAX_DIMS > 2
+                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                larray += lstrides[ULAB_MAX_DIMS - 3];
+                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                rarray += rstrides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
+            #endif /* ULAB_MAX_DIMS > 2 */
+        #if ULAB_MAX_DIMS > 3
+            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            larray += lstrides[ULAB_MAX_DIMS - 4];
+            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            rarray += rstrides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
+        #endif /* ULAB_MAX_DIMS > 3 */
+    } else { // only one of the operands is complex
+        uint8_t *larray = (uint8_t *)lhs->array;
+        uint8_t *rarray = (uint8_t *)rhs->array;
+
+        // align the complex array to the left
+        uint8_t rdtype = rhs->dtype;
+        int32_t *lstrides_ = lstrides;
+        int32_t *rstrides_ = rstrides;
+
+        // addition is commutative, so the operands can simply be swapped
+        if(rhs->dtype == NDARRAY_COMPLEX) {
+            larray = (uint8_t *)rhs->array;
+            rarray = (uint8_t *)lhs->array;
+            lstrides_ = rstrides;
+            rstrides_ = lstrides;
+            rdtype = lhs->dtype;
+        }
+
+        // the loop macro is instantiated with the C type of the real operand; a dtype that
+        // is not listed here leaves the real parts of the results untouched
+        if(rdtype == NDARRAY_UINT8) {
+            BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides_, rarray, rstrides_, +);
+        } else if(rdtype == NDARRAY_INT8) {
+            BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides_, rarray, rstrides_, +);
+        } else if(rdtype == NDARRAY_UINT16) {
+            BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides_, rarray, rstrides_, +);
+        } else if(rdtype == NDARRAY_INT16) {
+            BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides_, rarray, rstrides_, +);
+        } else if(rdtype == NDARRAY_FLOAT) {
+            BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides_, rarray, rstrides_, +);
+        }
+
+        // simply copy the imaginary part
+        uint8_t *tarray = (uint8_t *)results->array;
+        tarray += sizeof(mp_float_t); // the imaginary slot of the first result element
+
+        // rarray and rstrides are re-used for the complex operand, whichever side it is on
+        if(lhs->dtype == NDARRAY_COMPLEX) {
+            rarray = (uint8_t *)lhs->array;
+            rstrides = lstrides;
+        } else {
+            rarray = (uint8_t *)rhs->array;
+        }
+        rarray += sizeof(mp_float_t); // the imaginary component of the first source element
+        carray_copy_part(tarray, rarray, results->shape, rstrides);
+    }
+    return MP_OBJ_FROM_PTR(results);
+}
+
+// Selects the C type that belongs to `rdtype`, and instantiates BINARY_LOOP_COMPLEX with
+// that type and the `*` operator. All other arguments are passed on to the macro unchanged.
+// A dtype that is not listed below leaves `results` untouched.
+static void carray_binary_multiply_(ndarray_obj_t *results, mp_float_t *resarray, uint8_t *larray, uint8_t *rarray,
+                            int32_t *lstrides, int32_t *rstrides, uint8_t rdtype) {
+
+    // each case has its own scope, because the macro might declare local variables
+    switch(rdtype) {
+        case NDARRAY_UINT8: {
+            BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides, rarray, rstrides, *);
+            break;
+        }
+        case NDARRAY_INT8: {
+            BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides, rarray, rstrides, *);
+            break;
+        }
+        case NDARRAY_UINT16: {
+            BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides, rarray, rstrides, *);
+            break;
+        }
+        case NDARRAY_INT16: {
+            BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides, rarray, rstrides, *);
+            break;
+        }
+        case NDARRAY_FLOAT: {
+            BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides, *);
+            break;
+        }
+        default:
+            break;
+    }
+}
+
+mp_obj_t carray_binary_multiply(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
+                            uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
+    // Element-wise product for the complex dtype; the result is created with ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX) and returned.
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX);
+    mp_float_t *resarray = (mp_float_t *)results->array;
+    // Complex elements are handled as two consecutive mp_float_t values: index 0 is the real part, index 1 the imaginary part.
+    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        mp_float_t *rarray = (mp_float_t *)rhs->array;
+        // NOTE(review): presumably converts the strides to mp_float_t units, because they are added to mp_float_t pointers below -- confirm
+        ulab_rescale_float_strides(lstrides);
+        ulab_rescale_float_strides(rstrides);
+        // One do-while level per axis; ULAB_MAX_DIMS decides at compile time how many levels exist. Each level runs at least once.
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        // real part: ac - bd, for (a + bi) * (c + di)
+                        *resarray++ = larray[0] * rarray[0] - larray[1] * rarray[1];
+                        // imaginary part: ad + bc
+                        *resarray++ = larray[0] * rarray[1] + larray[1] * rarray[0];
+                        larray += lstrides[ULAB_MAX_DIMS - 1];
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; // rewind the last axis ...
+                    larray += lstrides[ULAB_MAX_DIMS - 2]; // ... then take one step along the next outer axis
+                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    rarray += rstrides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
+                #endif /* ULAB_MAX_DIMS > 1 */
+            #if ULAB_MAX_DIMS > 2
+                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                larray += lstrides[ULAB_MAX_DIMS - 3];
+                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                rarray += rstrides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
+            #endif /* ULAB_MAX_DIMS > 2 */
+        #if ULAB_MAX_DIMS > 3
+            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            larray += lstrides[ULAB_MAX_DIMS - 4];
+            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            rarray += rstrides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
+        #endif /* ULAB_MAX_DIMS > 3 */
+    } else { // only one of the operands is complex
+        // Raw byte pointers are used here, and ulab_rescale_float_strides is not called in this branch.
+        uint8_t *larray = (uint8_t *)lhs->array;
+        uint8_t *rarray = (uint8_t *)rhs->array;
+        uint8_t *lo = larray, *ro = rarray;
+        int32_t *left_strides = lstrides;
+        int32_t *right_strides = rstrides;
+        uint8_t rdtype = rhs->dtype;
+
+        // the helper reads its left operand as mp_float_t, so when rhs is the complex one, swap data, strides and dtype
+        if(rhs->dtype == NDARRAY_COMPLEX) {
+            lo = (uint8_t *)rhs->array;
+            ro = (uint8_t *)lhs->array;
+            rdtype = lhs->dtype;
+            left_strides = rstrides;
+            right_strides = lstrides;
+        }
+
+        larray = lo;
+        rarray = ro;
+        // real part: first component of the complex operand times the other operand
+        carray_binary_multiply_(results, resarray, larray, rarray, left_strides, right_strides, rdtype);
+
+        larray = lo + sizeof(mp_float_t); // start one mp_float_t further in, i.e. at the imaginary component
+        rarray = ro;
+        resarray = (mp_float_t *)results->array;
+        resarray++; // the second slot of each result pair holds the imaginary part
+        // imaginary part: second component of the complex operand times the other operand
+        carray_binary_multiply_(results, resarray, larray, rarray, left_strides, right_strides, rdtype);
+    }
+    return MP_OBJ_FROM_PTR(results);
+}
+
+mp_obj_t carray_binary_subtract(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
+                            uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
+    // Element-wise lhs - rhs for the complex dtype; the result is created with ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX) and returned.
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX);
+    mp_float_t *resarray = (mp_float_t *)results->array;
+    // Complex elements are read and written as (real, imaginary) pairs of mp_float_t.
+    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        mp_float_t *rarray = (mp_float_t *)rhs->array;
+        // NOTE(review): presumably converts the strides to mp_float_t units, because they are added to mp_float_t pointers below -- confirm
+        ulab_rescale_float_strides(lstrides);
+        ulab_rescale_float_strides(rstrides);
+        // One do-while level per axis; ULAB_MAX_DIMS decides at compile time how many levels exist. Each level runs at least once.
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        // real part: a - c, for (a + bi) - (c + di)
+                        *resarray++ = larray[0] - rarray[0];
+                        // imaginary part: b - d
+                        *resarray++ = larray[1] - rarray[1];
+                        larray += lstrides[ULAB_MAX_DIMS - 1];
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; // rewind the last axis ...
+                    larray += lstrides[ULAB_MAX_DIMS - 2]; // ... then take one step along the next outer axis
+                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    rarray += rstrides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
+                #endif /* ULAB_MAX_DIMS > 1 */
+            #if ULAB_MAX_DIMS > 2
+                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                larray += lstrides[ULAB_MAX_DIMS - 3];
+                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                rarray += rstrides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
+            #endif /* ULAB_MAX_DIMS > 2 */
+        #if ULAB_MAX_DIMS > 3
+            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            larray += lstrides[ULAB_MAX_DIMS - 4];
+            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            rarray += rstrides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
+        #endif /* ULAB_MAX_DIMS > 3 */
+    } else { // mixed operands: the two orders are handled separately because a - b differs from b - a
+        uint8_t *larray = (uint8_t *)lhs->array;
+        if(lhs->dtype == NDARRAY_COMPLEX) { // complex - real
+            uint8_t *rarray = (uint8_t *)rhs->array;
+            if(rhs->dtype == NDARRAY_UINT8) {
+                BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides, rarray, rstrides, -);
+            } else if(rhs->dtype == NDARRAY_INT8) {
+                BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides, rarray, rstrides, -);
+            } else if(rhs->dtype == NDARRAY_UINT16) {
+                BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides, rarray, rstrides, -);
+            } else if(rhs->dtype == NDARRAY_INT16) {
+                BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides, rarray, rstrides, -);
+            } else if(rhs->dtype == NDARRAY_FLOAT) {
+                BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides, -);
+            }
+            // copy the imaginary part of lhs into the result (carray_copy_part is defined elsewhere; both pointers start at the first imaginary slot)
+            uint8_t *tarray = (uint8_t *)results->array;
+            tarray += sizeof(mp_float_t);
+
+            larray = (uint8_t *)lhs->array;
+            larray += sizeof(mp_float_t);
+
+            carray_copy_part(tarray, larray, results->shape, lstrides);
+        } else if(rhs->dtype == NDARRAY_COMPLEX) { // real - complex
+            mp_float_t *rarray = (mp_float_t *)rhs->array;
+            ulab_rescale_float_strides(rstrides);
+            // each element becomes (lhs - Re(rhs), -Im(rhs)); see BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT1
+            if(lhs->dtype == NDARRAY_UINT8) {
+                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, uint8_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_INT8) {
+                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, int8_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_UINT16) {
+                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, uint16_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_INT16) {
+                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, int16_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_FLOAT) {
+                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides);
+            }
+        }
+    }
+
+    return MP_OBJ_FROM_PTR(results);
+}
+
+static void carray_binary_left_divide_(ndarray_obj_t *results, mp_float_t *resarray, uint8_t *larray, uint8_t *rarray,
+                            int32_t *lstrides, int32_t *rstrides, uint8_t rdtype) {
+    // Divides one mp_float_t component of the left operand by the right operand, whose element type is chosen by rdtype; other rdtype values do nothing.
+    if(rdtype == NDARRAY_UINT8) {
+        BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides, rarray, rstrides, /);
+    } else if(rdtype == NDARRAY_INT8) {
+        BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides, rarray, rstrides, /);
+    } else if(rdtype == NDARRAY_UINT16) {
+        BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides, rarray, rstrides, /);
+    } else if(rdtype == NDARRAY_INT16) {
+        BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides, rarray, rstrides, /);
+    } else if(rdtype == NDARRAY_FLOAT) {
+        BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides, /);
+    }
+}
+
+mp_obj_t carray_binary_divide(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
+                            uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
+    // Element-wise lhs / rhs for the complex dtype; the result is created with ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX) and returned.
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX);
+    mp_float_t *resarray = (mp_float_t *)results->array;
+    // Complex elements are read and written as (real, imaginary) pairs of mp_float_t.
+    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        mp_float_t *rarray = (mp_float_t *)rhs->array;
+        // NOTE(review): presumably converts the strides to mp_float_t units, because they are added to mp_float_t pointers below -- confirm
+        ulab_rescale_float_strides(lstrides);
+        ulab_rescale_float_strides(rstrides);
+        // One do-while level per axis; ULAB_MAX_DIMS decides at compile time how many levels exist. Each level runs at least once.
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        // (a + bi) / (c + di) =
+                        // (ac + bd) / (c^2 + d^2) + i (bc - ad) / (c^2 + d^2)
+                        // denominator
+                        mp_float_t denom = rarray[0] * rarray[0] + rarray[1] * rarray[1];
+                        // no check is made for denom == 0 before the divisions below
+                        // real part
+                        *resarray++ = (larray[0] * rarray[0] + larray[1] * rarray[1]) / denom;
+                        // imaginary part
+                        *resarray++ = (larray[1] * rarray[0] - larray[0] * rarray[1]) / denom;
+                        larray += lstrides[ULAB_MAX_DIMS - 1];
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; // rewind the last axis ...
+                    larray += lstrides[ULAB_MAX_DIMS - 2]; // ... then take one step along the next outer axis
+                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    rarray += rstrides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
+                #endif /* ULAB_MAX_DIMS > 1 */
+            #if ULAB_MAX_DIMS > 2
+                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                larray += lstrides[ULAB_MAX_DIMS - 3];
+                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                rarray += rstrides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
+            #endif /* ULAB_MAX_DIMS > 2 */
+        #if ULAB_MAX_DIMS > 3
+            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            larray += lstrides[ULAB_MAX_DIMS - 4];
+            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            rarray += rstrides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
+        #endif /* ULAB_MAX_DIMS > 3 */
+    } else { // mixed operands: raw byte pointers, strides are not rescaled in this branch
+        uint8_t *larray = (uint8_t *)lhs->array;
+        uint8_t *rarray = (uint8_t *)rhs->array;
+        if(lhs->dtype == NDARRAY_COMPLEX) { // complex / real: each component is divided by the same right-hand value
+            // real part
+            carray_binary_left_divide_(results, resarray, larray, rarray, lstrides, rstrides, rhs->dtype);
+            // imaginary part
+            resarray = (mp_float_t *)results->array;
+            resarray++;
+            larray = (uint8_t *)lhs->array;
+            larray += sizeof(mp_float_t);
+            rarray = (uint8_t *)rhs->array;
+            carray_binary_left_divide_(results, resarray, larray, rarray, lstrides, rstrides, rhs->dtype);
+        } else { // lhs is not complex; the macro below reads rhs as (real, imaginary) mp_float_t pairs
+            if(lhs->dtype == NDARRAY_UINT8) {
+                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, uint8_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_INT8) {
+                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, int8_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_UINT16) {
+                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, uint16_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_INT16) {
+                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, int16_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_FLOAT) {
+                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides);
+            }
+        }
+    }
+
+    return MP_OBJ_FROM_PTR(results);
+}
+
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/carray/carray.h b/circuitpython/extmod/ulab/code/numpy/carray/carray.h
new file mode 100644
index 0000000..8ca5de2
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/carray/carray.h
@@ -0,0 +1,237 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2021-2022 Zoltán Vörös
+*/
+
+#ifndef _CARRAY_
+#define _CARRAY_
+
+MP_DECLARE_CONST_FUN_OBJ_1(carray_real_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(carray_imag_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(carray_conjugate_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(carray_sort_complex_obj);
+
+
+mp_obj_t carray_imag(mp_obj_t );
+mp_obj_t carray_real(mp_obj_t );
+
+mp_obj_t carray_abs(ndarray_obj_t *, ndarray_obj_t *);
+mp_obj_t carray_binary_add(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
+mp_obj_t carray_binary_multiply(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
+mp_obj_t carray_binary_subtract(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
+mp_obj_t carray_binary_divide(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
+mp_obj_t carray_binary_equal_not_equal(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *, mp_binary_op_t );
+
+#define BINARY_LOOP_COMPLEX1(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
+    size_t l = 0; /* counter for the last axis; the outermost counter lands in the expanding scope, so expand once per scope */\
+    do {\
+        *(resarray) = *((mp_float_t *)(larray)) OPERATOR *((type_right *)(rarray)); /* left is read as mp_float_t, right as type_right */\
+        (resarray) += 2; /* results go to every second mp_float_t slot */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]); /* do-while: the body runs at least once */\
+
+#define BINARY_LOOP_COMPLEX2(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
+    size_t k = 0; /* counter for the second-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX1((results), (resarray), type_right, (larray), (lstrides), (rarray), (rstrides), OPERATOR);\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1]; /* rewind the last axis */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 2]; /* one step along the second-to-last axis */\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
+
+#define BINARY_LOOP_COMPLEX3(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
+    size_t j = 0; /* counter for the third-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX2((results), (resarray), type_right, (larray), (lstrides), (rarray), (rstrides), OPERATOR);\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2]; /* rewind the axis just finished */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
+
+#define BINARY_LOOP_COMPLEX4(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
+    size_t i = 0; /* counter for the fourth-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX3((results), (resarray), type_right, (larray), (lstrides), (rarray), (rstrides), OPERATOR);\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3]; /* rewind the axis just finished */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
+        i++;\
+    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\
+
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT1(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
+    size_t l = 0; /* rarray is indexed directly, so it must already point at the (real, imaginary) component type */\
+    do {\
+        *(resarray)++ = *((type_left *)(larray)) - (rarray)[0]; /* real part: left value minus the first component of right */\
+        *(resarray)++ = -(rarray)[1]; /* imaginary part: negated second component of right */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]); /* do-while: the body runs at least once */\
+
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT2(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
+    size_t k = 0; /* counter for the second-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT1((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1]; /* rewind the last axis */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 2]; /* one step along the second-to-last axis */\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
+
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT3(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
+    size_t j = 0; /* counter for the third-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT2((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2]; /* rewind the axis just finished */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
+
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT4(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
+    size_t i = 0; /* counter for the fourth-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT3((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3]; /* rewind the axis just finished */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
+        i++;\
+    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\
+
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE1(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
+    size_t l = 0; /* counter for the last axis */\
+    do {\
+        mp_float_t *c = (mp_float_t *)(rarray); /* c[0] and c[1] are the two components of the right-hand element */\
+        mp_float_t denom = c[0] * c[0] + c[1] * c[1]; /* not checked for zero */\
+        mp_float_t a = *((type_left *)(larray)) / denom;\
+        *(resarray)++ = a * c[0]; /* x / (c + di) = x * (c - di) / (c^2 + d^2) */\
+        *(resarray)++ = -a * c[1];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]); /* do-while: the body runs at least once */\
+
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE2(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
+    size_t k = 0; /* counter for the second-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX_RIGHT_DIVIDE1((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1]; /* rewind the last axis */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 2]; /* one step along the second-to-last axis */\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
+
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE3(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
+    size_t j = 0; /* counter for the third-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX_RIGHT_DIVIDE2((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2]; /* rewind the axis just finished */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
+
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE4(results, resarray, type_left, larray, lstrides, rarray, rstrides)\
+    size_t i = 0; /* counter for the fourth-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX_RIGHT_DIVIDE3((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3]; /* rewind the axis just finished */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
+        i++;\
+    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\
+
+
+#define BINARY_LOOP_COMPLEX_EQUAL1(results, array, type_right, larray, lstrides, rarray, rstrides)\
+    size_t l = 0; /* larray is dereferenced and indexed directly, so it must already point at the component type */\
+    do {\
+        if((*(larray) == *((type_right *)(rarray))) && ((larray)[1] == MICROPY_FLOAT_CONST(0.0))) { /* first component equals the right value and second component is exactly 0.0 */\
+            *(array) ^= 0x01; /* toggles the low bit of the output element; its starting value is set outside this macro */\
+        }\
+        (array)++;\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]); /* do-while: the body runs at least once */\
+
+#define BINARY_LOOP_COMPLEX_EQUAL2(results, array, type_right, larray, lstrides, rarray, rstrides)\
+    size_t k = 0; /* counter for the second-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX_EQUAL1((results), (array), type_right, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1]; /* rewind the last axis */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 2]; /* one step along the second-to-last axis */\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
+
+#define BINARY_LOOP_COMPLEX_EQUAL3(results, array, type_right, larray, lstrides, rarray, rstrides)\
+    size_t j = 0; /* counter for the third-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX_EQUAL2((results), (array), type_right, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2]; /* rewind the axis just finished */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
+
+#define BINARY_LOOP_COMPLEX_EQUAL4(results, array, type_right, larray, lstrides, rarray, rstrides)\
+    size_t i = 0; /* counter for the fourth-to-last axis */\
+    do {\
+        BINARY_LOOP_COMPLEX_EQUAL3((results), (array), type_right, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3]; /* rewind the axis just finished */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
+        i++;\
+    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\
+
+#if ULAB_MAX_DIMS == 1 /* alias each unsuffixed macro to the loop nest whose depth equals ULAB_MAX_DIMS */
+#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX1
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT1
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE1
+#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL1
+#endif /* ULAB_MAX_DIMS == 1 */
+/* values of ULAB_MAX_DIMS other than 1, 2, 3 or 4 leave the unsuffixed macros undefined */
+#if ULAB_MAX_DIMS == 2
+#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX2
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT2
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE2
+#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL2
+#endif /* ULAB_MAX_DIMS == 2 */
+
+#if ULAB_MAX_DIMS == 3
+#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX3
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT3
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE3
+#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL3
+#endif /* ULAB_MAX_DIMS == 3 */
+
+#if ULAB_MAX_DIMS == 4
+#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX4
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT4
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE4
+#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL4
+#endif /* ULAB_MAX_DIMS == 4 */
+
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.c b/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.c
new file mode 100644
index 0000000..7b623d3
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.c
@@ -0,0 +1,28 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2022 Zoltán Vörös
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+#include "py/misc.h"
+
+#include "../../ulab.h"
+#include "../../ndarray.h"
+
+#if ULAB_SUPPORTS_COMPLEX
+
+void raise_complex_NotImplementedError(void) { // single place for the "complex dtype unsupported" error; only built when ULAB_SUPPORTS_COMPLEX is set
+    mp_raise_NotImplementedError(translate("not implemented for complex dtype")); // message goes through translate() before being raised
+}
+
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.h b/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.h
new file mode 100644
index 0000000..3ac79b5
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/carray/carray_tools.h
@@ -0,0 +1,25 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2022 Zoltán Vörös
+*/
+
+#ifndef _CARRAY_TOOLS_
+#define _CARRAY_TOOLS_
+
+void raise_complex_NotImplementedError(void);
+
+#if ULAB_SUPPORTS_COMPLEX
+    #define NOT_IMPLEMENTED_FOR_COMPLEX() raise_complex_NotImplementedError();
+    #define COMPLEX_DTYPE_NOT_IMPLEMENTED(dtype) if((dtype) == NDARRAY_COMPLEX) raise_complex_NotImplementedError();
+#else
+    #define NOT_IMPLEMENTED_FOR_COMPLEX() // do nothing
+    #define COMPLEX_DTYPE_NOT_IMPLEMENTED(dtype) // do nothing
+#endif
+
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/compare.c b/circuitpython/extmod/ulab/code/numpy/compare.c
new file mode 100644
index 0000000..5a82072
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/compare.c
@@ -0,0 +1,428 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020-2021 Zoltán Vörös
+ *               2020 Jeff Epler for Adafruit Industries
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+#include "py/misc.h"
+
+#include "../ulab.h"
+#include "../ndarray_operators.h"
+#include "../ulab_tools.h"
+#include "carray/carray_tools.h"
+#include "compare.h"
+
+// Element-wise comparison kernel behind equal, not_equal, minimum, maximum and clip.
+// x1 and x2 are passed through ndarray_from_mp_obj and broadcast against each other;
+// op is one of the COMPARE_FUNCTION_TYPE values. COMPARE_EQUAL and COMPARE_NOT_EQUAL are
+// delegated to ndarray_binary_equality, COMPARE_MINIMUM and COMPARE_MAXIMUM go through
+// RUN_COMPARE_LOOP. Raises ValueError, if the operands cannot be broadcast together, and,
+// with complex support compiled in, NotImplementedError for complex operands.
+static mp_obj_t compare_function(mp_obj_t x1, mp_obj_t x2, uint8_t op) {
+    ndarray_obj_t *lhs = ndarray_from_mp_obj(x1, 0);
+    ndarray_obj_t *rhs = ndarray_from_mp_obj(x2, 0);
+    #if ULAB_SUPPORTS_COMPLEX
+    if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX)) {
+        NOT_IMPLEMENTED_FOR_COMPLEX()
+    }
+    #endif
+    uint8_t ndim = 0;
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    int32_t *lstrides = m_new(int32_t, ULAB_MAX_DIMS);
+    int32_t *rstrides = m_new(int32_t, ULAB_MAX_DIMS);
+    if(!ndarray_can_broadcast(lhs, rhs, &ndim, shape, lstrides, rstrides)) {
+        // the raise does not return, so the scratch buffers have to be released before it
+        m_del(size_t, shape, ULAB_MAX_DIMS);
+        m_del(int32_t, lstrides, ULAB_MAX_DIMS);
+        m_del(int32_t, rstrides, ULAB_MAX_DIMS);
+        mp_raise_ValueError(translate("operands could not be broadcast together"));
+    }
+    uint8_t *larray = (uint8_t *)lhs->array;
+    uint8_t *rarray = (uint8_t *)rhs->array;
+
+    if(op == COMPARE_EQUAL) {
+        return ndarray_binary_equality(lhs, rhs, ndim, shape, lstrides, rstrides, MP_BINARY_OP_EQUAL);
+    } else if(op == COMPARE_NOT_EQUAL) {
+        return ndarray_binary_equality(lhs, rhs, ndim, shape, lstrides, rstrides, MP_BINARY_OP_NOT_EQUAL);
+    }
+    // Upcasting rules: float always becomes float; identical types preserve the type;
+    // uint8 + int8 => int16, uint8 + int16 => int16, uint8 + uint16 => uint16,
+    // int8 + int16 => int16, int8 + uint16 => uint16 (in either order), uint16 + int16 => float
+    // RUN_COMPARE_LOOP takes: typecode of result, type_out, type_left, type_right,
+    // lhs data, lhs strides, rhs data, rhs strides, ndim, shape, operation
+    if(lhs->dtype == NDARRAY_UINT8) {
+        if(rhs->dtype == NDARRAY_UINT8) {
+            RUN_COMPARE_LOOP(NDARRAY_UINT8, uint8_t, uint8_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_INT8) {
+            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, uint8_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_UINT16) {
+            RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint8_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_INT16) {
+            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, uint8_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_FLOAT) {
+            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, uint8_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        }
+    } else if(lhs->dtype == NDARRAY_INT8) {
+        if(rhs->dtype == NDARRAY_UINT8) {
+            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int8_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_INT8) {
+            RUN_COMPARE_LOOP(NDARRAY_INT8, int8_t, int8_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_UINT16) {
+            RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, int8_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_INT16) {
+            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int8_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_FLOAT) {
+            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, int8_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        }
+    } else if(lhs->dtype == NDARRAY_UINT16) {
+        if(rhs->dtype == NDARRAY_UINT8) {
+            RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint16_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_INT8) {
+            RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint16_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_UINT16) {
+            RUN_COMPARE_LOOP(NDARRAY_UINT16, uint16_t, uint16_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_INT16) {
+            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, uint16_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_FLOAT) {
+            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, uint16_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        }
+    } else if(lhs->dtype == NDARRAY_INT16) {
+        if(rhs->dtype == NDARRAY_UINT8) {
+            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int16_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_INT8) {
+            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int16_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_UINT16) {
+            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, int16_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_INT16) {
+            RUN_COMPARE_LOOP(NDARRAY_INT16, int16_t, int16_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_FLOAT) {
+            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, int16_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        }
+    } else if(lhs->dtype == NDARRAY_FLOAT) {
+        if(rhs->dtype == NDARRAY_UINT8) {
+            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, uint8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_INT8) {
+            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, int8_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_UINT16) {
+            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, uint16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_INT16) {
+            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, int16_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        } else if(rhs->dtype == NDARRAY_FLOAT) {
+            RUN_COMPARE_LOOP(NDARRAY_FLOAT, mp_float_t, mp_float_t, mp_float_t, larray, lstrides, rarray, rstrides, ndim, shape, op);
+        }
+    }
+    return mp_const_none; // reached only if no dtype pair or operation matched above
+}
+
+static mp_obj_t compare_equal_helper(mp_obj_t x1, mp_obj_t x2, uint8_t comptype) {
+    // run the generic comparison; unless both operands are scalars, its result is returned as is
+    mp_obj_t outcome = compare_function(x1, x2, comptype);
+    if(!(mp_obj_is_int(x1) || mp_obj_is_float(x1)) || !(mp_obj_is_int(x2) || mp_obj_is_float(x2))) {
+        return outcome;
+    }
+    // two scalars: hand back the first item of the result as a single mp_obj_t
+    mp_obj_iter_buf_t iter_buf;
+    mp_obj_t iterator = mp_getiter(outcome, &iter_buf);
+    return mp_iternext(iterator);
+}
+
+#if ULAB_NUMPY_HAS_CLIP
+
+mp_obj_t compare_clip(mp_obj_t x1, mp_obj_t x2, mp_obj_t x3) {
+    // clip(x1, x2, x3): limit x1 from below by x2, and from above by x3.
+    // Arrays are clipped by two passes through compare_function: a single-loop version in
+    // RUN_COMPARE_LOOP would cost around 2 kB of compile size without doubling the speed,
+    // whereas calling the function twice adds only 140 bytes to the firmware
+    if(!(mp_obj_is_int(x1) || mp_obj_is_float(x1))) { // assume ndarrays
+        mp_obj_t upper_clipped = compare_function(x1, x3, COMPARE_MINIMUM);
+        return compare_function(x2, upper_clipped, COMPARE_MAXIMUM);
+    }
+    mp_float_t value = mp_obj_get_float(x1);
+    mp_float_t lower = mp_obj_get_float(x2);
+    mp_float_t upper = mp_obj_get_float(x3);
+    if(value < lower) {
+        return x2;
+    }
+    if(value > upper) {
+        return x3;
+    }
+    return x1;
+}
+
+MP_DEFINE_CONST_FUN_OBJ_3(compare_clip_obj, compare_clip);
+#endif
+
+#if ULAB_NUMPY_HAS_EQUAL
+// equal(x1, x2): COMPARE_EQUAL through compare_equal_helper, which returns a single object for two scalars
+mp_obj_t compare_equal(mp_obj_t x1, mp_obj_t x2) {
+    return compare_equal_helper(x1, x2, COMPARE_EQUAL);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(compare_equal_obj, compare_equal);
+#endif
+
+#if ULAB_NUMPY_HAS_NOTEQUAL
+// not_equal(x1, x2): COMPARE_NOT_EQUAL through compare_equal_helper, which returns a single object for two scalars
+mp_obj_t compare_not_equal(mp_obj_t x1, mp_obj_t x2) {
+    return compare_equal_helper(x1, x2, COMPARE_NOT_EQUAL);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(compare_not_equal_obj, compare_not_equal);
+#endif
+
+#if ULAB_NUMPY_HAS_ISFINITE | ULAB_NUMPY_HAS_ISINF
+// Shared implementation of isinf (mask = 1) and isfinite (mask = 0).
+// An integer scalar is never infinite; a float scalar is tested with isinf(), and a NaN
+// is reported as False by both functions. For an ndarray, a new dense NDARRAY_BOOL array
+// built from the shape of the input is returned. Raises TypeError for any other input
+// type and, with complex support compiled in, NotImplementedError for complex arrays.
+// Returns mp_const_true/mp_const_false for scalars.
+static mp_obj_t compare_isinf_isfinite(mp_obj_t _x, uint8_t mask) {
+    if(mp_obj_is_int(_x)) {
+        return mask ? mp_const_false : mp_const_true;
+    } else if(mp_obj_is_float(_x)) {
+        mp_float_t x = mp_obj_get_float(_x);
+        if(isnan(x)) {
+            return mp_const_false;
+        }
+        if(mask) { // called from isinf
+            return isinf(x) ? mp_const_true : mp_const_false;
+        } else { // called from isfinite
+            return isinf(x) ? mp_const_false : mp_const_true;
+        }
+    } else if(mp_obj_is_type(_x, &ulab_ndarray_type)) {
+        ndarray_obj_t *x = MP_OBJ_TO_PTR(_x);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype)
+        ndarray_obj_t *results = ndarray_new_dense_ndarray(x->ndim, x->shape, NDARRAY_BOOL);
+        // At this point, results is all False
+        uint8_t *rarray = (uint8_t *)results->array;
+        if(x->dtype != NDARRAY_FLOAT) {
+            // int types can never be infinite...
+            if(!mask) {
+                // ...so flip all values in the array, if the function was called from isfinite
+                memset(rarray, 1, results->len);
+            }
+            return MP_OBJ_FROM_PTR(results);
+        }
+        uint8_t *xarray = (uint8_t *)x->array;
+
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        mp_float_t value = *(mp_float_t *)xarray;
+                        if(isnan(value)) {
+                            *rarray++ = 0;
+                        } else {
+                            *rarray++ = isinf(value) ? mask : 1 - mask;
+                        }
+                        xarray += x->strides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < x->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    xarray -= x->strides[ULAB_MAX_DIMS - 1] * x->shape[ULAB_MAX_DIMS-1];
+                    xarray += x->strides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < x->shape[ULAB_MAX_DIMS - 2]);
+                #endif
+            #if ULAB_MAX_DIMS > 2
+                xarray -= x->strides[ULAB_MAX_DIMS - 2] * x->shape[ULAB_MAX_DIMS-2];
+                xarray += x->strides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < x->shape[ULAB_MAX_DIMS - 3]);
+            #endif
+        #if ULAB_MAX_DIMS > 3
+            xarray -= x->strides[ULAB_MAX_DIMS - 3] * x->shape[ULAB_MAX_DIMS-3];
+            xarray += x->strides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < x->shape[ULAB_MAX_DIMS - 4]);
+        #endif
+
+        return MP_OBJ_FROM_PTR(results);
+    } else {
+        mp_raise_TypeError(translate("wrong input type"));
+    }
+    return mp_const_none;
+}
+#endif
+
+#if ULAB_NUMPY_HAS_ISFINITE
+mp_obj_t compare_isfinite(mp_obj_t _x) {
+    return compare_isinf_isfinite(_x, 0); // mask = 0 selects the isfinite behaviour
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(compare_isfinite_obj, compare_isfinite);
+#endif
+
+#if ULAB_NUMPY_HAS_ISINF
+mp_obj_t compare_isinf(mp_obj_t _x) {
+    return compare_isinf_isfinite(_x, 1); // mask = 1 selects the isinf behaviour
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(compare_isinf_obj, compare_isinf);
+#endif
+
+#if ULAB_NUMPY_HAS_MAXIMUM
+mp_obj_t compare_maximum(mp_obj_t x1, mp_obj_t x2) {
+    // two scalars are unwrapped at the end, so that maximum(3, 4) yields a single value
+    mp_obj_t outcome = compare_function(x1, x2, COMPARE_MAXIMUM);
+    if(!(mp_obj_is_int(x1) || mp_obj_is_float(x1)) || !(mp_obj_is_int(x2) || mp_obj_is_float(x2))) {
+        return outcome;
+    }
+    ndarray_obj_t *single = MP_OBJ_TO_PTR(outcome);
+    return mp_binary_get_val_array(single->dtype, single->array, 0);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(compare_maximum_obj, compare_maximum);
+#endif
+
+#if ULAB_NUMPY_HAS_MINIMUM
+
+mp_obj_t compare_minimum(mp_obj_t x1, mp_obj_t x2) {
+    // two scalars are unwrapped at the end, so that minimum(3, 4) yields a single value
+    mp_obj_t outcome = compare_function(x1, x2, COMPARE_MINIMUM);
+    if(!(mp_obj_is_int(x1) || mp_obj_is_float(x1)) || !(mp_obj_is_int(x2) || mp_obj_is_float(x2))) {
+        return outcome;
+    }
+    ndarray_obj_t *single = MP_OBJ_TO_PTR(outcome);
+    return mp_binary_get_val_array(single->dtype, single->array, 0);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(compare_minimum_obj, compare_minimum);
+#endif
+
+#if ULAB_NUMPY_HAS_WHERE
+
+// where(condition, x, y): element-wise selection between x and y; ndarrays and scalars only.
+// The three arguments are passed through ndarray_from_mp_obj and broadcast to a common
+// shape. The result is a new dense ndarray of dtype ndarray_upcast_dtype(x->dtype, y->dtype),
+// holding the element of x, where condition is non-zero, and the element of y otherwise.
+// Raises ValueError, if the operands cannot be broadcast together (and NotImplementedError for complex input, if supported).
+mp_obj_t compare_where(mp_obj_t _condition, mp_obj_t _x, mp_obj_t _y) {
+    ndarray_obj_t *c = ndarray_from_mp_obj(_condition, 0);
+    ndarray_obj_t *x = ndarray_from_mp_obj(_x, 0);
+    ndarray_obj_t *y = ndarray_from_mp_obj(_y, 0);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(c->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(y->dtype)
+    int32_t *cstrides = m_new(int32_t, ULAB_MAX_DIMS);
+    int32_t *xstrides = m_new(int32_t, ULAB_MAX_DIMS);
+    int32_t *ystrides = m_new(int32_t, ULAB_MAX_DIMS);
+    size_t *oshape = m_new(size_t, ULAB_MAX_DIMS);
+    uint8_t ndim;
+
+    // establish the broadcasting conditions first: the three arrays can be broadcast
+    // together exactly when every pair of them can; each call gets the stride buffers
+    // belonging to its own two operands
+    if(!ndarray_can_broadcast(c, x, &ndim, oshape, cstrides, xstrides) ||
+        !ndarray_can_broadcast(c, y, &ndim, oshape, cstrides, ystrides) ||
+        !ndarray_can_broadcast(x, y, &ndim, oshape, xstrides, ystrides)) {
+        mp_raise_ValueError(translate("operands could not be broadcast together"));
+    }
+
+    ndim = MAX(MAX(c->ndim, x->ndim), y->ndim);
+    // an axis of length < 2 gets a zero stride, so that its single element is repeated along the output
+    for(uint8_t i = 1; i <= ndim; i++) {
+        cstrides[ULAB_MAX_DIMS - i] = c->shape[ULAB_MAX_DIMS - i] < 2 ? 0 : c->strides[ULAB_MAX_DIMS - i];
+        xstrides[ULAB_MAX_DIMS - i] = x->shape[ULAB_MAX_DIMS - i] < 2 ? 0 : x->strides[ULAB_MAX_DIMS - i];
+        ystrides[ULAB_MAX_DIMS - i] = y->shape[ULAB_MAX_DIMS - i] < 2 ? 0 : y->strides[ULAB_MAX_DIMS - i];
+        oshape[ULAB_MAX_DIMS - i] = MAX(MAX(c->shape[ULAB_MAX_DIMS - i], x->shape[ULAB_MAX_DIMS - i]), y->shape[ULAB_MAX_DIMS - i]);
+    }
+
+    uint8_t out_dtype = ndarray_upcast_dtype(x->dtype, y->dtype);
+    ndarray_obj_t *out = ndarray_new_dense_ndarray(ndim, oshape, out_dtype);
+
+    mp_float_t (*cfunc)(void *) = ndarray_get_float_function(c->dtype);
+    mp_float_t (*xfunc)(void *) = ndarray_get_float_function(x->dtype);
+    mp_float_t (*yfunc)(void *) = ndarray_get_float_function(y->dtype);
+    mp_float_t (*ofunc)(void *, mp_float_t ) = ndarray_set_float_function(out->dtype);
+
+    uint8_t *oarray = (uint8_t *)out->array;
+    uint8_t *carray = (uint8_t *)c->array;
+    uint8_t *xarray = (uint8_t *)x->array;
+    uint8_t *yarray = (uint8_t *)y->array;
+
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                do {
+                    mp_float_t value;
+                    mp_float_t cvalue = cfunc(carray);
+                    if(cvalue != MICROPY_FLOAT_CONST(0.0)) {
+                        value = xfunc(xarray);
+                    } else {
+                        value = yfunc(yarray);
+                    }
+                    ofunc(oarray, value);
+                    oarray += out->itemsize;
+                    carray += cstrides[ULAB_MAX_DIMS - 1];
+                    xarray += xstrides[ULAB_MAX_DIMS - 1];
+                    yarray += ystrides[ULAB_MAX_DIMS - 1];
+                    l++;
+                } while(l < out->shape[ULAB_MAX_DIMS - 1]);
+            #if ULAB_MAX_DIMS > 1
+                carray -= cstrides[ULAB_MAX_DIMS - 1] * c->shape[ULAB_MAX_DIMS-1];
+                carray += cstrides[ULAB_MAX_DIMS - 2];
+                xarray -= xstrides[ULAB_MAX_DIMS - 1] * x->shape[ULAB_MAX_DIMS-1];
+                xarray += xstrides[ULAB_MAX_DIMS - 2];
+                yarray -= ystrides[ULAB_MAX_DIMS - 1] * y->shape[ULAB_MAX_DIMS-1];
+                yarray += ystrides[ULAB_MAX_DIMS - 2];
+                k++;
+            } while(k < out->shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 2
+            carray -= cstrides[ULAB_MAX_DIMS - 2] * c->shape[ULAB_MAX_DIMS-2];
+            carray += cstrides[ULAB_MAX_DIMS - 3];
+            xarray -= xstrides[ULAB_MAX_DIMS - 2] * x->shape[ULAB_MAX_DIMS-2];
+            xarray += xstrides[ULAB_MAX_DIMS - 3];
+            yarray -= ystrides[ULAB_MAX_DIMS - 2] * y->shape[ULAB_MAX_DIMS-2];
+            yarray += ystrides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j < out->shape[ULAB_MAX_DIMS - 3]);
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        carray -= cstrides[ULAB_MAX_DIMS - 3] * c->shape[ULAB_MAX_DIMS-3];
+        carray += cstrides[ULAB_MAX_DIMS - 4];
+        xarray -= xstrides[ULAB_MAX_DIMS - 3] * x->shape[ULAB_MAX_DIMS-3];
+        xarray += xstrides[ULAB_MAX_DIMS - 4];
+        yarray -= ystrides[ULAB_MAX_DIMS - 3] * y->shape[ULAB_MAX_DIMS-3];
+        yarray += ystrides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i < out->shape[ULAB_MAX_DIMS - 4]);
+    #endif
+    return MP_OBJ_FROM_PTR(out);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_3(compare_where_obj, compare_where);
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/compare.h b/circuitpython/extmod/ulab/code/numpy/compare.h
new file mode 100644
index 0000000..90ceaf7
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/compare.h
@@ -0,0 +1,150 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020-2021 Zoltán Vörös
+*/
+
+#ifndef _COMPARE_
+#define _COMPARE_
+
+#include "../ulab.h"
+#include "../ndarray.h"
+
+enum COMPARE_FUNCTION_TYPE { // operation selector, tested by RUN_COMPARE_LOOP and in compare.c
+    COMPARE_EQUAL,
+    COMPARE_NOT_EQUAL,
+    COMPARE_MINIMUM,
+    COMPARE_MAXIMUM,
+    COMPARE_CLIP, // NOTE(review): not referenced in compare.c (clip uses MINIMUM + MAXIMUM) - confirm other users before removing
+};
+
+MP_DECLARE_CONST_FUN_OBJ_3(compare_clip_obj);
+MP_DECLARE_CONST_FUN_OBJ_2(compare_equal_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(compare_isfinite_obj); // single argument, matching MP_DEFINE_CONST_FUN_OBJ_1 in compare.c
+MP_DECLARE_CONST_FUN_OBJ_1(compare_isinf_obj); // single argument, matching MP_DEFINE_CONST_FUN_OBJ_1 in compare.c
+MP_DECLARE_CONST_FUN_OBJ_2(compare_minimum_obj);
+MP_DECLARE_CONST_FUN_OBJ_2(compare_maximum_obj);
+MP_DECLARE_CONST_FUN_OBJ_2(compare_not_equal_obj);
+MP_DECLARE_CONST_FUN_OBJ_3(compare_where_obj);
+
+#if ULAB_MAX_DIMS == 1
+#define COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
+    size_t l = 0; /* position along the last axis */\
+    do { /* store the left element, if (left OPERATOR right) holds, otherwise the right one, cast to type_out */\
+        *((type_out *)(array)) = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? (type_out)(*((type_left *)(larray))) : (type_out)(*((type_right *)(rarray)));\
+        (array) += (results)->strides[ULAB_MAX_DIMS - 1];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
+    return MP_OBJ_FROM_PTR(results); /* returns from the function that expanded the macro */\
+
+#endif // ULAB_MAX_DIMS == 1
+
+#if ULAB_MAX_DIMS == 2
+#define COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
+    size_t k = 0; /* position along the second to last axis */\
+    do {\
+        size_t l = 0; /* position along the last axis */\
+        do { /* store the left element, if (left OPERATOR right) holds, otherwise the right one, cast to type_out */\
+            *((type_out *)(array)) = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? (type_out)(*((type_left *)(larray))) : (type_out)(*((type_right *)(rarray)));\
+            (array) += (results)->strides[ULAB_MAX_DIMS - 1]; /* the output pointer is only ever advanced, never rewound */\
+            (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+            (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+            l++;\
+        } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1]; /* rewind the last axis... */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 2]; /* ...and take one step along the next one */\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
+    return MP_OBJ_FROM_PTR(results); /* returns from the function that expanded the macro */\
+
+#endif // ULAB_MAX_DIMS == 2
+
+#if ULAB_MAX_DIMS == 3
+#define COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
+    size_t j = 0; /* position along the third to last axis */\
+    do {\
+        size_t k = 0; /* position along the second to last axis */\
+        do {\
+            size_t l = 0; /* position along the last axis */\
+            do { /* store the left element, if (left OPERATOR right) holds, otherwise the right one, cast to type_out */\
+                *((type_out *)(array)) = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? (type_out)(*((type_left *)(larray))) : (type_out)(*((type_right *)(rarray)));\
+                (array) += (results)->strides[ULAB_MAX_DIMS - 1]; /* the output pointer is only ever advanced, never rewound */\
+                (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+                (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+                l++;\
+            } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
+            (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1]; /* rewind the last axis... */\
+            (larray) += (lstrides)[ULAB_MAX_DIMS - 2]; /* ...and take one step along the next one */\
+            (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
+            (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
+            k++;\
+        } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
+    return MP_OBJ_FROM_PTR(results); /* returns from the function that expanded the macro */\
+
+#endif // ULAB_MAX_DIMS == 3
+
+#if ULAB_MAX_DIMS == 4
+#define COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, OPERATOR)\
+    size_t i = 0; /* position along the fourth to last axis */\
+    do {\
+        size_t j = 0; /* position along the third to last axis */\
+        do {\
+            size_t k = 0; /* position along the second to last axis */\
+            do {\
+                size_t l = 0; /* position along the last axis */\
+                do { /* store the left element, if (left OPERATOR right) holds, otherwise the right one, cast to type_out */\
+                    *((type_out *)(array)) = *((type_left *)(larray)) OPERATOR *((type_right *)(rarray)) ? (type_out)(*((type_left *)(larray))) : (type_out)(*((type_right *)(rarray)));\
+                    (array) += (results)->strides[ULAB_MAX_DIMS - 1]; /* the output pointer is only ever advanced, never rewound */\
+                    (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+                    (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+                    l++;\
+                } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
+                (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1]; /* rewind the last axis... */\
+                (larray) += (lstrides)[ULAB_MAX_DIMS - 2]; /* ...and take one step along the next one */\
+                (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
+                (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
+                k++;\
+            } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
+            (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\
+            (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
+            (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2];\
+            (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
+            j++;\
+        } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
+        i++;\
+    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\
+    return MP_OBJ_FROM_PTR(results); /* returns from the function that expanded the macro */\
+
+#endif // ULAB_MAX_DIMS == 4
+
+#define RUN_COMPARE_LOOP(dtype, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, ndim, shape, op) do {\
+    ndarray_obj_t *results = ndarray_new_dense_ndarray((ndim), (shape), (dtype)); /* output array; allocated for every op value */\
+    uint8_t *array = (uint8_t *)results->array; /* write cursor handed to COMPARE_LOOP */\
+    if((op) == COMPARE_MINIMUM) { /* keeps the left element, where left < right */\
+        COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, <);\
+    }\
+    if((op) == COMPARE_MAXIMUM) { /* keeps the left element, where left > right */\
+        COMPARE_LOOP(results, array, type_out, type_left, type_right, larray, lstrides, rarray, rstrides, >);\
+    }\
+} while(0) /* COMPARE_LOOP ends in a return statement; for any other op, control falls through to the caller */
+
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/create.c b/circuitpython/extmod/ulab/code/numpy/create.c
new file mode 100644
index 0000000..5777070
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/create.c
@@ -0,0 +1,783 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020 Jeff Epler for Adafruit Industries
+ *               2019-2021 Zoltán Vörös
+ *               2020 Taku Fukada
+*/
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+
+#include "../ulab.h"
+#include "create.h"
+#include "../ulab_tools.h"
+
+#if ULAB_NUMPY_HAS_ONES | ULAB_NUMPY_HAS_ZEROS | ULAB_NUMPY_HAS_FULL | ULAB_NUMPY_HAS_EMPTY
+// Shared implementation behind zeros(), ones(), full() and empty().
+//
+// oshape: a non-negative integer (1-D array), or a tuple/list of at most
+//         ULAB_MAX_DIMS non-negative integers
+// dtype:  ulab type code of the new array
+// value:  the fill value, or mp_const_none, if no fill pass is required
+//
+// Raises TypeError, if oshape is not an integer, tuple, or list, or if it holds
+// more than ULAB_MAX_DIMS entries, and ValueError, if any dimension is negative.
+// Returns the newly created ndarray.
+static mp_obj_t create_zeros_ones_full(mp_obj_t oshape, uint8_t dtype, mp_obj_t value) {
+    ndarray_obj_t *ndarray = NULL;
+    if(mp_obj_is_int(oshape)) {
+        // a negative length would otherwise be cast to a huge size_t
+        mp_int_t n = mp_obj_get_int(oshape);
+        if(n < 0) {
+            mp_raise_ValueError(translate("negative dimensions are not allowed"));
+        }
+        ndarray = ndarray_new_linear_array((size_t)n, dtype);
+    } else if(mp_obj_is_type(oshape, &mp_type_tuple) || mp_obj_is_type(oshape, &mp_type_list)) {
+        // check the length at full width: narrowing to uint8_t first would let a
+        // sequence of e.g. 256 items wrap around to 0, and overrun shape[] below
+        mp_int_t dims = mp_obj_get_int(mp_obj_len_maybe(oshape));
+        if(dims > ULAB_MAX_DIMS) {
+            mp_raise_TypeError(translate("too many dimensions"));
+        }
+        uint8_t len = (uint8_t)dims;
+        size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+        memset(shape, 0, ULAB_MAX_DIMS * sizeof(size_t));
+        size_t i = 0;
+        mp_obj_iter_buf_t iter_buf;
+        mp_obj_t item, iterable = mp_getiter(oshape, &iter_buf);
+        while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION){
+            mp_int_t dim = mp_obj_get_int(item);
+            if(dim < 0) {
+                mp_raise_ValueError(translate("negative dimensions are not allowed"));
+            }
+            // the shape is right-aligned: unused leading axes stay 0
+            shape[ULAB_MAX_DIMS - len + i] = (size_t)dim;
+            i++;
+        }
+        ndarray = ndarray_new_dense_ndarray(len, shape, dtype);
+    } else {
+        mp_raise_TypeError(translate("input argument must be an integer, a tuple, or a list"));
+    }
+    if(value != mp_const_none) {
+        if(dtype == NDARRAY_BOOL) {
+            // Booleans are written through the uint8 setter as 0 or 1
+            dtype = NDARRAY_UINT8;
+            if(mp_obj_is_true(value)) {
+                value = mp_obj_new_int(1);
+            } else {
+                value = mp_obj_new_int(0);
+            }
+        }
+        for(size_t i=0; i < ndarray->len; i++) {
+            #if ULAB_SUPPORTS_COMPLEX
+            if(dtype == NDARRAY_COMPLEX) {
+                ndarray_set_complex_value(ndarray->array, i, value);
+            } else {
+                ndarray_set_value(dtype, ndarray->array, i, value);
+            }
+            #else
+            ndarray_set_value(dtype, ndarray->array, i, value);
+            #endif
+        }
+    }
+    // if zeros calls the function, we don't have to do anything
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+#endif
+
+#if ULAB_NUMPY_HAS_ARANGE | ULAB_NUMPY_HAS_LINSPACE
+static ndarray_obj_t *create_linspace_arange(mp_float_t start, mp_float_t step, mp_float_t stop, size_t len, uint8_t dtype) { // allocates and fills a 1-D array of len items; called by create_arange and create_linspace
+    mp_float_t value = start; // running value; advanced by step per element in the Boolean branch
+
+    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, dtype);
+    if(ndarray->boolean == NDARRAY_BOOLEAN) { // Boolean output: only record, whether the running value is non-zero
+        uint8_t *array = (uint8_t *)ndarray->array;
+        for(size_t i=0; i < len; i++, value += step) {
+            *array++ = value == MICROPY_FLOAT_CONST(0.0) ? 0 : 1; // exact comparison against 0.0
+        }
+    } else if(dtype == NDARRAY_UINT8) { // NOTE(review): ARANGE_LOOP comes from create.h; its body is not visible here, it presumably reads the local `value` as well -- confirm
+        ARANGE_LOOP(uint8_t, ndarray, len, step, stop);
+    } else if(dtype == NDARRAY_INT8) {
+        ARANGE_LOOP(int8_t, ndarray, len, step, stop);
+    } else if(dtype == NDARRAY_UINT16) {
+        ARANGE_LOOP(uint16_t, ndarray, len, step, stop);
+    } else if(dtype == NDARRAY_INT16) {
+        ARANGE_LOOP(int16_t, ndarray, len, step, stop);
+    } else { // every remaining type code is filled as mp_float_t
+        ARANGE_LOOP(mp_float_t, ndarray, len, step, stop);
+    }
+    return ndarray;
+}
+#endif
+
+#if ULAB_NUMPY_HAS_ARANGE
+//| @overload
+//| def arange(stop: _float, step: _float = 1, *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray: ...
+//| @overload
+//| def arange(start: _float, stop: _float, step: _float = 1, *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: start
+//|       First value in the array, optional, defaults to 0
+//|     .. param: stop
+//|       Final value in the array
+//|     .. param: step
+//|       Difference between consecutive elements, optional, defaults to 1.0
+//|     .. param: dtype
+//|       Type of values in the array
+//|
+//|     Return a new 1-D array with elements ranging from ``start`` to ``stop``, with step size ``step``."""
+//|     ...
+//|
+
+// Implements arange(): accepts (stop), (start, stop), or (start, stop, step) as
+// positional arguments, plus the keyword-only dtype.
+//
+// Without an explicit dtype, the result is int16, if all positional arguments
+// are integers, and none of them exceeds 32768 in magnitude; float otherwise.
+// Raises TypeError for an unsupported number of positional arguments, and
+// ValueError, if step is zero, or if the length of the range is not finite.
+// A range without elements results in an empty array.
+mp_obj_t create_arange(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    uint8_t dtype = NDARRAY_FLOAT;
+    mp_float_t start, stop, step;
+    if(n_args == 1) {
+        start = MICROPY_FLOAT_CONST(0.0);
+        stop = mp_obj_get_float(args[0].u_obj);
+        step = MICROPY_FLOAT_CONST(1.0);
+        if(mp_obj_is_int(args[0].u_obj)) dtype = NDARRAY_INT16;
+    } else if(n_args == 2) {
+        start = mp_obj_get_float(args[0].u_obj);
+        stop = mp_obj_get_float(args[1].u_obj);
+        step = MICROPY_FLOAT_CONST(1.0);
+        if(mp_obj_is_int(args[0].u_obj) && mp_obj_is_int(args[1].u_obj)) dtype = NDARRAY_INT16;
+    } else if(n_args == 3) {
+        start = mp_obj_get_float(args[0].u_obj);
+        stop = mp_obj_get_float(args[1].u_obj);
+        step = mp_obj_get_float(args[2].u_obj);
+        if(mp_obj_is_int(args[0].u_obj) && mp_obj_is_int(args[1].u_obj) && mp_obj_is_int(args[2].u_obj)) dtype = NDARRAY_INT16;
+    } else {
+        mp_raise_TypeError(translate("wrong number of arguments"));
+    }
+    // integer inputs that do not fit into int16 fall back to float
+    if((MICROPY_FLOAT_C_FUN(fabs)(stop) > 32768) || (MICROPY_FLOAT_C_FUN(fabs)(start) > 32768) || (MICROPY_FLOAT_C_FUN(fabs)(step) > 32768)) {
+        dtype = NDARRAY_FLOAT;
+    }
+    // an explicit dtype always wins over the inferred one
+    if(args[3].u_obj != mp_const_none) {
+        dtype = (uint8_t)mp_obj_get_int(args[3].u_obj);
+    }
+    // a zero step would make the division below produce inf or nan
+    if(step == MICROPY_FLOAT_CONST(0.0)) {
+        mp_raise_ValueError(translate("step must be non-zero"));
+    }
+    // number of steps that fit between start and stop; converting a nan or an
+    // infinite value to size_t is undefined, so such ranges are rejected here
+    mp_float_t span = (stop - start) / step;
+    if(isnan(span) || isinf(span)) {
+        mp_raise_ValueError(translate("arange: cannot compute length"));
+    }
+    ndarray_obj_t *ndarray;
+    if(span <= 0) {
+        // start == stop belongs here, too: otherwise len - 1 would underflow below
+        ndarray = ndarray_new_linear_array(0, dtype);
+    } else {
+        size_t len = (size_t)(MICROPY_FLOAT_C_FUN(ceil)(span));
+        // the last value that is actually stored
+        stop = start + (len - 1) * step;
+        ndarray = create_linspace_arange(start, step, stop, len, dtype);
+    }
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_arange_obj, 1, create_arange);
+#endif
+
+#if ULAB_NUMPY_HAS_CONCATENATE
+//| def concatenate(arrays: Tuple[ulab.numpy.ndarray], *, axis: int = 0) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: arrays
+//|       tuple of ndarrays
+//|     .. param: axis
+//|       axis along which the arrays will be joined
+//|
+//|     Join a sequence of arrays along an existing axis."""
+//|     ...
+//|
+
+// Implements concatenate(arrays, *, axis=0): joins the ndarrays of a tuple along
+// an existing axis.
+//
+// All arrays must share the dtype and the number of dimensions, and their
+// shapes may differ along the concatenation axis only.
+// Raises TypeError, if the first argument is not a non-empty tuple of ndarrays,
+// and ValueError, if the axis is out of range, or the arrays are incompatible.
+// Returns a new dense ndarray.
+mp_obj_t create_concatenate(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = 0 } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    if(!mp_obj_is_type(args[0].u_obj, &mp_type_tuple)) {
+        mp_raise_TypeError(translate("first argument must be a tuple of ndarrays"));
+    }
+    mp_obj_tuple_t *ndarrays = MP_OBJ_TO_PTR(args[0].u_obj);
+    // items[0] is dereferenced below, so an empty tuple has to be rejected here
+    if(ndarrays->len == 0) {
+        mp_raise_TypeError(translate("first argument must be a tuple of ndarrays"));
+    }
+    // every item is treated as an ndarray_obj_t further down; verify the type first
+    for(size_t i=0; i < ndarrays->len; i++) {
+        if(!mp_obj_is_type(ndarrays->items[i], &ulab_ndarray_type)) {
+            mp_raise_TypeError(translate("first argument must be a tuple of ndarrays"));
+        }
+    }
+
+    // the first array is the reference for dtype, ndim, and shape
+    ndarray_obj_t *_ndarray = MP_OBJ_TO_PTR(ndarrays->items[0]);
+    uint8_t dtype = _ndarray->dtype;
+    uint8_t ndim = _ndarray->ndim;
+
+    // validate the axis at full width: narrowing to int8_t first would let an
+    // out-of-range value, such as 256, wrap around to a valid axis
+    mp_int_t axis_arg = args[1].u_int;
+    if(axis_arg < 0) {
+        axis_arg += ndim;
+    }
+    if((axis_arg < 0) || (axis_arg >= ndim)) {
+        mp_raise_ValueError(translate("wrong axis specified"));
+    }
+    // shift axis: shapes and strides are right-aligned in their containers
+    int8_t axis = (int8_t)(ULAB_MAX_DIMS - ndim + axis_arg);
+
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    for(uint8_t j=0; j < ULAB_MAX_DIMS; j++) {
+        shape[j] = _ndarray->shape[j];
+    }
+
+    // size_t counter: a narrower type would never reach the end of a long tuple
+    for(size_t i=1; i < ndarrays->len; i++) {
+        _ndarray = MP_OBJ_TO_PTR(ndarrays->items[i]);
+        // check, whether the arrays are compatible
+        if((dtype != _ndarray->dtype) || (ndim != _ndarray->ndim)) {
+            mp_raise_ValueError(translate("input arrays are not compatible"));
+        }
+        for(uint8_t j=0; j < ULAB_MAX_DIMS; j++) {
+            if(j == axis) {
+                // lengths add up along the concatenation axis
+                shape[j] += _ndarray->shape[j];
+            } else {
+                if(shape[j] != _ndarray->shape[j]) {
+                    mp_raise_ValueError(translate("input arrays are not compatible"));
+                }
+            }
+        }
+    }
+
+    ndarray_obj_t *target = ndarray_new_dense_ndarray(ndim, shape, dtype);
+    // tpos marks the position in the target, where the next source begins
+    uint8_t *tpos = (uint8_t *)target->array;
+    uint8_t *tarray;
+
+    for(size_t p=0; p < ndarrays->len; p++) {
+        // reset the pointer along the axis
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(ndarrays->items[p]);
+        if(source->len == 0) {
+            // the do-while loops below copy at least one item, so an empty
+            // source must not enter them; it contributes nothing to the target
+            continue;
+        }
+        uint8_t *sarray = (uint8_t *)source->array;
+        tarray = tpos;
+
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        memcpy(tarray, sarray, source->itemsize);
+                        tarray += target->strides[ULAB_MAX_DIMS - 1];
+                        sarray += source->strides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    // rewind the innermost axis, and step along the next one
+                    tarray -= target->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                    tarray += target->strides[ULAB_MAX_DIMS - 2];
+                    sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                    sarray += source->strides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+                #endif
+            #if ULAB_MAX_DIMS > 2
+                tarray -= target->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                tarray += target->strides[ULAB_MAX_DIMS - 3];
+                sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                sarray += source->strides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+            #endif
+        #if ULAB_MAX_DIMS > 3
+            tarray -= target->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+            tarray += target->strides[ULAB_MAX_DIMS - 4];
+            sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+            sarray += source->strides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+        #endif
+        if(p < ndarrays->len - 1) {
+            // the next source starts right after this one along the axis
+            tpos += target->strides[axis] * source->shape[axis];
+        }
+    }
+    return MP_OBJ_FROM_PTR(target);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_concatenate_obj, 1, create_concatenate);
+#endif
+
+#if ULAB_MAX_DIMS > 1
+#if ULAB_NUMPY_HAS_DIAG
+//| def diag(a: ulab.numpy.ndarray, *, k: int = 0) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: a
+//|       an ndarray
+//|     .. param: k
+//|       Offset of the diagonal from the main diagonal. Can be positive or negative.
+//|
+//|     Return specified diagonals."""
+//|     ...
+//|
+
+mp_obj_t create_diag(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { // diag(a, *, k=0): builds a square matrix from a 1-D array, or extracts a diagonal of a 2-D array
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, // a: the input, must be an ndarray
+        { MP_QSTR_k, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = 0 } }, // k: offset of the diagonal, 0 by default
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("input must be an ndarray"));
+    }
+    ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0].u_obj);
+    if(source->ndim == 1) { // return a rank-2 tensor with the prescribed diagonal; k is not consulted on this path
+        ndarray_obj_t *target = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, source->len, source->len), source->dtype);
+        uint8_t *sarray = (uint8_t *)source->array;
+        uint8_t *tarray = (uint8_t *)target->array;
+        for(size_t i=0; i < source->len; i++) {
+            memcpy(tarray, sarray, source->itemsize);
+            sarray += source->strides[ULAB_MAX_DIMS - 1]; // the source is walked by its own stride
+            tarray += (source->len + 1) * target->itemsize; // skip len + 1 items: one row down and one column to the right in the len x len target
+        }
+        return MP_OBJ_FROM_PTR(target);
+    }
+    if(source->ndim > 2) {
+        mp_raise_TypeError(translate("input must be a tensor of rank 2"));
+    }
+    int32_t k = args[1].u_int;
+    size_t len = 0; // number of items on the requested diagonal; remains 0, if the offset is out of range
+    uint8_t *sarray = (uint8_t *)source->array;
+    if(k < 0) { // move the pointer "vertically"
+        if(-k < (int32_t)source->shape[ULAB_MAX_DIMS - 2]) {
+            sarray -= k * source->strides[ULAB_MAX_DIMS - 2]; // k is negative here, so this advances the pointer by -k rows
+            len = MIN(source->shape[ULAB_MAX_DIMS - 2] + k, source->shape[ULAB_MAX_DIMS - 1]); // rows left below the offset, capped by the number of columns
+        }
+    } else { // move the pointer "horizontally"
+        if(k < (int32_t)source->shape[ULAB_MAX_DIMS - 1]) {
+            sarray += k * source->strides[ULAB_MAX_DIMS - 1]; // advance by k columns
+            len = MIN(source->shape[ULAB_MAX_DIMS - 1] - k, source->shape[ULAB_MAX_DIMS - 2]); // columns left to the right of the offset, capped by the number of rows
+        }
+    }
+
+    if(len == 0) { // raised, when neither branch above could assign a positive length
+        mp_raise_ValueError(translate("offset is too large"));
+    }
+
+    ndarray_obj_t *target = ndarray_new_linear_array(len, source->dtype);
+    uint8_t *tarray = (uint8_t *)target->array;
+
+    for(size_t i=0; i < len; i++) {
+        memcpy(tarray, sarray, source->itemsize);
+        sarray += source->strides[ULAB_MAX_DIMS - 2]; // one row down ...
+        sarray += source->strides[ULAB_MAX_DIMS - 1]; // ... and one column to the right
+        tarray += source->itemsize; // the target is filled contiguously
+    }
+    return MP_OBJ_FROM_PTR(target);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_diag_obj, 1, create_diag);
+#endif /* ULAB_NUMPY_HAS_DIAG */
+
+#if ULAB_NUMPY_HAS_EMPTY
+// This function is bound in numpy.c to numpy.zeros(), and is simply an alias for that
+
+//| def empty(shape: Union[int, Tuple[int, ...]], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|    """
+//|    .. param: shape
+//|       Shape of the array, either an integer (for a 1-D array) or a tuple of 2 integers (for a 2-D array)
+//|    .. param: dtype
+//|       Type of values in the array
+//|
+//|    Return a new array of the given shape with all elements set to 0. An alias for numpy.zeros."""
+//|    ...
+//|
+#endif
+
+#if ULAB_NUMPY_HAS_EYE
+//| def eye(size: int, *, M: Optional[int] = None, k: int = 0, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|     """Return a new square array of size, with the diagonal elements set to 1
+//|        and the other elements set to 0. If k is given, the diagonal is shifted by the specified amount."""
+//|     ...
+//|
+
+// Implements eye(size, *, M=None, k=0, dtype=float): returns a size x M array
+// (M defaults to size) with ones on the k-th diagonal, and zeros elsewhere.
+//
+// A positive k selects a diagonal above the main one, a negative k one below it.
+// If the offset lies outside the array, no element is set.
+// Raises ValueError, if size or M is negative.
+mp_obj_t create_eye(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_INT, { .u_int = 0 } },
+        { MP_QSTR_M, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_k, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = 0 } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    mp_int_t rows = args[0].u_int;
+    mp_int_t columns = rows;
+    if(args[1].u_rom_obj != mp_const_none) {
+        columns = mp_obj_get_int(args[1].u_rom_obj);
+    }
+    // negative sizes would otherwise be cast to huge unsigned values
+    if((rows < 0) || (columns < 0)) {
+        mp_raise_ValueError(translate("negative dimensions are not allowed"));
+    }
+    size_t n = (size_t)rows, m = (size_t)columns;
+    // magnitude of the offset; its sign decides the direction below
+    size_t k = args[2].u_int > 0 ? (size_t)args[2].u_int : (size_t)(-args[2].u_int);
+    uint8_t dtype = args[3].u_int;
+
+    ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, n, m), dtype);
+    if(dtype == NDARRAY_BOOL) {
+        // Booleans are written through the uint8 setter
+        dtype = NDARRAY_UINT8;
+    }
+    mp_obj_t one = mp_obj_new_int(1);
+
+    // the diagonal starts in the first row for k >= 0, and in the first column otherwise
+    size_t row = (args[2].u_int >= 0) ? 0 : k;
+    size_t column = (args[2].u_int >= 0) ? k : 0;
+    // both indices have to be bounded: for non-square arrays, checking only one
+    // of them would write beyond the end of the array
+    for(; (row < n) && (column < m); row++, column++) {
+        ndarray_set_value(dtype, ndarray->array, row * m + column, one);
+    }
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_eye_obj, 1, create_eye);
+#endif /* ULAB_NUMPY_HAS_EYE */
+#endif /* ULAB_MAX_DIMS > 1 */
+
+#if ULAB_NUMPY_HAS_FULL
+//| def full(shape: Union[int, Tuple[int, ...]], fill_value: Union[_float, _bool], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|    """
+//|    .. param: shape
+//|       Shape of the array, either an integer (for a 1-D array) or a tuple of integers (for tensors of higher rank)
+//|    .. param: fill_value
+//|       scalar, the value with which the array is filled
+//|    .. param: dtype
+//|       Type of values in the array
+//|
+//|    Return a new array of the given shape with all elements set to ``fill_value``."""
+//|    ...
+//|
+
+// Implements full(shape, fill_value, *, dtype=float): parses the arguments, and
+// hands them over to create_zeros_ones_full, which allocates and fills the array.
+mp_obj_t create_full(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    // argument slots, in the order of the table below
+    enum { ARG_shape, ARG_fill_value, ARG_dtype, NUM_ARGS };
+    static const mp_arg_t arg_table[NUM_ARGS] = {
+        [ARG_shape] = { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } },
+        [ARG_fill_value] = { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } },
+        [ARG_dtype] = { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+    mp_arg_val_t parsed[NUM_ARGS];
+    mp_arg_parse_all(n_args, pos_args, kw_args, NUM_ARGS, arg_table, parsed);
+
+    const uint8_t typecode = parsed[ARG_dtype].u_int;
+    return create_zeros_ones_full(parsed[ARG_shape].u_obj, typecode, parsed[ARG_fill_value].u_obj);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_full_obj, 0, create_full);
+#endif
+
+
+#if ULAB_NUMPY_HAS_LINSPACE
+//| def linspace(
+//|     start: _float,
+//|     stop: _float,
+//|     *,
+//|     dtype: _DType = ulab.numpy.float,
+//|     num: int = 50,
+//|     endpoint: _bool = True,
+//|     retstep: _bool = False
+//| ) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: start
+//|       First value in the array
+//|     .. param: stop
+//|       Final value in the array
+//|     .. param int: num
+//|       Count of values in the array.
+//|     .. param: dtype
+//|       Type of values in the array
+//|     .. param bool: endpoint
+//|       Whether the ``stop`` value is included.  Note that even when
+//|       endpoint=True, the exact ``stop`` value may not be included due to the
+//|       inaccuracy of floating point arithmetic.
+//|     .. param bool: retstep
+//|       If True, return (`samples`, `step`), where `step` is the spacing between samples.
+//|
+//|     Return a new 1-D array with ``num`` elements ranging from ``start`` to ``stop`` linearly."""
+//|     ...
+//|
+
+// Implements linspace(start, stop, num=50, *, endpoint=True, retstep=False,
+// dtype=float): returns num linearly spaced samples beginning at start.
+//
+// With endpoint set, the spacing is (stop - start) / (num - 1), otherwise
+// (stop - start) / num. If start or stop is complex (and complex support is
+// compiled in), the output is a complex array, and dtype is not consulted.
+// With retstep set, the tuple (samples, step) is returned instead of the array.
+// Raises ValueError, if num is smaller than 2.
+mp_obj_t create_linspace(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_num, MP_ARG_INT, { .u_int = 50 } },
+        { MP_QSTR_endpoint, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_true } },
+        { MP_QSTR_retstep, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_false } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    if(args[2].u_int < 2) {
+        mp_raise_ValueError(translate("number of points must be at least 2"));
+    }
+    size_t len = (size_t)args[2].u_int;
+
+    // evaluate the flags by truthiness: an identity check against the True/False
+    // singletons would treat endpoint=1 as false, and retstep=0 as true
+    const bool endpoint = mp_obj_is_true(args[3].u_obj);
+    const bool retstep = mp_obj_is_true(args[4].u_obj);
+    // number of intervals, into which the distance stop - start is divided
+    const mp_float_t intervals = endpoint ? (mp_float_t)(len - 1) : (mp_float_t)len;
+
+    mp_float_t step = MICROPY_FLOAT_CONST(0.0);
+    ndarray_obj_t *ndarray = NULL;
+
+    #if ULAB_SUPPORTS_COMPLEX
+    mp_float_t step_real = MICROPY_FLOAT_CONST(0.0), step_imag = MICROPY_FLOAT_CONST(0.0);
+    bool complex_out = false;
+
+    if(mp_obj_is_type(args[0].u_obj, &mp_type_complex) || mp_obj_is_type(args[1].u_obj, &mp_type_complex)) {
+        complex_out = true;
+        ndarray = ndarray_new_linear_array(len, NDARRAY_COMPLEX);
+        // complex items are stored as consecutive (real, imaginary) pairs
+        mp_float_t *array = (mp_float_t *)ndarray->array;
+        mp_float_t start_real, start_imag;
+        mp_float_t stop_real, stop_imag;
+
+        mp_obj_get_complex(args[0].u_obj, &start_real, &start_imag);
+        mp_obj_get_complex(args[1].u_obj, &stop_real, &stop_imag);
+        step_real = (stop_real - start_real) / intervals;
+        step_imag = (stop_imag - start_imag) / intervals;
+
+        for(size_t i = 0; i < len; i++) {
+            *array++ = start_real;
+            *array++ = start_imag;
+            start_real += step_real;
+            start_imag += step_imag;
+        }
+    } else {
+    #endif
+        mp_float_t start = mp_obj_get_float(args[0].u_obj);
+        mp_float_t stop = mp_obj_get_float(args[1].u_obj);
+
+        uint8_t typecode = args[5].u_int;
+
+        step = (stop - start) / intervals;
+        if(!endpoint) {
+            // the last sample falls one step short of the requested stop
+            stop = start + step * (len - 1);
+        }
+
+        ndarray = create_linspace_arange(start, step, stop, len, typecode);
+    #if ULAB_SUPPORTS_COMPLEX
+    }
+    #endif
+
+    if(!retstep) {
+        return MP_OBJ_FROM_PTR(ndarray);
+    }
+
+    mp_obj_t tuple[2];
+    // convert the pointer explicitly, as is done for every other returned ndarray
+    tuple[0] = MP_OBJ_FROM_PTR(ndarray);
+    #if ULAB_SUPPORTS_COMPLEX
+    if(complex_out) {
+        tuple[1] = mp_obj_new_complex(step_real, step_imag);
+    } else {
+        tuple[1] = mp_obj_new_float(step);
+    }
+    #else /* ULAB_SUPPORTS_COMPLEX */
+    tuple[1] = mp_obj_new_float(step);
+    #endif
+
+    return mp_obj_new_tuple(2, tuple);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_linspace_obj, 2, create_linspace);
+#endif
+
+#if ULAB_NUMPY_HAS_LOGSPACE
+//| def logspace(
+//|     start: _float,
+//|     stop: _float,
+//|     *,
+//|     dtype: _DType = ulab.numpy.float,
+//|     num: int = 50,
+//|     endpoint: _bool = True,
+//|     base: _float = 10.0
+//| ) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: start
+//|       First value in the array
+//|     .. param: stop
+//|       Final value in the array
+//|     .. param int: num
+//|       Count of values in the array. Defaults to 50.
+//|     .. param: base
+//|       The base of the log space. The step size between the elements in
+//|       ``ln(samples) / ln(base)`` (or ``log_base(samples)``) is uniform. Defaults to 10.0.
+//|     .. param: dtype
+//|       Type of values in the array
+//|     .. param bool: endpoint
+//|       Whether the ``stop`` value is included.  Note that even when
+//|       endpoint=True, the exact ``stop`` value may not be included due to the
+//|       inaccuracy of floating point arithmetic. Defaults to True.
+//|
+//|     Return a new 1-D array with ``num`` evenly spaced elements on a log scale.
+//|     The sequence starts at ``base ** start``, and ends with ``base ** stop``."""
+//|     ...
+//|
+
+// default value of the base keyword of logspace()
+const mp_obj_float_t create_float_const_ten = {{&mp_type_float}, MICROPY_FLOAT_CONST(10.0)};
+
+// Implements logspace(start, stop, num=50, *, base=10.0, endpoint=True,
+// dtype=float): returns num samples, beginning at base ** start, with each
+// sample being the previous one multiplied by the constant base ** step.
+//
+// With endpoint set, step is (stop - start) / (num - 1), otherwise
+// (stop - start) / num. For integer dtypes, the samples are cast to the target
+// type; a Boolean array is filled with 1s.
+// Raises ValueError, if num is smaller than 2.
+mp_obj_t create_logspace(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_num, MP_ARG_INT, { .u_int = 50 } },
+        { MP_QSTR_base, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_PTR(&create_float_const_ten) } },
+        { MP_QSTR_endpoint, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_true } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    if(args[2].u_int < 2) {
+        mp_raise_ValueError(translate("number of points must be at least 2"));
+    }
+    size_t len = (size_t)args[2].u_int;
+    mp_float_t start = mp_obj_get_float(args[0].u_obj);
+    uint8_t dtype = args[5].u_int;
+    mp_float_t base = mp_obj_get_float(args[3].u_obj);
+    mp_float_t stop = mp_obj_get_float(args[1].u_obj);
+    // evaluate the flag by truthiness: an identity check against the True
+    // singleton would treat e.g. endpoint=1 as false
+    mp_float_t intervals = mp_obj_is_true(args[4].u_obj) ? (mp_float_t)(len - 1) : (mp_float_t)len;
+    mp_float_t step = (stop - start) / intervals;
+    // ratio of two consecutive samples
+    mp_float_t quotient = MICROPY_FLOAT_C_FUN(pow)(base, step);
+    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, dtype);
+
+    mp_float_t value = MICROPY_FLOAT_C_FUN(pow)(base, start);
+    if(ndarray->dtype == NDARRAY_UINT8) {
+        uint8_t *array = (uint8_t *)ndarray->array;
+        if(ndarray->boolean) {
+            // Boolean output: every item is set to 1
+            memset(array, 1, len);
+        } else {
+            for(size_t i=0; i < len; i++, value *= quotient) *array++ = (uint8_t)value;
+        }
+    } else if(ndarray->dtype == NDARRAY_INT8) {
+        int8_t *array = (int8_t *)ndarray->array;
+        for(size_t i=0; i < len; i++, value *= quotient) *array++ = (int8_t)value;
+    } else if(ndarray->dtype == NDARRAY_UINT16) {
+        uint16_t *array = (uint16_t *)ndarray->array;
+        for(size_t i=0; i < len; i++, value *= quotient) *array++ = (uint16_t)value;
+    } else if(ndarray->dtype == NDARRAY_INT16) {
+        int16_t *array = (int16_t *)ndarray->array;
+        for(size_t i=0; i < len; i++, value *= quotient) *array++ = (int16_t)value;
+    } else {
+        mp_float_t *array = (mp_float_t *)ndarray->array;
+        for(size_t i=0; i < len; i++, value *= quotient) *array++ = value;
+    }
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_logspace_obj, 2, create_logspace);
+#endif
+
+#if ULAB_NUMPY_HAS_ONES
+//| def ones(shape: Union[int, Tuple[int, ...]], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|    """
+//|    .. param: shape
+//|       Shape of the array, either an integer (for a 1-D array) or a tuple of 2 integers (for a 2-D array)
+//|    .. param: dtype
+//|       Type of values in the array
+//|
+//|    Return a new array of the given shape with all elements set to 1."""
+//|    ...
+//|
+
+// Implements ones(shape, *, dtype=float): parses the arguments, and lets
+// create_zeros_ones_full fill the new array with the integer 1.
+mp_obj_t create_ones(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    // argument slots, in the order of the table below
+    enum { ARG_shape, ARG_dtype, NUM_ARGS };
+    static const mp_arg_t arg_table[NUM_ARGS] = {
+        [ARG_shape] = { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } },
+        [ARG_dtype] = { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+    mp_arg_val_t parsed[NUM_ARGS];
+    mp_arg_parse_all(n_args, pos_args, kw_args, NUM_ARGS, arg_table, parsed);
+
+    const uint8_t typecode = parsed[ARG_dtype].u_int;
+    return create_zeros_ones_full(parsed[ARG_shape].u_obj, typecode, mp_obj_new_int(1));
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_ones_obj, 0, create_ones);
+#endif
+
+#if ULAB_NUMPY_HAS_ZEROS
+//| def zeros(shape: Union[int, Tuple[int, ...]], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|    """
+//|    .. param: shape
+//|       Shape of the array, either an integer (for a 1-D array) or a tuple of 2 integers (for a 2-D array)
+//|    .. param: dtype
+//|       Type of values in the array
+//|
+//|    Return a new array of the given shape with all elements set to 0."""
+//|    ...
+//|
+
+// Implements zeros(shape, *, dtype=float): parses the arguments, and calls
+// create_zeros_ones_full with mp_const_none, i.e., without a fill value.
+mp_obj_t create_zeros(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    // argument slots, in the order of the table below
+    enum { ARG_shape, ARG_dtype, NUM_ARGS };
+    static const mp_arg_t arg_table[NUM_ARGS] = {
+        [ARG_shape] = { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } },
+        [ARG_dtype] = { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+    mp_arg_val_t parsed[NUM_ARGS];
+    mp_arg_parse_all(n_args, pos_args, kw_args, NUM_ARGS, arg_table, parsed);
+
+    const uint8_t typecode = parsed[ARG_dtype].u_int;
+    return create_zeros_ones_full(parsed[ARG_shape].u_obj, typecode, mp_const_none);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_zeros_obj, 0, create_zeros);
+#endif
+
+#if ULAB_NUMPY_HAS_FROMBUFFER
+// Implements frombuffer(buffer, *, dtype=float, count=-1, offset=0): returns a
+// 1-D ndarray, whose data pointer refers to the memory of buffer, beginning
+// offset bytes into it. No data are copied.
+//
+// The bytes after offset must amount to a whole number of items. A positive
+// count limits the array to that many items, any other value takes all of them.
+// Raises ValueError, if offset is beyond the buffer, if the remaining size is
+// not a multiple of the item size, or if count asks for more items than there are.
+// Returns None, if the first argument does not support the buffer protocol.
+mp_obj_t create_frombuffer(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(NDARRAY_FLOAT) } },
+        { MP_QSTR_count, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(-1) } },
+        { MP_QSTR_offset, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(0) } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    uint8_t dtype = mp_obj_get_int(args[1].u_obj);
+    size_t offset = mp_obj_get_int(args[3].u_obj);
+
+    mp_buffer_info_t bufinfo;
+    if(!mp_get_buffer(args[0].u_obj, &bufinfo, MP_BUFFER_READ)) {
+        // not an error: objects without the buffer protocol simply yield None
+        return mp_const_none;
+    }
+    size_t sz = ulab_binary_get_size(dtype);
+
+    // a negative offset wraps around to a huge size_t, and is rejected here, too
+    if(bufinfo.len < offset) {
+        mp_raise_ValueError(translate("offset must be non-negative and no greater than buffer length"));
+    }
+    size_t len = (bufinfo.len - offset) / sz;
+    if((len * sz) != (bufinfo.len - offset)) {
+        mp_raise_ValueError(translate("buffer size must be a multiple of element size"));
+    }
+    mp_int_t count = mp_obj_get_int(args[2].u_obj);
+    if(count > 0) {
+        if(len < (size_t)count) {
+            mp_raise_ValueError(translate("buffer is smaller than requested size"));
+        }
+        len = (size_t)count;
+    }
+
+    ndarray_obj_t *ndarray = m_new_obj(ndarray_obj_t);
+    ndarray->base.type = &ulab_ndarray_type;
+    // Booleans are stored as uint8, and flagged separately
+    ndarray->dtype = dtype == NDARRAY_BOOL ? NDARRAY_UINT8 : dtype;
+    ndarray->boolean = dtype == NDARRAY_BOOL ? NDARRAY_BOOLEAN : NDARRAY_NUMERIC;
+    ndarray->ndim = 1;
+    ndarray->len = len;
+    ndarray->itemsize = sz;
+    // the object is assembled by hand, so the unused leading axes have to be
+    // cleared explicitly: other routines (e.g., concatenate) read every axis
+    for(uint8_t j = 0; j < ULAB_MAX_DIMS; j++) {
+        ndarray->shape[j] = 0;
+        ndarray->strides[j] = 0;
+    }
+    ndarray->shape[ULAB_MAX_DIMS - 1] = len;
+    ndarray->strides[ULAB_MAX_DIMS - 1] = sz;
+
+    // the array shares the memory of the buffer
+    uint8_t *buffer = bufinfo.buf;
+    ndarray->array = buffer + offset;
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_frombuffer_obj, 1, create_frombuffer);
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/create.h b/circuitpython/extmod/ulab/code/numpy/create.h
new file mode 100644
index 0000000..18f636c
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/create.h
@@ -0,0 +1,79 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020 Jeff Epler for Adafruit Industries
+ *               2019-2021 Zoltán Vörös
+*/
+
+#ifndef _CREATE_
+#define _CREATE_
+
+#include "../ulab.h"
+#include "../ndarray.h"
+
+#if ULAB_NUMPY_HAS_ARANGE
+mp_obj_t create_arange(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_arange_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_CONCATENATE
+mp_obj_t create_concatenate(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_concatenate_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_DIAG
+mp_obj_t create_diag(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_diag_obj);
+#endif
+
+#if ULAB_MAX_DIMS > 1
+#if ULAB_NUMPY_HAS_EYE
+mp_obj_t create_eye(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_eye_obj);
+#endif
+#endif
+
+#if ULAB_NUMPY_HAS_FULL
+mp_obj_t create_full(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_full_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_LINSPACE
+mp_obj_t create_linspace(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_linspace_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_LOGSPACE
+mp_obj_t create_logspace(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_logspace_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_ONES
+mp_obj_t create_ones(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_ones_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_ZEROS
+mp_obj_t create_zeros(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_zeros_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_FROMBUFFER
+mp_obj_t create_frombuffer(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_frombuffer_obj);
+#endif
+
+#define ARANGE_LOOP(type_, ndarray, len, step, stop) /* writes len values of type_ into (ndarray)->array */ \
+({ /* statement-expression form: a block in parentheses */\
+    type_ *array = (type_ *)(ndarray)->array;\
+    for (size_t i = 0; i < (len) - 1; i++, (value) += (step)) { /* `value` is not a macro parameter: it is read from, and advanced in, the expanding scope */\
+        *array++ = (type_)(value);\
+    } /* NOTE(review): (len) - 1 wraps around if len is an unsigned 0 -- confirm that callers never pass an empty range */\
+    *array = (type_)(stop); /* the last element is written from stop, not from the running value */\
+})
+
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/fft/fft.c b/circuitpython/extmod/ulab/code/numpy/fft/fft.c
new file mode 100644
index 0000000..27cb79c
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/fft/fft.c
@@ -0,0 +1,102 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+ *               2020 Scott Shawcroft for Adafruit Industries
+ *               2020 Taku Fukada
+*/
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/runtime.h"
+#include "py/builtin.h"
+#include "py/binary.h"
+#include "py/obj.h"
+#include "py/objarray.h"
+
+#include "../carray/carray_tools.h"
+#include "fft.h"
+
+//| """Frequency-domain functions"""
+//|
+//| import ulab.numpy
+
+
+//| def fft(r: ulab.numpy.ndarray, c: Optional[ulab.numpy.ndarray] = None) -> Tuple[ulab.numpy.ndarray, ulab.numpy.ndarray]:
+//|     """
+//|     :param ulab.numpy.ndarray r: A 1-dimension array of values whose size is a power of 2
+//|     :param ulab.numpy.ndarray c: An optional 1-dimension array of values whose size is a power of 2, giving the complex part of the value
+//|     :return tuple (r, c): The real and complex parts of the FFT
+//|
+//|     Perform a Fast Fourier Transform from the time domain into the frequency domain
+//|
+//|     See also ~ulab.extras.spectrum, which computes the magnitude of the fft,
+//|     rather than separately returning its real and imaginary parts."""
+//|     ...
+//|
+#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+// Forward transform, single-argument flavour: the argument goes to the shared helper as is.
+static mp_obj_t fft_fft(mp_obj_t arg) {
+    mp_obj_t transformed = fft_fft_ifft_spectrogram(arg, FFT_FFT);
+    return transformed;
+}
+MP_DEFINE_CONST_FUN_OBJ_1(fft_fft_obj, fft_fft);
+#else
+// Forward transform, split flavour: args[0] is the real part, the optional args[1] the
+// imaginary part; mp_const_none is forwarded, when no imaginary part was given.
+static mp_obj_t fft_fft(size_t n_args, const mp_obj_t *args) {
+    const mp_obj_t imag = (n_args == 2) ? args[1] : mp_const_none;
+    return fft_fft_ifft_spectrogram(n_args, args[0], imag, FFT_FFT);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(fft_fft_obj, 1, 2, fft_fft);
+#endif
+
+//| def ifft(r: ulab.numpy.ndarray, c: Optional[ulab.numpy.ndarray] = None) -> Tuple[ulab.numpy.ndarray, ulab.numpy.ndarray]:
+//|     """
+//|     :param ulab.numpy.ndarray r: A 1-dimension array of values whose size is a power of 2
+//|     :param ulab.numpy.ndarray c: An optional 1-dimension array of values whose size is a power of 2, giving the complex part of the value
+//|     :return tuple (r, c): The real and complex parts of the inverse FFT
+//|
+//|     Perform an Inverse Fast Fourier Transform from the frequency domain into the time domain"""
+//|     ...
+//|
+
+#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+// Inverse transform, single-argument flavour: the argument goes to the shared helper as is.
+static mp_obj_t fft_ifft(mp_obj_t arg) {
+    mp_obj_t restored = fft_fft_ifft_spectrogram(arg, FFT_IFFT);
+    return restored;
+}
+MP_DEFINE_CONST_FUN_OBJ_1(fft_ifft_obj, fft_ifft);
+#else
+// Inverse transform, split flavour: args[0] is the real part, the optional args[1] the
+// imaginary part; mp_const_none is forwarded, when no imaginary part was given.
+static mp_obj_t fft_ifft(size_t n_args, const mp_obj_t *args) {
+    // NOTE(review): this macro is defined outside of this file; presumably a guard for complex-enabled builds -- confirm
+    NOT_IMPLEMENTED_FOR_COMPLEX()
+    const mp_obj_t imag = (n_args == 2) ? args[1] : mp_const_none;
+    return fft_fft_ifft_spectrogram(n_args, args[0], imag, FFT_IFFT);
+}
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(fft_ifft_obj, 1, 2, fft_ifft);
+#endif
+
+STATIC const mp_rom_map_elem_t ulab_fft_globals_table[] = { // names exported by the fft module
+    { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_fft) },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_fft), (mp_obj_t)&fft_fft_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_ifft), (mp_obj_t)&fft_ifft_obj },
+};
+// the table above, wrapped into a constant dictionary, serves as the globals of the module
+STATIC MP_DEFINE_CONST_DICT(mp_module_ulab_fft_globals, ulab_fft_globals_table);
+// the module object itself; fft.h declares it as extern
+const mp_obj_module_t ulab_fft_module = {
+    .base = { &mp_type_module },
+    .globals = (mp_obj_dict_t*)&mp_module_ulab_fft_globals,
+};
+MP_REGISTER_MODULE(MP_QSTR_ulab_dot_fft, ulab_fft_module, MODULE_ULAB_ENABLED && CIRCUITPY_ULAB);
diff --git a/circuitpython/extmod/ulab/code/numpy/fft/fft.h b/circuitpython/extmod/ulab/code/numpy/fft/fft.h
new file mode 100644
index 0000000..1e50a8d
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/fft/fft.h
@@ -0,0 +1,30 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+*/
+
+#ifndef _FFT_
+#define _FFT_
+
+#include "../../ulab.h"
+#include "../../ulab_tools.h"
+#include "../../ndarray.h"
+#include "fft_tools.h"
+
+extern const mp_obj_module_t ulab_fft_module;
+
+#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+MP_DECLARE_CONST_FUN_OBJ_1(fft_fft_obj); // one-argument objects, matching MP_DEFINE_CONST_FUN_OBJ_1 in fft.c
+MP_DECLARE_CONST_FUN_OBJ_1(fft_ifft_obj);
+#else
+MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(fft_fft_obj); // one or two arguments: real part, and optional imaginary part
+MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(fft_ifft_obj);
+#endif
+
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.c b/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.c
new file mode 100644
index 0000000..8a55927
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.c
@@ -0,0 +1,287 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+*/
+
+#include <math.h>
+#include <string.h>
+#include "py/runtime.h"
+
+#include "../../ndarray.h"
+#include "../../ulab_tools.h"
+#include "../carray/carray_tools.h"
+#include "fft_tools.h"
+
+#ifndef MP_PI
+#define MP_PI MICROPY_FLOAT_CONST(3.14159265358979323846)
+#endif
+#ifndef MP_E
+#define MP_E MICROPY_FLOAT_CONST(2.71828182845904523536)
+#endif
+
+/* Kernel implementation for the case, when ulab has no complex support
+
+ * The following function takes two arrays, namely, the real and imaginary
+ * parts of a complex array, and calculates the Fourier transform in place.
+ *
+ * The function is basically a modification of four1 from Numerical Recipes,
+ * has no dependencies beyond micropython itself (for the definition of mp_float_t),
+ * and can be used independent of ulab.
+ */
+
+#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+/* Kernel implementation for the complex case. Data are contained in data as
+
+    data[0], data[1], data[2], data[3], .... , data[2n - 2], data[2n-1]
+    real[0], imag[0], real[1], imag[1], .... , real[n-1],    imag[n-1]
+
+    In general real[i] = data[2i], and imag[i] = data[2i+1].
+    data:  2n floats, transformed in place
+    n:     number of complex points (the caller in this file checks that n & (n - 1) == 0)
+    isign: sign factor entering the rotation angle theta; this file passes 1 (forward) or -1 (inverse)
+*/
+void fft_kernel_complex(mp_float_t *data, size_t n, int isign) {
+    size_t j, m, mmax, istep;
+    mp_float_t tempr, tempi;
+    mp_float_t wtemp, wr, wpr, wpi, wi, theta;
+    // first stage: reorder the points; the pairs at i and j are exchanged, whenever j > i
+    j = 0;
+    for(size_t i = 0; i < n; i++) {
+        if (j > i) {
+            SWAP(mp_float_t, data[2*i], data[2*j]);
+            SWAP(mp_float_t, data[2*i+1], data[2*j+1]);
+        }
+        m = n >> 1;
+        while (j >= m && m > 0) { // work out the partner index for the next i
+            j -= m;
+            m >>= 1;
+        }
+        j += m;
+    }
+    // second stage: combine points that are mmax apart; mmax doubles on each turn of the outer loop
+    mmax = 1;
+    while (n > mmax) {
+        istep = mmax << 1;
+        theta = MICROPY_FLOAT_CONST(-2.0)*isign*MP_PI/istep;
+        wtemp = MICROPY_FLOAT_C_FUN(sin)(MICROPY_FLOAT_CONST(0.5) * theta);
+        wpr = MICROPY_FLOAT_CONST(-2.0) * wtemp * wtemp; // -2 sin^2(theta/2), i.e., cos(theta) - 1
+        wpi = MICROPY_FLOAT_C_FUN(sin)(theta);
+        wr = MICROPY_FLOAT_CONST(1.0); // the factor (wr, wi) starts out as 1 + 0i
+        wi = MICROPY_FLOAT_CONST(0.0);
+        for(m = 0; m < mmax; m++) {
+            for(size_t i = m; i < n; i += istep) {
+                j = i + mmax;
+                tempr = wr * data[2*j] - wi * data[2*j+1]; // (tempr, tempi) is the complex product of (wr, wi) and point j
+                tempi = wr * data[2*j+1] + wi * data[2*j];
+                data[2*j] = data[2*i] - tempr; // point j becomes the difference, point i the sum
+                data[2*j+1] = data[2*i+1] - tempi;
+                data[2*i] += tempr;
+                data[2*i+1] += tempi;
+            }
+            wtemp = wr; // the old wr is still needed for the update of wi
+            wr = wr*wpr - wi*wpi + wr; // multiply (wr, wi) by (1 + wpr, wpi)
+            wi = wi*wpr + wtemp*wpi + wi;
+        }
+        mmax = istep;
+    }
+}
+
+/*
+ * Helper behind fft, ifft, and spectrogram for builds with numpy-compatible complex support.
+ * It has been factored out from fft.c, so that the same argument parsing
+ * routine can be called from scipy.signal.spectrogram.
+ * data_in: one-dimensional ndarray, whose length has to pass the power-of-2 check below
+ * type:    FFT_FFT, FFT_SPECTROGRAM, or anything else for the inverse transform
+ * Returns a new complex ndarray with the transform, or, for FFT_SPECTROGRAM, a new float
+ * ndarray with the magnitudes of the transform. The input array is only read.
+ * Raises NotImplementedError, TypeError, or ValueError, if the input fails the checks.
+ */
+mp_obj_t fft_fft_ifft_spectrogram(mp_obj_t data_in, uint8_t type) {
+    if(!mp_obj_is_type(data_in, &ulab_ndarray_type)) {
+        mp_raise_NotImplementedError(translate("FFT is defined for ndarrays only"));
+    }
+    ndarray_obj_t *in = MP_OBJ_TO_PTR(data_in);
+    #if ULAB_MAX_DIMS > 1
+    if(in->ndim != 1) {
+        mp_raise_TypeError(translate("FFT is implemented for linear arrays only"));
+    }
+    #endif
+    size_t len = in->len;
+    // Check if input is of length of power of 2
+    if((len & (len-1)) != 0) {
+        mp_raise_ValueError(translate("input array length must be power of 2"));
+    }
+
+    ndarray_obj_t *out = ndarray_new_linear_array(len, NDARRAY_COMPLEX);
+    mp_float_t *data = (mp_float_t *)out->array; // stays at the start of the work buffer; elements are addressed by index
+    uint8_t *array = (uint8_t *)in->array;
+
+    if(in->dtype == NDARRAY_COMPLEX) {
+        for(size_t i = 0; i < len; i++) {
+            // copy the (real, imaginary) pair into slot i of the work buffer
+            memcpy(&data[2 * i], array, 2 * sizeof(mp_float_t));
+            array += in->strides[ULAB_MAX_DIMS - 1];
+        }
+    } else {
+        mp_float_t (*func)(void *) = ndarray_get_float_function(in->dtype);
+        for(size_t i = 0; i < len; i++) {
+            // real part; the imaginary part is 0, no need to assign
+            data[2 * i] = func(array);
+            array += in->strides[ULAB_MAX_DIMS - 1];
+        }
+    }
+    if((type == FFT_FFT) || (type == FFT_SPECTROGRAM)) {
+        fft_kernel_complex(data, len, 1);
+        if(type == FFT_SPECTROGRAM) {
+            ndarray_obj_t *spectrum = ndarray_new_linear_array(len, NDARRAY_FLOAT);
+            mp_float_t *sarray = (mp_float_t *)spectrum->array;
+            for(size_t i = 0; i < len; i++) {
+                sarray[i] = MICROPY_FLOAT_C_FUN(sqrt)(data[2*i] * data[2*i] + data[2*i+1] * data[2*i+1]);
+            }
+            m_del(mp_float_t, data, 2 * len); // the complex work buffer is not returned; data is still its start
+            return MP_OBJ_FROM_PTR(spectrum);
+        }
+    } else { // inverse transform
+        fft_kernel_complex(data, len, -1);
+        // TODO: numpy accepts the norm keyword argument
+        for(size_t i = 0; i < 2 * len; i++) { // all 2 * len floats: real and imaginary parts alike
+            data[i] /= len;
+        }
+    }
+    return MP_OBJ_FROM_PTR(out);
+}
+#else /* ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE */
+void fft_kernel(mp_float_t *real, mp_float_t *imag, size_t n, int isign) { // in-place transform of n points, held in the separate arrays real and imag
+    size_t j, m, mmax, istep;
+    mp_float_t tempr, tempi;
+    mp_float_t wtemp, wr, wpr, wpi, wi, theta;
+    // first stage: reorder the points; the entries at i and j are exchanged, whenever j > i
+    j = 0;
+    for(size_t i = 0; i < n; i++) {
+        if (j > i) {
+            SWAP(mp_float_t, real[i], real[j]);
+            SWAP(mp_float_t, imag[i], imag[j]);
+        }
+        m = n >> 1;
+        while (j >= m && m > 0) { // work out the partner index for the next i
+            j -= m;
+            m >>= 1;
+        }
+        j += m;
+    }
+    // second stage: combine points that are mmax apart; mmax doubles on each turn of the outer loop
+    mmax = 1;
+    while (n > mmax) {
+        istep = mmax << 1;
+        theta = MICROPY_FLOAT_CONST(-2.0)*isign*MP_PI/istep; // isign enters as a sign factor; callers in this file pass 1 or -1
+        wtemp = MICROPY_FLOAT_C_FUN(sin)(MICROPY_FLOAT_CONST(0.5) * theta);
+        wpr = MICROPY_FLOAT_CONST(-2.0) * wtemp * wtemp; // -2 sin^2(theta/2), i.e., cos(theta) - 1
+        wpi = MICROPY_FLOAT_C_FUN(sin)(theta);
+        wr = MICROPY_FLOAT_CONST(1.0); // the factor (wr, wi) starts out as 1 + 0i
+        wi = MICROPY_FLOAT_CONST(0.0);
+        for(m = 0; m < mmax; m++) {
+            for(size_t i = m; i < n; i += istep) {
+                j = i + mmax;
+                tempr = wr * real[j] - wi * imag[j]; // (tempr, tempi) is the complex product of (wr, wi) and point j
+                tempi = wr * imag[j] + wi * real[j];
+                real[j] = real[i] - tempr; // point j becomes the difference, point i the sum
+                imag[j] = imag[i] - tempi;
+                real[i] += tempr;
+                imag[i] += tempi;
+            }
+            wtemp = wr; // the old wr is still needed for the update of wi
+            wr = wr*wpr - wi*wpi + wr; // multiply (wr, wi) by (1 + wpr, wpi)
+            wi = wi*wpr + wtemp*wpi + wi;
+        }
+        mmax = istep;
+    }
+}
+
+/*
+ * Helper behind fft, ifft, and spectrogram for builds, where the real and imaginary
+ * parts are kept in two separate float arrays.
+ *
+ * n_args: arg_im is read only if this is 2
+ * arg_re: one-dimensional ndarray with the real part; its length has to pass the power-of-2 check
+ * arg_im: one-dimensional ndarray of the same length with the imaginary part
+ * type:   FFT_FFT, FFT_SPECTROGRAM, or anything else for the inverse transform
+ * Returns a (real, imaginary) tuple of new float ndarrays, or, for FFT_SPECTROGRAM, a single
+ * new float ndarray with the magnitudes.
+ */
+mp_obj_t fft_fft_ifft_spectrogram(size_t n_args, mp_obj_t arg_re, mp_obj_t arg_im, uint8_t type) {
+    if(!mp_obj_is_type(arg_re, &ulab_ndarray_type)) {
+        mp_raise_NotImplementedError(translate("FFT is defined for ndarrays only"));
+    }
+    if((n_args == 2) && !mp_obj_is_type(arg_im, &ulab_ndarray_type)) {
+        mp_raise_NotImplementedError(translate("FFT is defined for ndarrays only"));
+    }
+    ndarray_obj_t *re = MP_OBJ_TO_PTR(arg_re);
+    // the dtype guard applies to linear arrays, too, hence it sits outside of the ndim test
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(re->dtype)
+    #if ULAB_MAX_DIMS > 1
+    if(re->ndim != 1) {
+        mp_raise_TypeError(translate("FFT is implemented for linear arrays only"));
+    }
+    #endif
+    size_t len = re->len;
+    // Check if input is of length of power of 2
+    if((len & (len-1)) != 0) {
+        mp_raise_ValueError(translate("input array length must be power of 2"));
+    }
+
+    ndarray_obj_t *out_re = ndarray_new_linear_array(len, NDARRAY_FLOAT);
+    mp_float_t *data_re = (mp_float_t *)out_re->array;
+    uint8_t *array = (uint8_t *)re->array;
+    mp_float_t (*func)(void *) = ndarray_get_float_function(re->dtype);
+
+    for(size_t i=0; i < len; i++) {
+        data_re[i] = func(array);
+        array += re->strides[ULAB_MAX_DIMS - 1];
+    }
+    ndarray_obj_t *out_im = ndarray_new_linear_array(len, NDARRAY_FLOAT);
+    mp_float_t *data_im = (mp_float_t *)out_im->array;
+
+    if(n_args == 2) {
+        ndarray_obj_t *im = MP_OBJ_TO_PTR(arg_im);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(im->dtype)
+        #if ULAB_MAX_DIMS > 1
+        if(im->ndim != 1) {
+            mp_raise_TypeError(translate("FFT is implemented for linear arrays only"));
+        }
+        #endif
+        if (re->len != im->len) {
+            mp_raise_ValueError(translate("real and imaginary parts must be of equal length"));
+        }
+        array = (uint8_t *)im->array;
+        func = ndarray_get_float_function(im->dtype);
+        for(size_t i=0; i < len; i++) {
+            data_im[i] = func(array);
+            array += im->strides[ULAB_MAX_DIMS - 1];
+        }
+    }
+
+    if((type == FFT_FFT) || (type == FFT_SPECTROGRAM)) {
+        fft_kernel(data_re, data_im, len, 1);
+        if(type == FFT_SPECTROGRAM) {
+            for(size_t i=0; i < len; i++) { // the magnitudes overwrite the real part
+                data_re[i] = MICROPY_FLOAT_C_FUN(sqrt)(data_re[i] * data_re[i] + data_im[i] * data_im[i]);
+            }
+            return MP_OBJ_FROM_PTR(out_re);
+        }
+    } else { // inverse transform
+        fft_kernel(data_re, data_im, len, -1);
+        // TODO: numpy accepts the norm keyword argument
+        for(size_t i=0; i < len; i++) {
+            data_re[i] /= len;
+            data_im[i] /= len;
+        }
+    }
+    mp_obj_t tuple[2] = { MP_OBJ_FROM_PTR(out_re), MP_OBJ_FROM_PTR(out_im) };
+    return mp_obj_new_tuple(2, tuple);
+}
+#endif  /* ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE */
diff --git a/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.h b/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.h
new file mode 100644
index 0000000..9444232
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/fft/fft_tools.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+*/
+
+#ifndef _FFT_TOOLS_
+#define _FFT_TOOLS_
+
+enum FFT_TYPE { // operation selector passed to fft_fft_ifft_spectrogram
+    FFT_FFT, // forward transform
+    FFT_IFFT, // inverse transform, with the result divided by the length
+    FFT_SPECTROGRAM, // forward transform, followed by taking the magnitude of each point
+};
+
+#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+void fft_kernel_complex(mp_float_t *, size_t , int ); // interleaved (real, imaginary) data, number of complex points, sign factor
+mp_obj_t fft_fft_ifft_spectrogram(mp_obj_t , uint8_t ); // ndarray, FFT_TYPE selector
+#else
+void fft_kernel(mp_float_t *, mp_float_t *, size_t , int ); // real array, imaginary array, number of points, sign factor
+mp_obj_t fft_fft_ifft_spectrogram(size_t , mp_obj_t , mp_obj_t , uint8_t ); // argument count, real part, imaginary part, FFT_TYPE selector
+#endif /* ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE */
+
+#endif /* _FFT_TOOLS_ */
diff --git a/circuitpython/extmod/ulab/code/numpy/filter.c b/circuitpython/extmod/ulab/code/numpy/filter.c
new file mode 100644
index 0000000..057cd6d
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/filter.c
@@ -0,0 +1,132 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020 Jeff Epler for Adafruit Industries
+ *               2020 Scott Shawcroft for Adafruit Industries
+ *               2020-2021 Zoltán Vörös
+ *               2020 Taku Fukada
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+#include "py/misc.h"
+
+#include "../ulab.h"
+#include "../scipy/signal/signal.h"
+#include "carray/carray_tools.h"
+#include "filter.h"
+
+#if ULAB_NUMPY_HAS_CONVOLVE
+
+mp_obj_t filter_convolve(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { // convolution of the two linear arrays a and v
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_a, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        { MP_QSTR_v, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+    };
+    // returns a new linear array of length len(a) + len(v) - 1; float, or complex, if either input is complex
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    // TypeError is raised for arguments that are not ndarrays, not linear, or empty
+    if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type) || !mp_obj_is_type(args[1].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("convolve arguments must be ndarrays"));
+    }
+
+    ndarray_obj_t *a = MP_OBJ_TO_PTR(args[0].u_obj);
+    ndarray_obj_t *c = MP_OBJ_TO_PTR(args[1].u_obj);
+    // deal with linear arrays only
+    #if ULAB_MAX_DIMS > 1
+    if((a->ndim != 1) || (c->ndim != 1)) {
+        mp_raise_TypeError(translate("convolve arguments must be linear arrays"));
+    }
+    #endif
+    size_t len_a = a->len;
+    size_t len_c = c->len;
+    if(len_a == 0 || len_c == 0) {
+        mp_raise_TypeError(translate("convolve arguments must not be empty"));
+    }
+
+    int len = len_a + len_c - 1; // convolve mode "full"
+    int32_t off = len_c - 1; // the shift k below runs from -off to len_a - 1
+    uint8_t dtype = NDARRAY_FLOAT;
+
+    #if ULAB_SUPPORTS_COMPLEX
+    if((a->dtype == NDARRAY_COMPLEX) || (c->dtype == NDARRAY_COMPLEX)) {
+        dtype = NDARRAY_COMPLEX;
+    }
+    #endif
+    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, dtype);
+    mp_float_t *array = (mp_float_t *)ndarray->array;
+
+    uint8_t *aarray = (uint8_t *)a->array;
+    uint8_t *carray = (uint8_t *)c->array;
+    // strides, converted from bytes to element counts
+    int32_t as = a->strides[ULAB_MAX_DIMS - 1] / a->itemsize;
+    int32_t cs = c->strides[ULAB_MAX_DIMS - 1] / c->itemsize;
+
+
+    #if ULAB_SUPPORTS_COMPLEX
+    if(dtype == NDARRAY_COMPLEX) {
+        mp_float_t a_real, a_imag;
+        mp_float_t c_real, c_imag = MICROPY_FLOAT_CONST(0.0);
+        for(int32_t k = -off; k < len-off; k++) {
+            mp_float_t accum_real = MICROPY_FLOAT_CONST(0.0);
+            mp_float_t accum_imag = MICROPY_FLOAT_CONST(0.0);
+            // n is limited such that n + k is a valid position in a, and len_c - n - 1 a valid position in c
+            int32_t top_n = MIN(len_c, len_a - k);
+            int32_t bot_n = MAX(-k, 0);
+
+            for(int32_t n = bot_n; n < top_n; n++) {
+                int32_t idx_c = (len_c - n - 1) * cs; // c is traversed backwards
+                int32_t idx_a = (n + k) * as;
+                if(a->dtype != NDARRAY_COMPLEX) {
+                    a_real = ndarray_get_float_index(aarray, a->dtype, idx_a);
+                    a_imag = MICROPY_FLOAT_CONST(0.0);
+                } else { // a complex entry is read as two consecutive floats, hence the factor of 2
+                    a_real = ndarray_get_float_index(aarray, NDARRAY_FLOAT, 2 * idx_a);
+                    a_imag = ndarray_get_float_index(aarray, NDARRAY_FLOAT, 2 * idx_a + 1);
+                }
+
+                if(c->dtype != NDARRAY_COMPLEX) {
+                    c_real = ndarray_get_float_index(carray, c->dtype, idx_c);
+                    c_imag = MICROPY_FLOAT_CONST(0.0);
+                } else {
+                    c_real = ndarray_get_float_index(carray, NDARRAY_FLOAT, 2 * idx_c);
+                    c_imag = ndarray_get_float_index(carray, NDARRAY_FLOAT, 2 * idx_c + 1);
+                }
+                accum_real += a_real * c_real - a_imag * c_imag; // complex product of the a and c entries
+                accum_imag += a_real * c_imag + a_imag * c_real;
+            }
+            *array++ = accum_real; // the output is interleaved: real part first, imaginary part second
+            *array++ = accum_imag;
+        }
+        return MP_OBJ_FROM_PTR(ndarray);
+    }
+    #endif
+    // real case: one output value per shift k
+    for(int32_t k = -off; k < len-off; k++) {
+        mp_float_t accum = MICROPY_FLOAT_CONST(0.0);
+        int32_t top_n = MIN(len_c, len_a - k); // same limits on n as in the complex branch
+        int32_t bot_n = MAX(-k, 0);
+        for(int32_t n = bot_n; n < top_n; n++) {
+            int32_t idx_c = (len_c - n - 1) * cs; // c is traversed backwards
+            int32_t idx_a = (n + k) * as;
+            mp_float_t ai = ndarray_get_float_index(aarray, a->dtype, idx_a);
+            mp_float_t ci = ndarray_get_float_index(carray, c->dtype, idx_c);
+            accum += ai * ci;
+        }
+        *array++ = accum;
+    }
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+// function object taking at least two arguments, which may also be given as the keywords a and v
+MP_DEFINE_CONST_FUN_OBJ_KW(filter_convolve_obj, 2, filter_convolve);
+
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/filter.h b/circuitpython/extmod/ulab/code/numpy/filter.h
new file mode 100644
index 0000000..d6d0f17
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/filter.h
@@ -0,0 +1,20 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020 Jeff Epler for Adafruit Industries
+ *               2020-2021 Zoltán Vörös
+*/
+
+#ifndef _FILTER_
+#define _FILTER_
+
+#include "../ulab.h"
+#include "../ndarray.h"
+
+MP_DECLARE_CONST_FUN_OBJ_KW(filter_convolve_obj);
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/linalg/linalg.c b/circuitpython/extmod/ulab/code/numpy/linalg/linalg.c
new file mode 100644
index 0000000..11dc7de
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/linalg/linalg.c
@@ -0,0 +1,541 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+ *               2020 Scott Shawcroft for Adafruit Industries
+ *               2020 Roberto Colistete Jr.
+ *               2020 Taku Fukada
+ *
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+#include "py/misc.h"
+
+#include "../../ulab.h"
+#include "../../ulab_tools.h"
+#include "../carray/carray_tools.h"
+#include "linalg.h"
+
+#if ULAB_NUMPY_HAS_LINALG_MODULE
+//|
+//| import ulab.numpy
+//|
+//| """Linear algebra functions"""
+//|
+
+#if ULAB_MAX_DIMS > 1
+//| def cholesky(A: ulab.numpy.ndarray) -> ulab.numpy.ndarray:
+//|     """
+//|     :param ~ulab.numpy.ndarray A: a positive definite, symmetric square matrix
+//|     :return ~ulab.numpy.ndarray L: a square root matrix in the lower triangular form
+//|     :raises ValueError: If the input does not fulfill the necessary conditions
+//|
+//|     The returned matrix satisfies the equation A=LL*"""
+//|     ...
+//|
+
+static mp_obj_t linalg_cholesky(mp_obj_t oin) { // returns a new float matrix with the lower triangular factor of the input
+    ndarray_obj_t *ndarray = tools_object_is_square(oin);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+    ndarray_obj_t *L = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, ndarray->shape[ULAB_MAX_DIMS - 1], ndarray->shape[ULAB_MAX_DIMS - 1]), NDARRAY_FLOAT);
+    mp_float_t *Larray = (mp_float_t *)L->array;
+    // L doubles as the work space: it first receives a float copy of the input, and is then overwritten with the factor
+    size_t N = ndarray->shape[ULAB_MAX_DIMS - 1];
+    uint8_t *array = (uint8_t *)ndarray->array;
+    mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+    // copy the input into L, following the strides of the input
+    for(size_t m=0; m < N; m++) { // rows
+        for(size_t n=0; n < N; n++) { // columns
+            *Larray++ = func(array);
+            array += ndarray->strides[ULAB_MAX_DIMS - 1];
+        }
+        array -= ndarray->strides[ULAB_MAX_DIMS - 1] * N; // back to the beginning of the row,
+        array += ndarray->strides[ULAB_MAX_DIMS - 2]; // and on to the next one
+    }
+    Larray -= N*N; // re-wind the pointer
+    // make sure the matrix is symmetric
+    for(size_t m=0; m < N; m++) { // rows
+        for(size_t n=m+1; n < N; n++) { // columns
+            // compare entry (m, n) to (n, m)
+            if(LINALG_EPSILON < MICROPY_FLOAT_C_FUN(fabs)(Larray[m * N + n] - Larray[n * N + m])) {
+                mp_raise_ValueError(translate("input matrix is asymmetric"));
+            }
+        }
+    }
+    // clear the entries above the diagonal
+    // this is actually not needed, but Cholesky in numpy returns the lower triangular matrix
+    for(size_t i=0; i < N; i++) { // rows
+        for(size_t j=i+1; j < N; j++) { // columns
+            Larray[i*N + j] = MICROPY_FLOAT_CONST(0.0);
+        }
+    }
+    mp_float_t sum = 0.0;
+    for(size_t i=0; i < N; i++) { // rows
+        for(size_t j=0; j <= i; j++) { // columns
+            sum = Larray[i * N + j]; // entry (i, j), less the products of the entries already computed in rows i and j
+            for(size_t k=0; k < j; k++) {
+                sum -= Larray[i * N + k] * Larray[j * N + k];
+            }
+            if(i == j) {
+                if(sum <= MICROPY_FLOAT_CONST(0.0)) { // the square root below requires a positive value
+                    mp_raise_ValueError(translate("matrix is not positive definite"));
+                } else {
+                    Larray[i * N + i] = MICROPY_FLOAT_C_FUN(sqrt)(sum);
+                }
+            } else {
+                Larray[i * N + j] = sum / Larray[j * N + j]; // divide by the diagonal entry of row j
+            }
+        }
+    }
+    return MP_OBJ_FROM_PTR(L);
+}
+// one-argument function object wrapping linalg_cholesky
+MP_DEFINE_CONST_FUN_OBJ_1(linalg_cholesky_obj, linalg_cholesky);
+
+//| def det(m: ulab.numpy.ndarray) -> float:
+//|     """
+//|     :param: m, a square matrix
+//|     :return float: The determinant of the matrix
+//|
+//|     Computes the determinant of a square matrix"""
+//|     ...
+//|
+
+static mp_obj_t linalg_det(mp_obj_t oin) { // returns the determinant of a square matrix as a float object
+    ndarray_obj_t *ndarray = tools_object_is_square(oin);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+    uint8_t *array = (uint8_t *)ndarray->array;
+    size_t N = ndarray->shape[ULAB_MAX_DIMS - 1];
+    mp_float_t *tmp = m_new(mp_float_t, N * N); // dense float working copy, addressed as tmp[row * N + column]
+    for(size_t m=0; m < N; m++) { // rows
+        for(size_t n=0; n < N; n++) { // columns
+            *tmp++ = ndarray_get_float_value(array, ndarray->dtype);
+            array += ndarray->strides[ULAB_MAX_DIMS - 1];
+        }
+        array -= ndarray->strides[ULAB_MAX_DIMS - 1] * N; // back to the beginning of the row,
+        array += ndarray->strides[ULAB_MAX_DIMS - 2]; // and on to the next one
+    }
+
+    // re-wind the pointer
+    tmp -= N*N;
+
+    mp_float_t c;
+    mp_float_t det_sign = 1.0; // flipped on each exchange of rows
+    // NOTE(review): N - 1 wraps around for N == 0 -- confirm that tools_object_is_square rejects empty input
+    for(size_t m=0; m < N-1; m++){
+        if(MICROPY_FLOAT_C_FUN(fabs)(tmp[m * (N+1)]) < LINALG_EPSILON) { // the diagonal entry (m, m) is treated as zero
+            size_t m1 = m + 1;
+            for(; m1 < N; m1++) {
+                if(!(MICROPY_FLOAT_C_FUN(fabs)(tmp[m1*N+m]) < LINALG_EPSILON)) {
+                    // row m1 has a usable entry in column m: exchange rows m and m1
+                    for(size_t m2=0; m2 < N; m2++) {
+                        mp_float_t swapVal = tmp[m*N+m2];
+                        tmp[m*N+m2] = tmp[m1*N+m2];
+                        tmp[m1*N+m2] = swapVal;
+                    }
+                    det_sign = -det_sign;
+                    break;
+                }
+            }
+            if (m1 >= N) { // no row could be exchanged: the result is reported as 0
+                m_del(mp_float_t, tmp, N * N);
+                return mp_obj_new_float(0.0);
+            }
+        }
+        for(size_t n=0; n < N; n++) { // subtract a multiple of row m from every other row, so that their entries in column m vanish
+            if(m != n) {
+                c = tmp[N * n + m] / tmp[m * (N+1)];
+                for(size_t k=0; k < N; k++){
+                    tmp[N * n + k] -= c * tmp[N * m + k];
+                }
+            }
+        }
+    }
+    mp_float_t det = det_sign;
+    // the result is the signed product of the diagonal entries
+    for(size_t m=0; m < N; m++){
+        det *= tmp[m * (N+1)];
+    }
+    m_del(mp_float_t, tmp, N * N);
+    return mp_obj_new_float(det);
+}
+// one-argument function object wrapping linalg_det
+MP_DEFINE_CONST_FUN_OBJ_1(linalg_det_obj, linalg_det);
+
+#endif
+
+#if ULAB_MAX_DIMS > 1
+//| def eig(m: ulab.numpy.ndarray) -> Tuple[ulab.numpy.ndarray, ulab.numpy.ndarray]:
+//|     """
+//|     :param m: a square matrix
+//|     :return tuple (eigenvalues, eigenvectors):
+//|
+//|     Computes the eigenvalues and eigenvectors of a square matrix"""
+//|     ...
+//|
+
+// Eigenvalues and eigenvectors of a symmetric square matrix, computed by linalg_jacobi_rotations.
+// Returns the tuple (eigenvalues, eigenvectors). Raises ValueError, if two mirrored entries
+// differ by more than LINALG_EPSILON, or if linalg_jacobi_rotations reports 0 iterations.
+static mp_obj_t linalg_eig(mp_obj_t oin) {
+    ndarray_obj_t *in = tools_object_is_square(oin);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(in->dtype)
+    uint8_t *iarray = (uint8_t *)in->array;
+    size_t S = in->shape[ULAB_MAX_DIMS - 1];
+    // dense float working copy of the input, addressed as array[row * S + column]
+    mp_float_t *array = m_new(mp_float_t, S*S);
+    for(size_t i=0; i < S; i++) { // rows
+        for(size_t j=0; j < S; j++) { // columns
+            array[i * S + j] = ndarray_get_float_value(iarray, in->dtype);
+            iarray += in->strides[ULAB_MAX_DIMS - 1];
+        }
+        iarray -= in->strides[ULAB_MAX_DIMS - 1] * S;
+        iarray += in->strides[ULAB_MAX_DIMS - 2];
+    }
+    // make sure the matrix is symmetric
+    for(size_t m=0; m < S; m++) {
+        for(size_t n=m+1; n < S; n++) {
+            // compare entry (m, n) to (n, m)
+            // TODO: this must probably be scaled!
+            if(LINALG_EPSILON < MICROPY_FLOAT_C_FUN(fabs)(array[m * S + n] - array[n * S + m])) {
+                m_del(mp_float_t, array, S * S); // release the working copy, as on the other error path
+                mp_raise_ValueError(translate("input matrix is asymmetric"));
+            }
+        }
+    }
+
+    // if we got this far, then the matrix will be symmetric
+    ndarray_obj_t *eigenvectors = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, S, S), NDARRAY_FLOAT);
+    mp_float_t *eigvectors = (mp_float_t *)eigenvectors->array;
+
+    size_t iterations = linalg_jacobi_rotations(array, eigvectors, S);
+    if(iterations == 0) {
+        // the computation did not converge; numpy raises LinAlgError
+        m_del(mp_float_t, array, S * S);
+        mp_raise_ValueError(translate("iterations did not converge"));
+    }
+    ndarray_obj_t *eigenvalues = ndarray_new_linear_array(S, NDARRAY_FLOAT);
+    mp_float_t *eigvalues = (mp_float_t *)eigenvalues->array;
+    for(size_t i=0; i < S; i++) {
+        eigvalues[i] = array[i * (S + 1)]; // diagonal entry (i, i) of the working copy
+    }
+    m_del(mp_float_t, array, S * S);
+
+    mp_obj_t items[2] = { MP_OBJ_FROM_PTR(eigenvalues), MP_OBJ_FROM_PTR(eigenvectors) };
+    return mp_obj_new_tuple(2, items);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(linalg_eig_obj, linalg_eig);
+
+//| def inv(m: ulab.numpy.ndarray) -> ulab.numpy.ndarray:
+//|     """
+//|     :param ~ulab.numpy.ndarray m: a square matrix
+//|     :return: The inverse of the matrix, if it exists
+//|     :raises ValueError: if the matrix is not invertible
+//|
+//|     Computes the inverse of a square matrix"""
+//|     ...
+//|
+// Returns a new dense float ndarray holding the inverse of the square matrix o_in.
+// The entries of the input are first copied into the result array, which is then
+// inverted in place by linalg_invert_matrix().
+// Raises ValueError, if linalg_invert_matrix() reports that the matrix is singular.
+static mp_obj_t linalg_inv(mp_obj_t o_in) {
+    ndarray_obj_t *source = tools_object_is_square(o_in);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(source->dtype)
+    const size_t N = source->shape[ULAB_MAX_DIMS - 1];
+
+    ndarray_obj_t *inverted = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, N, N), NDARRAY_FLOAT);
+    mp_float_t *dense = (mp_float_t *)inverted->array;
+
+    // resolve the dtype once, so that every entry can be read as a float
+    mp_float_t (*to_float)(void *) = ndarray_get_float_function(source->dtype);
+
+    // walk the (possibly strided) input row by row, and store it contiguously
+    uint8_t *row_start = (uint8_t *)source->array;
+    for(size_t row = 0; row < N; row++) {
+        uint8_t *entry = row_start;
+        for(size_t column = 0; column < N; column++) {
+            dense[row * N + column] = to_float(entry);
+            entry += source->strides[ULAB_MAX_DIMS - 1];
+        }
+        row_start += source->strides[ULAB_MAX_DIMS - 2];
+    }
+
+    if(!linalg_invert_matrix(dense, N)) {
+        mp_raise_ValueError(translate("input matrix is singular"));
+    }
+    return MP_OBJ_FROM_PTR(inverted);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(linalg_inv_obj, linalg_inv);
+#endif
+
+//| def norm(x: ulab.numpy.ndarray) -> float:
+//|    """
+//|    :param ~ulab.numpy.ndarray x: a vector or a matrix
+//|
+//|    Computes the 2-norm of a vector or a matrix, i.e., ``sqrt(sum(x*x))``, however, without the RAM overhead."""
+//|    ...
+//|
+
+// Computes the 2-norm, i.e., the square root of the sum of the squares, of the input.
+//
+// positional argument: a tuple, list, range, or ndarray
+// keyword argument axis: defaults to None; it is used with ndarrays only, where it is
+//                        handed over to tools_reduce_axes()
+// returns: a float for tuples, lists, and ranges, as well as for ndarrays, when the
+//          results array has ndim == 0; otherwise, the ndarray of the results;
+//          None, if the input is of any other type
+static mp_obj_t linalg_norm(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none} } ,
+        { MP_QSTR_axis, MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    mp_obj_t x = args[0].u_obj;
+    mp_obj_t axis = args[1].u_obj;
+
+    // dot holds the running mean of the squares of the values seen so far, and
+    // count is one more than the number of those values
+    mp_float_t dot = 0.0, value;
+    size_t count = 1;
+
+    if(mp_obj_is_type(x, &mp_type_tuple) || mp_obj_is_type(x, &mp_type_list) || mp_obj_is_type(x, &mp_type_range)) {
+        mp_obj_iter_buf_t iter_buf;
+        mp_obj_t item, iterable = mp_getiter(x, &iter_buf);
+        while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
+            value = mp_obj_get_float(item);
+            // we could simply take the sum of value ** 2,
+            // but this method is numerically stable
+            dot = dot + (value * value - dot) / count++;
+        }
+        // mean of the squares times the number of items is the sum of the squares
+        return mp_obj_new_float(MICROPY_FLOAT_C_FUN(sqrt)(dot * (count - 1)));
+    } else if(mp_obj_is_type(x, &ulab_ndarray_type)) {
+        ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(x);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+        uint8_t *array = (uint8_t *)ndarray->array;
+        // always get a float, so that we don't have to resolve the dtype later
+        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+        // NOTE(review): tools_reduce_axes() is defined elsewhere; from its use here,
+        // shape[0]/strides[0] describe the axis that is summed over, and increment is
+        // the step of the results pointer -- confirm against its definition
+        shape_strides _shape_strides = tools_reduce_axes(ndarray, axis);
+        ndarray_obj_t *results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, NDARRAY_FLOAT);
+        mp_float_t *rarray = (mp_float_t *)results->array;
+
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    // the accumulators are re-set only, if an axis was given; with
+                    // axis=None, the running mean keeps spanning the whole array
+                    if(axis != mp_const_none) {
+                        count = 1;
+                        dot = 0.0;
+                    }
+                    do {
+                        value = func(array);
+                        dot = dot + (value * value - dot) / count++;
+                        array += _shape_strides.strides[0];
+                        l++;
+                    } while(l < _shape_strides.shape[0]);
+                    // store the norm of everything accumulated up to this point
+                    *rarray = MICROPY_FLOAT_C_FUN(sqrt)(dot * (count - 1));
+                #if ULAB_MAX_DIMS > 1
+                    rarray += _shape_strides.increment;
+                    // rewind the innermost axis, and step along the next one
+                    array -= _shape_strides.strides[0] * _shape_strides.shape[0];
+                    array += _shape_strides.strides[ULAB_MAX_DIMS - 1];
+                    k++;
+                } while(k < _shape_strides.shape[ULAB_MAX_DIMS - 1]);
+                #endif
+            #if ULAB_MAX_DIMS > 2
+                array -= _shape_strides.strides[ULAB_MAX_DIMS - 1] * _shape_strides.shape[ULAB_MAX_DIMS - 1];
+                array += _shape_strides.strides[ULAB_MAX_DIMS - 2];
+                j++;
+            } while(j < _shape_strides.shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 3
+            array -= _shape_strides.strides[ULAB_MAX_DIMS - 2] * _shape_strides.shape[ULAB_MAX_DIMS - 2];
+            array += _shape_strides.strides[ULAB_MAX_DIMS - 3];
+            i++;
+        } while(i < _shape_strides.shape[ULAB_MAX_DIMS - 3]);
+        #endif
+        // a 0-dimensional result is returned as a plain float
+        if(results->ndim == 0) {
+            return mp_obj_new_float(*rarray);
+        }
+        return results;
+    }
+    return mp_const_none; // we should never reach this point
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(linalg_norm_obj, 1, linalg_norm);
+
+#if ULAB_MAX_DIMS > 1
+//| def qr(m: ulab.numpy.ndarray) -> Tuple[ulab.numpy.ndarray, ulab.numpy.ndarray]:
+//|     """
+//|     :param m: a matrix
+//|     :return tuple (Q, R):
+//|
+//|     Factor the matrix a as QR, where Q is orthonormal and R is upper-triangular.
+//|     """
+//|     ...
+//|
+
+// Computes the QR decomposition of a 2D array by means of Givens rotations.
+//
+// positional argument: the matrix to factor, an ndarray with ndim == 2
+// keyword argument mode: the string "reduced" (default), or "complete"
+// returns: the tuple (Q, R). With m rows and n columns in the input, Q is m x m, and
+//          R is m x n in "complete" mode, while in "reduced" mode Q is m x K, and
+//          R is K x n, where K = min(m, n)
+// raises: TypeError, if the first argument is not an ndarray;
+//         ValueError, if the array is not 2D, or if mode is not one of the two strings
+static mp_obj_t linalg_qr(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_mode, MP_ARG_OBJ, { .u_rom_obj = MP_ROM_QSTR(MP_QSTR_reduced) } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("operation is defined for ndarrays only"));
+    }
+    ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0].u_obj);
+    if(source->ndim != 2) {
+        mp_raise_ValueError(translate("operation is defined for 2D arrays only"));
+    }
+
+    // validate the mode before any work is done. The string data must not be
+    // touched, unless the object is a string, and the length has to be compared, too:
+    // otherwise, memcmp could read past the end of a short string, and strings that
+    // merely begin with a valid mode would be accepted
+    if(!mp_obj_is_str(args[1].u_obj)) {
+        mp_raise_ValueError(translate("mode must be complete, or reduced"));
+    }
+    GET_STR_DATA_LEN(args[1].u_obj, mode, len);
+    bool complete = (len == 8) && (memcmp(mode, "complete", 8) == 0);
+    bool reduced = (len == 7) && (memcmp(mode, "reduced", 7) == 0);
+    if(!complete && !reduced) {
+        mp_raise_ValueError(translate("mode must be complete, or reduced"));
+    }
+
+    size_t m = source->shape[ULAB_MAX_DIMS - 2]; // rows
+    size_t n = source->shape[ULAB_MAX_DIMS - 1]; // columns
+
+    ndarray_obj_t *Q = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, m, m), NDARRAY_FLOAT);
+    ndarray_obj_t *R = ndarray_new_dense_ndarray(2, source->shape, NDARRAY_FLOAT);
+
+    mp_float_t *qarray = (mp_float_t *)Q->array;
+    mp_float_t *rarray = (mp_float_t *)R->array;
+
+    // simply copy the entries of source to a float array
+    mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype);
+    uint8_t *sarray = (uint8_t *)source->array;
+
+    for(size_t i = 0; i < m; i++) {
+        for(size_t j = 0; j < n; j++) {
+            rarray[i * n + j] = func(sarray);
+            sarray += source->strides[ULAB_MAX_DIMS - 1];
+        }
+        sarray -= n * source->strides[ULAB_MAX_DIMS - 1];
+        sarray += source->strides[ULAB_MAX_DIMS - 2];
+    }
+
+    // start with the unit matrix
+    for(size_t i = 0; i < m; i++) {
+        qarray[i * (m + 1)] = 1.0;
+    }
+
+    for(size_t j = 0; j < n; j++) { // columns
+        // the rows are visited from the bottom (m - 1) up to j + 1. The counter is
+        // kept one above the row index, so that it cannot wrap around, when m == 0
+        for(size_t i = m; i > j + 1; i--) {
+            size_t lower = i - 1;       // the row, in which r[lower, j] is eliminated
+            size_t upper = lower - 1;   // the row above; lower > j guarantees lower >= 1
+            mp_float_t a = rarray[upper * n + j]; // r[upper, j]
+            mp_float_t b = rarray[lower * n + j]; // r[lower, j]
+            mp_float_t c, s;
+            // Givens matrix: note that numpy uses a strange form of the rotation
+            // [[c  s],
+            //  [s -c]]
+            if(MICROPY_FLOAT_C_FUN(fabs)(b) < LINALG_EPSILON) {
+                c = (a >= 0.0) ? 1.0 : -1.0;
+                s = 0.0;
+            } else if(MICROPY_FLOAT_C_FUN(fabs)(a) < LINALG_EPSILON) {
+                c = 0.0;
+                s = (b >= 0.0) ? -1.0 : 1.0;
+            } else {
+                // always divide by the entry of larger magnitude, so that |t| <= 1
+                mp_float_t t, u;
+                if(MICROPY_FLOAT_C_FUN(fabs)(a) > MICROPY_FLOAT_C_FUN(fabs)(b)) {
+                    t = b / a;
+                    u = MICROPY_FLOAT_C_FUN(sqrt)(1 + t * t);
+                    c = -1.0 / u;
+                    s = c * t;
+                } else {
+                    t = a / b;
+                    u = MICROPY_FLOAT_C_FUN(sqrt)(1 + t * t);
+                    s = -1.0 / u;
+                    c = s * t;
+                }
+            }
+
+            mp_float_t r1, r2;
+            // update R: multiply with the rotation matrix from the left
+            for(size_t k = 0; k < n; k++) {
+                r1 = rarray[upper * n + k]; // r[upper, k]
+                r2 = rarray[lower * n + k]; // r[lower, k]
+                rarray[upper * n + k] = c * r1 + s * r2;
+                rarray[lower * n + k] = s * r1 - c * r2;
+            }
+
+            // update Q: multiply with the transpose of the rotation matrix from the right
+            for(size_t k = 0; k < m; k++) {
+                r1 = qarray[k * m + upper];
+                r2 = qarray[k * m + lower];
+                qarray[k * m + upper] = c * r1 + s * r2;
+                qarray[k * m + lower] = s * r1 - c * r2;
+            }
+        }
+    }
+
+    // number of columns of Q, and rows of R that are kept in reduced mode
+    size_t K = MIN(m, n);
+
+    mp_obj_tuple_t *tuple = MP_OBJ_TO_PTR(mp_obj_new_tuple(2, NULL));
+    if(complete || (K == m)) {
+        // for K == m, the reduced matrices have the shapes of the complete ones
+        tuple->items[0] = MP_OBJ_FROM_PTR(Q);
+        tuple->items[1] = MP_OBJ_FROM_PTR(R);
+    } else {
+        ndarray_obj_t *q = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, m, K), NDARRAY_FLOAT);
+        ndarray_obj_t *r = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, K, n), NDARRAY_FLOAT);
+        mp_float_t *qa = (mp_float_t *)q->array;
+        mp_float_t *ra = (mp_float_t *)r->array;
+        // keep the first K columns of Q...
+        for(size_t i = 0; i < m; i++) {
+            memcpy(qa, qarray, K * q->itemsize);
+            qa += K;
+            qarray += m;
+        }
+        // ... and the first K rows of R
+        for(size_t i = 0; i < K; i++) {
+            memcpy(ra, rarray, n * r->itemsize);
+            ra += n;
+            rarray += n;
+        }
+        tuple->items[0] = MP_OBJ_FROM_PTR(q);
+        tuple->items[1] = MP_OBJ_FROM_PTR(r);
+    }
+    return MP_OBJ_FROM_PTR(tuple);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(linalg_qr_obj, 1, linalg_qr);
+#endif
+
+// The attributes of the linalg module: the module name, followed by one entry per
+// function. Each function entry is compiled in only, if its ULAB_LINALG_HAS_* switch
+// is set; everything except norm additionally requires ULAB_MAX_DIMS > 1.
+STATIC const mp_rom_map_elem_t ulab_linalg_globals_table[] = {
+    { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_linalg) },
+    #if ULAB_MAX_DIMS > 1
+        #if ULAB_LINALG_HAS_CHOLESKY
+        { MP_ROM_QSTR(MP_QSTR_cholesky), (mp_obj_t)&linalg_cholesky_obj },
+        #endif
+        #if ULAB_LINALG_HAS_DET
+        { MP_ROM_QSTR(MP_QSTR_det), (mp_obj_t)&linalg_det_obj },
+        #endif
+        #if ULAB_LINALG_HAS_EIG
+        { MP_ROM_QSTR(MP_QSTR_eig), (mp_obj_t)&linalg_eig_obj },
+        #endif
+        #if ULAB_LINALG_HAS_INV
+        { MP_ROM_QSTR(MP_QSTR_inv), (mp_obj_t)&linalg_inv_obj },
+        #endif
+        #if ULAB_LINALG_HAS_QR
+        { MP_ROM_QSTR(MP_QSTR_qr), (mp_obj_t)&linalg_qr_obj },
+        #endif
+    #endif
+    #if ULAB_LINALG_HAS_NORM
+    { MP_ROM_QSTR(MP_QSTR_norm), (mp_obj_t)&linalg_norm_obj },
+    #endif
+};
+
+// wrap the table in a constant dictionary, which serves as the globals of the module
+STATIC MP_DEFINE_CONST_DICT(mp_module_ulab_linalg_globals, ulab_linalg_globals_table);
+
+// the module object itself; it is declared as extern in linalg.h
+const mp_obj_module_t ulab_linalg_module = {
+    .base = { &mp_type_module },
+    .globals = (mp_obj_dict_t*)&mp_module_ulab_linalg_globals,
+};
+// the registration is conditional on MODULE_ULAB_ENABLED && CIRCUITPY_ULAB
+MP_REGISTER_MODULE(MP_QSTR_ulab_dot_linalg, ulab_linalg_module, MODULE_ULAB_ENABLED && CIRCUITPY_ULAB);
+
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/linalg/linalg.h b/circuitpython/extmod/ulab/code/numpy/linalg/linalg.h
new file mode 100644
index 0000000..35fc403
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/linalg/linalg.h
@@ -0,0 +1,27 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+*/
+
+// Public interface of the linalg module: the module object, and the function
+// objects that are entered into its globals table.
+#ifndef _LINALG_
+#define _LINALG_
+
+#include "../../ulab.h"
+#include "../../ndarray.h"
+#include "linalg_tools.h"
+
+// the module object
+extern const mp_obj_module_t ulab_linalg_module;
+
+// function objects taking exactly one positional argument
+MP_DECLARE_CONST_FUN_OBJ_1(linalg_cholesky_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(linalg_det_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(linalg_eig_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(linalg_inv_obj);
+// function objects that also accept keyword arguments
+MP_DECLARE_CONST_FUN_OBJ_KW(linalg_norm_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(linalg_qr_obj);
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.c b/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.c
new file mode 100644
index 0000000..5e03a50
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.c
@@ -0,0 +1,171 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2010 Zoltán Vörös
+*/
+
+#include <math.h>
+#include <string.h>
+#include "py/runtime.h"
+
+#include "linalg_tools.h"
+
+/* 
+ * The following function inverts a matrix, whose entries are given in the input array 
+ * The function has no dependencies beyond micropython itself (for the definition of mp_float_t),
+ * and can be used independent of ulab.
+ */
+
+bool linalg_invert_matrix(mp_float_t *data, size_t N) {
+    // Inverts the N x N matrix stored row by row in data, in place.
+    // returns true, if the inversion was successful,
+    // false, if the matrix is singular (data is left partially transformed then)
+
+    // every row operation carried out on data is mirrored on this matrix, which
+    // starts out as the unit matrix, and holds the inverse at the end
+    const size_t entries = N * N;
+    mp_float_t *inverse = m_new(mp_float_t, entries);
+    for(size_t idx = 0; idx < entries; idx++) {
+        inverse[idx] = 0.0;
+    }
+    for(size_t diag = 0; diag < N; diag++) {
+        inverse[diag * (N + 1)] = 1.0;
+    }
+
+    for(size_t pivot = 0; pivot < N; pivot++) {
+        if(MICROPY_FLOAT_C_FUN(fabs)(data[pivot * (N + 1)]) < LINALG_EPSILON) {
+            // the diagonal entry is too small: find a row below with a usable
+            // entry in this column
+            size_t other = pivot + 1;
+            while((other < N) && (MICROPY_FLOAT_C_FUN(fabs)(data[other * N + pivot]) < LINALG_EPSILON)) {
+                other++;
+            }
+            if(other >= N) {
+                // no such row: the matrix is singular
+                m_del(mp_float_t, inverse, entries);
+                return false;
+            }
+            // swap the two rows in both matrices
+            for(size_t column = 0; column < N; column++) {
+                mp_float_t held = data[pivot * N + column];
+                data[pivot * N + column] = data[other * N + column];
+                data[other * N + column] = held;
+                held = inverse[pivot * N + column];
+                inverse[pivot * N + column] = inverse[other * N + column];
+                inverse[other * N + column] = held;
+            }
+        }
+        // clear the pivot column in every other row
+        for(size_t row = 0; row < N; row++) {
+            if(row == pivot) {
+                continue;
+            }
+            const mp_float_t factor = data[N * row + pivot] / data[pivot * (N + 1)];
+            for(size_t column = 0; column < N; column++) {
+                data[N * row + column] -= factor * data[N * pivot + column];
+                inverse[N * row + column] -= factor * inverse[N * pivot + column];
+            }
+        }
+    }
+    // data is diagonal at this point: scale each row by its diagonal entry
+    for(size_t row = 0; row < N; row++) {
+        const mp_float_t scale = data[row * (N + 1)];
+        for(size_t column = 0; column < N; column++) {
+            data[N * row + column] /= scale;
+            inverse[N * row + column] /= scale;
+        }
+    }
+    // hand the result back in the caller's array
+    memcpy(data, inverse, sizeof(mp_float_t) * entries);
+    m_del(mp_float_t, inverse, entries);
+    return true;
+}
+
+/* 
+ * The following function calculates the eigenvalues and eigenvectors of a symmetric 
+ * real matrix, whose entries are given in the input array. 
+ * The function has no dependencies beyond micropython itself (for the definition of mp_float_t),
+ * and can be used independent of ulab.
+ */
+
+size_t linalg_jacobi_rotations(mp_float_t *array, mp_float_t *eigvectors, size_t S) {
+    // Diagonalises the symmetric S x S matrix in array in place: in each pass, the
+    // off-diagonal entry of largest magnitude is rotated to zero. On success, the
+    // diagonal of array holds the eigenvalues, and the rotations accumulated in
+    // eigvectors hold the eigenvectors.
+    //
+    // returns a non-zero value, if no off-diagonal entry above LINALG_EPSILON is
+    // left (the number of unused passes, or 1, if the budget was just used up),
+    // and 0, if the budget of JACOBI_MAX * S * S passes ran out before that
+
+    // eigvectors should be a 0-array; start out with the unit matrix
+    for(size_t m=0; m < S; m++) {
+        eigvectors[m * (S+1)] = 1.0;
+    }
+    mp_float_t largest, w, t, c, s, tau, aMk, aNk, vm, vn;
+    size_t M, N;
+    size_t iterations = JACOBI_MAX * S * S;
+    if(iterations == 0) {
+        // S == 0: there is nothing to rotate, and the counter must not wrap around
+        return 1;
+    }
+    do {
+        iterations--;
+        // find the pivot here
+        M = 0;
+        N = 0;
+        largest = 0.0;
+        // no need to inspect the last row; m + 1 < S cannot underflow, unlike m < S - 1
+        for(size_t m=0; m + 1 < S; m++) {
+            for(size_t n=m+1; n < S; n++) {
+                w = MICROPY_FLOAT_C_FUN(fabs)(array[m * S + n]);
+                if((largest < w) && (LINALG_EPSILON < w)) {
+                    M = m;
+                    N = n;
+                    largest = w;
+                }
+            }
+        }
+        if(M + N == 0) { // all entries are smaller than epsilon, there is not much we can do...
+            // N > M holds for every pivot, hence, M + N == 0 means that none was found.
+            // This is the converged state, and it must not be reported as 0, even if
+            // it was detected in the very last pass
+            return (iterations > 0) ? iterations : 1;
+        }
+        // at this point, we have the pivot, and it is the entry (M, N)
+        // now we have to find the rotation angle
+        w = (array[N * S + N] - array[M * S + M]) / (MICROPY_FLOAT_CONST(2.0)*array[M * S + N]);
+        // The tangent is the root of t^2 + 2*w*t - 1 = 0 with the smaller absolute
+        // value, which is numerically stabler. It is computed as
+        // sign / (|w| + sqrt(w^2 + 1)): the equivalent form sqrt(w^2 + 1) - |w|
+        // cancels to zero for large |w|, which would then produce 0/0 in tau
+        t = MICROPY_FLOAT_CONST(1.0) / (MICROPY_FLOAT_C_FUN(fabs)(w) + MICROPY_FLOAT_C_FUN(sqrt)(w*w + MICROPY_FLOAT_CONST(1.0)));
+        if(!(w > 0)) {
+            t = -t;
+        }
+        s = t / MICROPY_FLOAT_C_FUN(sqrt)(t*t + MICROPY_FLOAT_CONST(1.0)); // the sine of the rotation angle
+        c = MICROPY_FLOAT_CONST(1.0) / MICROPY_FLOAT_C_FUN(sqrt)(t*t + MICROPY_FLOAT_CONST(1.0)); // the cosine of the rotation angle
+        // the tangent of the half of the rotation angle; s/(1 + c) is equal to
+        // (1 - c)/s, but the denominator cannot vanish, because c is positive
+        tau = s / (MICROPY_FLOAT_CONST(1.0) + c);
+
+        // at this point, we have the rotation angles, so we can transform the matrix
+        // first the two diagonal elements
+        // a(M, M) = a(M, M) - t*a(M, N)
+        array[M * S + M] = array[M * S + M] - t * array[M * S + N];
+        // a(N, N) = a(N, N) + t*a(M, N)
+        array[N * S + N] = array[N * S + N] + t * array[M * S + N];
+        // after the rotation, the a(M, N), and a(N, M) entries should become zero
+        array[M * S + N] = array[N * S + M] = MICROPY_FLOAT_CONST(0.0);
+        // then all other elements in the column
+        for(size_t k=0; k < S; k++) {
+            if((k == M) || (k == N)) {
+                continue;
+            }
+            aMk = array[M * S + k];
+            aNk = array[N * S + k];
+            // a(M, k) = a(M, k) - s*(a(N, k) + tau*a(M, k))
+            array[M * S + k] -= s * (aNk + tau * aMk);
+            // a(N, k) = a(N, k) + s*(a(M, k) - tau*a(N, k))
+            array[N * S + k] += s * (aMk - tau * aNk);
+            // a(k, M) = a(M, k)
+            array[k * S + M] = array[M * S + k];
+            // a(k, N) = a(N, k)
+            array[k * S + N] = array[N * S + k];
+        }
+        // now we have to update the eigenvectors
+        // the rotation matrix, R, multiplies from the right
+        // R is the unit matrix, except for the
+        // R(M,M) = R(N, N) = c
+        // R(N, M) = s
+        // (M, N) = -s
+        // entries. This means that only the Mth, and Nth columns will change
+        for(size_t m=0; m < S; m++) {
+            vm = eigvectors[m * S + M];
+            vn = eigvectors[m * S + N];
+            // the new value of eigvectors(m, M)
+            eigvectors[m * S + M] = c * vm - s * vn;
+            // the new value of eigvectors(m, N)
+            eigvectors[m * S + N] = s * vm + c * vn;
+        }
+    } while(iterations > 0);
+
+    // the budget ran out, while there was still a pivot to rotate away
+    return 0;
+}
diff --git a/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.h b/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.h
new file mode 100644
index 0000000..942da00
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/linalg/linalg_tools.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+*/
+
+#ifndef _TOOLS_TOOLS_
+#define _TOOLS_TOOLS_
+
+// LINALG_EPSILON is the threshold, below which the magnitude of a matrix entry is
+// treated as zero. It can be overridden by defining it before this header is
+// included; the defaults are roughly the machine epsilon of the float type in use.
+// Note that nothing is defined, if MICROPY_FLOAT_IMPL is neither FLOAT, nor DOUBLE.
+#ifndef LINALG_EPSILON
+#if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
+#define LINALG_EPSILON      MICROPY_FLOAT_CONST(1.2e-7)
+#elif MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_DOUBLE
+#define LINALG_EPSILON      MICROPY_FLOAT_CONST(2.3e-16)
+#endif
+#endif /* LINALG_EPSILON */
+
+// linalg_jacobi_rotations gives up after JACOBI_MAX * S * S passes for an S x S matrix
+#define JACOBI_MAX     20
+
+// inverts the N x N matrix in place; returns false, if the matrix is singular
+bool linalg_invert_matrix(mp_float_t *, size_t );
+// arguments: the symmetric matrix (transformed in place), the output array of the
+// eigenvectors, and the size of the matrix; a return value of 0 signals failure
+size_t linalg_jacobi_rotations(mp_float_t *, mp_float_t *, size_t );
+
+#endif /* _TOOLS_TOOLS_ */
+
diff --git a/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.c b/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.c
new file mode 100644
index 0000000..8704836
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.c
@@ -0,0 +1,66 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2021 Zoltán Vörös
+ *
+*/
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+
+#include "ndarray_iter.h"
+
+#ifdef NDARRAY_HAS_FLATITER
+mp_obj_t ndarray_flatiter_make_new(mp_obj_t self_in) {
+    // Creates a new flat iterator object over the array self_in; the iteration
+    // starts at the flat index 0.
+    ndarray_flatiter_t *flatiter = m_new_obj(ndarray_flatiter_t);
+    flatiter->base.type = &ndarray_flatiter_type;
+    flatiter->iternext = ndarray_flatiter_next;
+    flatiter->ndarray = MP_OBJ_TO_PTR(self_in);
+    flatiter->cur = 0;
+    // convert the pointer to an object explicitly, instead of relying on an implicit cast
+    return MP_OBJ_FROM_PTR(flatiter);
+}
+
+mp_obj_t ndarray_flatiter_next(mp_obj_t self_in) {
+    // Returns the item at the current flat index, where the elements are counted in
+    // row-major order (last axis varying fastest), and advances the iterator.
+    // Returns MP_OBJ_STOP_ITERATION, once ndarray->len items have been handed out.
+    ndarray_flatiter_t *self = MP_OBJ_TO_PTR(self_in);
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(self->ndarray);
+    uint8_t *array = (uint8_t *)ndarray->array;
+
+    if(self->cur >= ndarray->len) {
+        return MP_OBJ_STOP_ITERATION;
+    }
+    // unravel the flat index: beginning with the last axis, the remainder of the
+    // division by the length of the axis is the position along that axis, while
+    // the quotient is carried over to the next axis. Every axis of the array has
+    // to take part in this, so that arrays that are not dense are walked correctly.
+    // cur < len guarantees that none of the lengths is zero
+    size_t remainder = self->cur;
+    for(uint8_t axis = ULAB_MAX_DIMS; axis > ULAB_MAX_DIMS - ndarray->ndim; axis--) {
+        size_t length = ndarray->shape[axis - 1];
+        // the strides are signed byte offsets, hence, keep the product signed, too
+        array += (mp_int_t)(remainder % length) * ndarray->strides[axis - 1];
+        remainder /= length;
+    }
+    self->cur++;
+    return ndarray_get_item(ndarray, array);
+}
+
+mp_obj_t ndarray_new_flatiterator(mp_obj_t flatiter_in, mp_obj_iter_buf_t *iter_buf) {
+    // Sets up a fresh iteration over the array of flatiter_in: the state is kept in
+    // the buffer supplied by the caller, so it has to fit in there
+    assert(sizeof(ndarray_flatiter_t) <= sizeof(mp_obj_iter_buf_t));
+    ndarray_flatiter_t *origin = MP_OBJ_TO_PTR(flatiter_in);
+    ndarray_flatiter_t *state = (ndarray_flatiter_t *)iter_buf;
+    // always begin at the first element, irrespective of the position of flatiter_in
+    state->cur = 0;
+    state->ndarray = origin->ndarray;
+    state->iternext = ndarray_flatiter_next;
+    state->base.type = &mp_type_polymorph_iter;
+    return MP_OBJ_FROM_PTR(state);
+}
+
+// Thin wrapper around ndarray_new_flatiterator(): returns an iterator that lives in
+// iter_buf, and walks the array of the flat iterator object o_in from its beginning.
+mp_obj_t ndarray_get_flatiterator(mp_obj_t o_in, mp_obj_iter_buf_t *iter_buf) {
+    return ndarray_new_flatiterator(o_in, iter_buf);
+}
+#endif /* NDARRAY_HAS_FLATITER */
diff --git a/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.h b/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.h
new file mode 100644
index 0000000..b3fc48d
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/ndarray/ndarray_iter.h
@@ -0,0 +1,36 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020 Jeff Epler for Adafruit Industries
+ *               2020-2021 Zoltán Vörös
+*/
+
+#ifndef _NDARRAY_ITER_
+#define _NDARRAY_ITER_
+
+#include "py/runtime.h"
+#include "py/binary.h"
+#include "py/obj.h"
+#include "py/objarray.h"
+
+#include "../../ulab.h"
+#include "../../ndarray.h"
+
+// TODO: take simply mp_obj_ndarray_it_t from ndarray.c
+// State of a flat iterator over an ndarray.
+// NOTE(review): ndarray_new_flatiterator() sets base.type to mp_type_polymorph_iter,
+// so the position of iternext right after base presumably has to be kept -- confirm
+typedef struct _mp_obj_ndarray_flatiter_t {
+    mp_obj_base_t base;     // object header
+    mp_fun_1_t iternext;    // function producing the next item; ndarray_flatiter_next
+    mp_obj_t ndarray;       // the array that is iterated over
+    size_t cur;             // flat index of the item that is returned next
+} ndarray_flatiter_t;
+
+mp_obj_t ndarray_get_flatiterator(mp_obj_t , mp_obj_iter_buf_t *);
+mp_obj_t ndarray_flatiter_make_new(mp_obj_t );
+mp_obj_t ndarray_flatiter_next(mp_obj_t );
+
+#endif
+\ No newline at end of file
diff --git a/circuitpython/extmod/ulab/code/numpy/numerical.c b/circuitpython/extmod/ulab/code/numpy/numerical.c
new file mode 100644
index 0000000..d6983c0
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/numerical.c
@@ -0,0 +1,1402 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+ *               2020 Scott Shawcroft for Adafruit Industries
+ *               2020 Taku Fukada
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/objint.h"
+#include "py/runtime.h"
+#include "py/builtin.h"
+#include "py/misc.h"
+
+#include "../ulab.h"
+#include "../ulab_tools.h"
+#include "./carray/carray_tools.h"
+#include "numerical.h"
+
+// Selects the operation that a shared helper has to carry out; numerical_all_any(),
+// e.g., receives one of these values in its optype argument.
+enum NUMERICAL_FUNCTION_TYPE {
+    NUMERICAL_ALL,
+    NUMERICAL_ANY,
+    NUMERICAL_ARGMAX,
+    NUMERICAL_ARGMIN,
+    NUMERICAL_MAX,
+    NUMERICAL_MEAN,
+    NUMERICAL_MIN,
+    NUMERICAL_STD,
+    NUMERICAL_SUM,
+};
+
+//| """Numerical and Statistical functions
+//|
+//| Most of these functions take an "axis" argument, which indicates whether to
+//| operate over the flattened array (None), or a particular axis (integer)."""
+//|
+//| from typing import Dict
+//|
+//| _ArrayLike = Union[ndarray, List[_float], Tuple[_float], range]
+//|
+//| _DType = int
+//| """`ulab.numpy.int8`, `ulab.numpy.uint8`, `ulab.numpy.int16`, `ulab.numpy.uint16`, `ulab.numpy.float` or `ulab.numpy.bool`"""
+//|
+//| from builtins import float as _float
+//| from builtins import bool as _bool
+//|
+//| int8: _DType
+//| """Type code for signed integers in the range -128 .. 127 inclusive, like the 'b' typecode of `array.array`"""
+//|
+//| int16: _DType
+//| """Type code for signed integers in the range -32768 .. 32767 inclusive, like the 'h' typecode of `array.array`"""
+//|
+//| float: _DType
+//| """Type code for floating point values, like the 'f' typecode of `array.array`"""
+//|
+//| uint8: _DType
+//| """Type code for unsigned integers in the range 0 .. 255 inclusive, like the 'B' typecode of `array.array`"""
+//|
+//| uint16: _DType
+//| """Type code for unsigned integers in the range 0 .. 65535 inclusive, like the 'H' typecode of `array.array`"""
+//|
+//| bool: _DType
+//| """Type code for boolean values"""
+//|
+
+static void numerical_reduce_axes(ndarray_obj_t *ndarray, int8_t axis, size_t *shape, int32_t *strides) {
+    // Fills shape and strides with the values of ndarray, leaving out the entries
+    // that belong to the given axis. The entries at position 0 are never written.
+
+    // reducing the only axis of a 1D array leaves a single element; the strides
+    // are not touched in this case
+    if((ndarray->ndim == 1) && (axis == 0)) {
+        shape[ULAB_MAX_DIMS - 1] = 1;
+        return;
+    }
+    // position of the axis to be removed in the right-aligned shape array
+    const uint8_t removed = ULAB_MAX_DIMS - ndarray->ndim + axis;
+    for(uint8_t target = ULAB_MAX_DIMS - 1; target > 0; target--) {
+        // entries behind the removed axis stay in place, while the removed axis, and
+        // everything in front of it is taken from one position to the left
+        const uint8_t from = (target > removed) ? target : (uint8_t)(target - 1);
+        shape[target] = ndarray->shape[from];
+        strides[target] = ndarray->strides[from];
+    }
+}
+
+#if ULAB_NUMPY_HAS_ALL | ULAB_NUMPY_HAS_ANY
+static mp_obj_t numerical_all_any(mp_obj_t oin, mp_obj_t axis, uint8_t optype) {
+    bool anytype = optype == NUMERICAL_ALL ? 1 : 0;
+    if(mp_obj_is_type(oin, &ulab_ndarray_type)) {
+        ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(oin);
+        uint8_t *array = (uint8_t *)ndarray->array;
+        if(ndarray->len == 0) { // return immediately with empty arrays
+        if(optype == NUMERICAL_ALL) {
+                return mp_const_true;
+            } else {
+                return mp_const_false;
+            }
+        }
+        // always get a float, so that we don't have to resolve the dtype later
+        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+        ndarray_obj_t *results = NULL;
+        uint8_t *rarray = NULL;
+        shape_strides _shape_strides = tools_reduce_axes(ndarray, axis);
+        if(axis != mp_const_none) {
+            results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, NDARRAY_BOOL);
+            rarray = results->array;
+            if(optype == NUMERICAL_ALL) {
+                memset(rarray, 1, results->len);
+            }
+        }
+
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    if(axis == mp_const_none) {
+                        do {
+                            #if ULAB_SUPPORTS_COMPLEX
+                            if(ndarray->dtype == NDARRAY_COMPLEX) {
+                                mp_float_t real = *((mp_float_t *)array);
+                                mp_float_t imag = *((mp_float_t *)(array + sizeof(mp_float_t)));
+                                if(((real != MICROPY_FLOAT_CONST(0.0)) | (imag != MICROPY_FLOAT_CONST(0.0))) & !anytype) {
+                                    // optype = NUMERICAL_ANY
+                                    return mp_const_true;
+                                } else if(((real == MICROPY_FLOAT_CONST(0.0)) & (imag == MICROPY_FLOAT_CONST(0.0))) & anytype) {
+                                    // optype == NUMERICAL_ALL
+                                    return mp_const_false;
+                                }
+                            } else {
+                            #endif
+                                mp_float_t value = func(array);
+                                if((value != MICROPY_FLOAT_CONST(0.0)) & !anytype) {
+                                    // optype = NUMERICAL_ANY
+                                    return mp_const_true;
+                                } else if((value == MICROPY_FLOAT_CONST(0.0)) & anytype) {
+                                    // optype == NUMERICAL_ALL
+                                    return mp_const_false;
+                                }
+                            #if ULAB_SUPPORTS_COMPLEX
+                            }
+                            #endif
+                            array += _shape_strides.strides[0];
+                            l++;
+                        } while(l < _shape_strides.shape[0]);
+                    } else { // a scalar axis keyword was supplied
+                        do {
+                            #if ULAB_SUPPORTS_COMPLEX
+                            if(ndarray->dtype == NDARRAY_COMPLEX) {
+                                mp_float_t real = *((mp_float_t *)array);
+                                mp_float_t imag = *((mp_float_t *)(array + sizeof(mp_float_t)));
+                                if(((real != MICROPY_FLOAT_CONST(0.0)) | (imag != MICROPY_FLOAT_CONST(0.0))) & !anytype) {
+                                    // optype = NUMERICAL_ANY
+                                    *rarray = 1;
+                                    // since we are breaking out of the loop, move the pointer forward
+                                    array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l);
+                                    break;
+                                } else if(((real == MICROPY_FLOAT_CONST(0.0)) & (imag == MICROPY_FLOAT_CONST(0.0))) & anytype) {
+                                    // optype == NUMERICAL_ALL
+                                    *rarray = 0;
+                                    // since we are breaking out of the loop, move the pointer forward
+                                    array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l);
+                                    break;
+                                }
+                            } else {
+                            #endif
+                                mp_float_t value = func(array);
+                                if((value != MICROPY_FLOAT_CONST(0.0)) & !anytype) {
+                                    // optype == NUMERICAL_ANY
+                                    *rarray = 1;
+                                    // since we are breaking out of the loop, move the pointer forward
+                                    array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l);
+                                    break;
+                                } else if((value == MICROPY_FLOAT_CONST(0.0)) & anytype) {
+                                    // optype == NUMERICAL_ALL
+                                    *rarray = 0;
+                                    // since we are breaking out of the loop, move the pointer forward
+                                    array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l);
+                                    break;
+                                }
+                            #if ULAB_SUPPORTS_COMPLEX
+                            }
+                            #endif
+                            array += _shape_strides.strides[0];
+                            l++;
+                        } while(l < _shape_strides.shape[0]);
+                    }
+                #if ULAB_MAX_DIMS > 1
+                    rarray += _shape_strides.increment;
+                    array -= _shape_strides.strides[0] * _shape_strides.shape[0];
+                    array += _shape_strides.strides[ULAB_MAX_DIMS - 1];
+                    k++;
+                } while(k < _shape_strides.shape[ULAB_MAX_DIMS - 1]);
+                #endif
+            #if ULAB_MAX_DIMS > 2
+                array -= _shape_strides.strides[ULAB_MAX_DIMS - 1] * _shape_strides.shape[ULAB_MAX_DIMS - 1];
+                array += _shape_strides.strides[ULAB_MAX_DIMS - 2];
+                j++;
+            } while(j < _shape_strides.shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 3
+            array -= _shape_strides.strides[ULAB_MAX_DIMS - 2] * _shape_strides.shape[ULAB_MAX_DIMS - 2];
+            array += _shape_strides.strides[ULAB_MAX_DIMS - 3];
+            i++;
+        } while(i < _shape_strides.shape[ULAB_MAX_DIMS - 3]);
+        #endif
+        if(axis == mp_const_none) {
+            // the innermost loop fell through, so return the result here
+            if(!anytype) {
+                return mp_const_false;
+            } else {
+                return mp_const_true;
+            }
+        }
+        return results;
+    } else if(mp_obj_is_int(oin) || mp_obj_is_float(oin)) {
+        return mp_obj_is_true(oin) ? mp_const_true : mp_const_false;
+    } else {
+        mp_obj_iter_buf_t iter_buf;
+        mp_obj_t item, iterable = mp_getiter(oin, &iter_buf);
+        while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
+            if(!mp_obj_is_true(item) & !anytype) {
+                return mp_const_false;
+            } else if(mp_obj_is_true(item) & anytype) {
+                return mp_const_true;
+            }
+        }
+    }
+    return anytype ? mp_const_true : mp_const_false;
+}
+#endif
+
+#if ULAB_NUMPY_HAS_SUM | ULAB_NUMPY_HAS_MEAN | ULAB_NUMPY_HAS_STD
+// Computes the sum, the mean, or the standard deviation of a generic iterable.
+//
+//   oin    - iterable; every item is converted with mp_obj_get_float
+//   optype - NUMERICAL_SUM, NUMERICAL_MEAN, anything else selects the standard deviation
+//   ddof   - delta degrees of freedom, consulted for the standard deviation only
+//
+// Always returns a new float object. The mean and the sum of squared deviations
+// are updated one sample at a time, and the sum is recovered as
+// mean * number of samples. The mean of an empty iterable, and the standard
+// deviation of no more than ddof samples, are both reported as 0.0.
+static mp_obj_t numerical_sum_mean_std_iterable(mp_obj_t oin, uint8_t optype, size_t ddof) {
+    mp_float_t mean = MICROPY_FLOAT_CONST(0.0);
+    mp_float_t sq_dev = MICROPY_FLOAT_CONST(0.0);
+    size_t n = 0;
+
+    mp_obj_iter_buf_t iter_buf;
+    mp_obj_t it = mp_getiter(oin, &iter_buf);
+    for(mp_obj_t item = mp_iternext(it); item != MP_OBJ_STOP_ITERATION; item = mp_iternext(it)) {
+        const mp_float_t sample = mp_obj_get_float(item);
+        // distance from the mean *before* this sample is folded in
+        const mp_float_t delta = sample - mean;
+        mean = mean + delta / (n + 1);
+        // the second factor uses the mean *after* the update
+        sq_dev = sq_dev + delta * (sample - mean);
+        n++;
+    }
+
+    switch(optype) {
+        case NUMERICAL_SUM:
+            return mp_obj_new_float(mean * n);
+        case NUMERICAL_MEAN:
+            if(n == 0) {
+                return mp_obj_new_float(MICROPY_FLOAT_CONST(0.0));
+            }
+            return mp_obj_new_float(mean);
+        default: // this should be the case of the standard deviation
+            if(n <= ddof) {
+                return mp_obj_new_float(MICROPY_FLOAT_CONST(0.0));
+            }
+            return mp_obj_new_float(MICROPY_FLOAT_C_FUN(sqrt)(sq_dev / (n - ddof)));
+    }
+}
+
+// Computes the sum, the mean, or the standard deviation of an ndarray.
+//
+//   ndarray - input array; complex dtypes are rejected by COMPLEX_DTYPE_NOT_IMPLEMENTED
+//   axis    - mp_const_none to reduce over every element, otherwise the axis
+//             object that is handed to tools_reduce_axes
+//   optype  - NUMERICAL_SUM, NUMERICAL_MEAN or NUMERICAL_STD
+//   ddof    - delta degrees of freedom, consulted for NUMERICAL_STD only
+//
+// With axis == None a scalar is returned: an int for the sum of a non-float
+// array, a float otherwise. With an axis, a new dense ndarray is returned (or
+// a scalar, if the result has zero dimensions); sums keep the input dtype,
+// means and standard deviations are NDARRAY_FLOAT.
+//
+// A standard deviation whose ddof is not smaller than the number of samples is
+// reported as 0.0 (a zero array in the per-axis case) instead of dividing by
+// zero.
+static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t axis, uint8_t optype, size_t ddof) {
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+    uint8_t *array = (uint8_t *)ndarray->array;
+    shape_strides _shape_strides = tools_reduce_axes(ndarray, axis);
+
+    if(axis == mp_const_none) {
+        // work with the flattened array
+        if((optype == NUMERICAL_STD) && (ddof >= ndarray->len)) {
+            // with ddof >= len the divisor (len - ddof) would be zero, or would wrap
+            // around as an unsigned value, so there is no point in calculating
+            // anything; this mirrors the per-axis branch and the iterable variant
+            return mp_obj_new_float(MICROPY_FLOAT_CONST(0.0));
+        }
+        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+        // M/S hold the running mean and the running sum of squared deviations,
+        // m/s are their freshly updated values
+        mp_float_t M = MICROPY_FLOAT_CONST(0.0);
+        mp_float_t m = MICROPY_FLOAT_CONST(0.0);
+        mp_float_t S = MICROPY_FLOAT_CONST(0.0);
+        mp_float_t s = MICROPY_FLOAT_CONST(0.0);
+        size_t count = 0;
+
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        count++;
+                        mp_float_t value = func(array);
+                        m = M + (value - M) / (mp_float_t)count;
+                        if(optype == NUMERICAL_STD) {
+                            // (value - old mean) * (value - new mean)
+                            s = S + (value - M) * (value - m);
+                            S = s;
+                        }
+                        M = m;
+                        array += _shape_strides.strides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < _shape_strides.shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    // rewind the innermost axis, then step along the next one
+                    array -= _shape_strides.strides[ULAB_MAX_DIMS - 1] * _shape_strides.shape[ULAB_MAX_DIMS - 1];
+                    array += _shape_strides.strides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < _shape_strides.shape[ULAB_MAX_DIMS - 2]);
+                #endif
+            #if ULAB_MAX_DIMS > 2
+                array -= _shape_strides.strides[ULAB_MAX_DIMS - 2] * _shape_strides.shape[ULAB_MAX_DIMS - 2];
+                array += _shape_strides.strides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < _shape_strides.shape[ULAB_MAX_DIMS - 3]);
+            #endif
+        #if ULAB_MAX_DIMS > 3
+            array -= _shape_strides.strides[ULAB_MAX_DIMS - 3] * _shape_strides.shape[ULAB_MAX_DIMS - 3];
+            array += _shape_strides.strides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < _shape_strides.shape[ULAB_MAX_DIMS - 4]);
+        #endif
+        if(optype == NUMERICAL_SUM) {
+            // numpy returns an integer for integer input types
+            if(ndarray->dtype == NDARRAY_FLOAT) {
+                return mp_obj_new_float(M * ndarray->len);
+            } else {
+                return mp_obj_new_int((int32_t)MICROPY_FLOAT_C_FUN(round)(M * ndarray->len));
+            }
+        } else if(optype == NUMERICAL_MEAN) {
+            return mp_obj_new_float(M);
+        } else { // this must be the case of the standard deviation
+            // the guard at the top of this branch made certain that ddof < ndarray->len holds
+            return mp_obj_new_float(MICROPY_FLOAT_C_FUN(sqrt)(S / (ndarray->len - ddof)));
+        }
+    } else {
+        ndarray_obj_t *results = NULL;
+        uint8_t *rarray = NULL;
+        mp_float_t *farray = NULL;
+        if(optype == NUMERICAL_SUM) {
+            results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, ndarray->dtype);
+            rarray = (uint8_t *)results->array;
+            // TODO: numpy promotes the output to the highest integer type
+            if(ndarray->dtype == NDARRAY_UINT8) {
+                RUN_SUM(uint8_t, array, results, rarray, _shape_strides);
+            } else if(ndarray->dtype == NDARRAY_INT8) {
+                RUN_SUM(int8_t, array, results, rarray, _shape_strides);
+            } else if(ndarray->dtype == NDARRAY_UINT16) {
+                RUN_SUM(uint16_t, array, results, rarray, _shape_strides);
+            } else if(ndarray->dtype == NDARRAY_INT16) {
+                RUN_SUM(int16_t, array, results, rarray, _shape_strides);
+            } else {
+                // for floats, the sum might be inaccurate with the naive summation
+                // call mean, and multiply with the number of samples
+                farray = (mp_float_t *)results->array;
+                RUN_MEAN_STD(mp_float_t, array, farray, _shape_strides, MICROPY_FLOAT_CONST(0.0), 0);
+                mp_float_t norm = (mp_float_t)_shape_strides.shape[0];
+                // re-wind the array here
+                farray = (mp_float_t *)results->array;
+                for(size_t i=0; i < results->len; i++) {
+                    *farray++ *= norm;
+                }
+            }
+        } else {
+            bool isStd = optype == NUMERICAL_STD ? 1 : 0;
+            results = ndarray_new_dense_ndarray(_shape_strides.ndim, _shape_strides.shape, NDARRAY_FLOAT);
+            farray = (mp_float_t *)results->array;
+            // we can return the 0 array here, if the degrees of freedom is larger than the length of the axis
+            if((optype == NUMERICAL_STD) && (_shape_strides.shape[0] <= ddof)) {
+                return MP_OBJ_FROM_PTR(results);
+            }
+            // the divisor is only meaningful for the standard deviation
+            mp_float_t div = optype == NUMERICAL_STD ? (mp_float_t)(_shape_strides.shape[0] - ddof) : MICROPY_FLOAT_CONST(0.0);
+            if(ndarray->dtype == NDARRAY_UINT8) {
+                RUN_MEAN_STD(uint8_t, array, farray, _shape_strides, div, isStd);
+            } else if(ndarray->dtype == NDARRAY_INT8) {
+                RUN_MEAN_STD(int8_t, array, farray, _shape_strides, div, isStd);
+            } else if(ndarray->dtype == NDARRAY_UINT16) {
+                RUN_MEAN_STD(uint16_t, array, farray, _shape_strides, div, isStd);
+            } else if(ndarray->dtype == NDARRAY_INT16) {
+                RUN_MEAN_STD(int16_t, array, farray, _shape_strides, div, isStd);
+            } else {
+                RUN_MEAN_STD(mp_float_t, array, farray, _shape_strides, div, isStd);
+            }
+        }
+        if(results->ndim == 0) { // return a scalar here
+            return mp_binary_get_val_array(results->dtype, results->array, 0);
+        }
+        return MP_OBJ_FROM_PTR(results);
+    }
+    return mp_const_none;
+}
+#endif
+
+#if ULAB_NUMPY_HAS_ARGMINMAX
+// Finds the smallest or the largest item of a generic iterable.
+//
+//   oin    - iterable; items are compared through mp_obj_get_float
+//   optype - NUMERICAL_MIN / NUMERICAL_MAX return the winning item itself,
+//            NUMERICAL_ARGMIN / NUMERICAL_ARGMAX return its position
+//
+// Raises ValueError, if the length of oin is reported as 0. On ties the first
+// occurrence wins, because only strictly better candidates replace the
+// current winner.
+static mp_obj_t numerical_argmin_argmax_iterable(mp_obj_t oin, uint8_t optype) {
+    if(MP_OBJ_SMALL_INT_VALUE(mp_obj_len_maybe(oin)) == 0) {
+        mp_raise_ValueError(translate("attempt to get argmin/argmax of an empty sequence"));
+    }
+    const bool want_max = (optype == NUMERICAL_ARGMAX) || (optype == NUMERICAL_MAX);
+    const bool want_position = (optype == NUMERICAL_ARGMIN) || (optype == NUMERICAL_ARGMAX);
+
+    mp_obj_iter_buf_t iter_buf;
+    mp_obj_t it = mp_getiter(oin, &iter_buf);
+
+    // the first item is the initial winner
+    mp_obj_t winner = mp_iternext(it);
+    mp_float_t winner_value = mp_obj_get_float(winner);
+    size_t winner_pos = 0;
+
+    size_t pos = 1;
+    for(mp_obj_t candidate = mp_iternext(it); candidate != MP_OBJ_STOP_ITERATION; candidate = mp_iternext(it)) {
+        const mp_float_t candidate_value = mp_obj_get_float(candidate);
+        const bool better = want_max ? (candidate_value > winner_value) : (candidate_value < winner_value);
+        if(better) {
+            winner = candidate;
+            winner_value = candidate_value;
+            winner_pos = pos;
+        }
+        pos++;
+    }
+
+    if(want_position) {
+        return MP_OBJ_NEW_SMALL_INT(winner_pos);
+    }
+    return winner;
+}
+
+// Finds the minimum/maximum of an ndarray, or the position of it.
+//
+//   ndarray - input array; a length of 0 raises ValueError
+//   axis    - mp_const_none to search all elements, otherwise the axis object
+//             that is handed to tools_get_axis
+//   optype  - NUMERICAL_MIN, NUMERICAL_MAX, NUMERICAL_ARGMIN or NUMERICAL_ARGMAX
+//
+// With axis == None a scalar is returned: the position in iteration order for
+// the arg* variants, otherwise the extreme value (a float for NDARRAY_FLOAT,
+// a small int for everything else). With an axis, a new dense ndarray is
+// filled by RUN_ARGMIN; a result holding exactly one element is returned as a
+// scalar.
+static mp_obj_t numerical_argmin_argmax_ndarray(ndarray_obj_t *ndarray, mp_obj_t axis, uint8_t optype) {
+    // TODO: treat the flattened array
+    if(ndarray->len == 0) {
+        mp_raise_ValueError(translate("attempt to get (arg)min/(arg)max of empty sequence"));
+    }
+
+    if(axis == mp_const_none) {
+        // work with the flattened array
+        mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+        uint8_t *array = (uint8_t *)ndarray->array;
+        // the first element is the initial candidate; the loop below visits it
+        // once more, which cannot change the outcome, because the comparisons are strict
+        mp_float_t best_value = func(array);
+        mp_float_t value;
+        // index counts the visited elements, best_index remembers the winner
+        size_t index = 0, best_index = 0;
+
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        value = func(array);
+                        // strict comparisons: on ties the first occurrence is kept
+                        if((optype == NUMERICAL_ARGMAX) || (optype == NUMERICAL_MAX)) {
+                            if(best_value < value) {
+                                best_value = value;
+                                best_index = index;
+                            }
+                        } else {
+                            if(best_value > value) {
+                                best_value = value;
+                                best_index = index;
+                            }
+                        }
+                        array += ndarray->strides[ULAB_MAX_DIMS - 1];
+                        l++;
+                        index++;
+                    } while(l < ndarray->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    // rewind the innermost axis, then step along the next one
+                    array -= ndarray->strides[ULAB_MAX_DIMS - 1] * ndarray->shape[ULAB_MAX_DIMS-1];
+                    array += ndarray->strides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < ndarray->shape[ULAB_MAX_DIMS - 2]);
+                #endif
+            #if ULAB_MAX_DIMS > 2
+                array -= ndarray->strides[ULAB_MAX_DIMS - 2] * ndarray->shape[ULAB_MAX_DIMS-2];
+                array += ndarray->strides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < ndarray->shape[ULAB_MAX_DIMS - 3]);
+            #endif
+        #if ULAB_MAX_DIMS > 3
+            array -= ndarray->strides[ULAB_MAX_DIMS - 3] * ndarray->shape[ULAB_MAX_DIMS-3];
+            array += ndarray->strides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < ndarray->shape[ULAB_MAX_DIMS - 4]);
+        #endif
+
+        if((optype == NUMERICAL_ARGMIN) || (optype == NUMERICAL_ARGMAX)) {
+            return mp_obj_new_int(best_index);
+        } else {
+            if(ndarray->dtype == NDARRAY_FLOAT) {
+                return mp_obj_new_float(best_value);
+            } else {
+                return MP_OBJ_NEW_SMALL_INT((int32_t)best_value);
+            }
+        }
+    } else {
+        int8_t ax = tools_get_axis(axis, ndarray->ndim);
+
+        uint8_t *array = (uint8_t *)ndarray->array;
+        // zero-initialised scratch buffers, which are filled in by numerical_reduce_axes
+        size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+        memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+        int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
+        memset(strides, 0, sizeof(uint32_t)*ULAB_MAX_DIMS);
+        numerical_reduce_axes(ndarray, ax, shape, strides);
+        // position of the requested axis in the ULAB_MAX_DIMS-long shape/strides arrays
+        uint8_t index = ULAB_MAX_DIMS - ndarray->ndim + ax;
+
+        ndarray_obj_t *results = NULL;
+
+        // positions are stored as NDARRAY_INT16, values keep the dtype of the input
+        // NOTE(review): presumably positions beyond the int16 range cannot be
+        // represented in the result - confirm against RUN_ARGMIN
+        if((optype == NUMERICAL_ARGMIN) || (optype == NUMERICAL_ARGMAX)) {
+            results = ndarray_new_dense_ndarray(MAX(1, ndarray->ndim-1), shape, NDARRAY_INT16);
+        } else {
+            results = ndarray_new_dense_ndarray(MAX(1, ndarray->ndim-1), shape, ndarray->dtype);
+        }
+
+        uint8_t *rarray = (uint8_t *)results->array;
+
+        // dispatch on the element type of the input
+        if(ndarray->dtype == NDARRAY_UINT8) {
+            RUN_ARGMIN(ndarray, uint8_t, array, results, rarray, shape, strides, index, optype);
+        } else if(ndarray->dtype == NDARRAY_INT8) {
+            RUN_ARGMIN(ndarray, int8_t, array, results, rarray, shape, strides, index, optype);
+        } else if(ndarray->dtype == NDARRAY_UINT16) {
+            RUN_ARGMIN(ndarray, uint16_t, array, results, rarray, shape, strides, index, optype);
+        } else if(ndarray->dtype == NDARRAY_INT16) {
+            RUN_ARGMIN(ndarray, int16_t, array, results, rarray, shape, strides, index, optype);
+        } else {
+            RUN_ARGMIN(ndarray, mp_float_t, array, results, rarray, shape, strides, index, optype);
+        }
+        // a result with a single element is handed back as a scalar
+        if(results->len == 1) {
+            return mp_binary_get_val_array(results->dtype, results->array, 0);
+        }
+        return MP_OBJ_FROM_PTR(results);
+    }
+    return mp_const_none;
+}
+#endif
+
+// Common dispatcher behind the reduction entry points of this module.
+//
+// Parses one required positional argument plus an optional `axis`, and then
+// routes the call according to optype and to the type of the input:
+//   - NUMERICAL_ALL / NUMERICAL_ANY are handed to numerical_all_any,
+//   - tuples, lists and ranges are handled by the *_iterable helpers
+//     (the axis is not passed on to these),
+//   - ndarrays are handled by the *_ndarray helpers.
+//
+// Raises TypeError, if axis is neither None, nor an integer, or if the input
+// is not one of the supported types. Raises NotImplementedError for an
+// ndarray with an optype that is not dispatched here.
+static mp_obj_t numerical_function(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args, uint8_t optype) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+    mp_arg_val_t parsed[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, parsed);
+
+    mp_obj_t oin = parsed[0].u_obj;
+    mp_obj_t axis = parsed[1].u_obj;
+    if(!((axis == mp_const_none) || mp_obj_is_int(axis))) {
+        mp_raise_TypeError(translate("axis must be None, or an integer"));
+    }
+
+    if((optype == NUMERICAL_ALL) || (optype == NUMERICAL_ANY)) {
+        return numerical_all_any(oin, axis, optype);
+    }
+
+    // classify the requested operation once, both input kinds share the grouping
+    const bool is_extremum = (optype == NUMERICAL_MIN) || (optype == NUMERICAL_ARGMIN) ||
+        (optype == NUMERICAL_MAX) || (optype == NUMERICAL_ARGMAX);
+    const bool is_sum_or_mean = (optype == NUMERICAL_SUM) || (optype == NUMERICAL_MEAN);
+    const bool is_sequence = mp_obj_is_type(oin, &mp_type_tuple) || mp_obj_is_type(oin, &mp_type_list) ||
+        mp_obj_is_type(oin, &mp_type_range);
+
+    if(is_sequence) {
+        if(is_extremum) {
+            return numerical_argmin_argmax_iterable(oin, optype);
+        }
+        if(is_sum_or_mean) {
+            return numerical_sum_mean_std_iterable(oin, optype, 0);
+        }
+        // we should never reach this point, but whatever
+        return mp_const_none;
+    } else if(mp_obj_is_type(oin, &ulab_ndarray_type)) {
+        ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(oin);
+        if(is_extremum) {
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+            return numerical_argmin_argmax_ndarray(ndarray, axis, optype);
+        }
+        if(is_sum_or_mean) {
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+            return numerical_sum_mean_std_ndarray(ndarray, axis, optype, 0);
+        }
+        mp_raise_NotImplementedError(translate("operation is not implemented on ndarrays"));
+    } else {
+        mp_raise_TypeError(translate("input must be tuple, list, range, or ndarray"));
+    }
+    return mp_const_none;
+}
+
+#if ULAB_NUMPY_HAS_SORT | NDARRAY_HAS_SORT
+// Sorts an ndarray along a single axis by means of the HEAPSORT macro.
+//
+//   oin     - must be an ndarray, otherwise TypeError is raised; complex
+//             dtypes are rejected by COMPLEX_DTYPE_NOT_IMPLEMENTED
+//   axis    - mp_const_none to sort the array as one flat sequence, otherwise
+//             the axis object that is handed to tools_get_axis
+//   inplace - 1 sorts oin itself and returns None; any other value sorts the
+//             array produced by ndarray_copy_view, and returns that
+//
+// The element type that is passed to HEAPSORT follows the dtype of the array,
+// including its signedness, so that negative int8/int16 values are ordered
+// before positive ones, and uint16 data are not treated as floats.
+static mp_obj_t numerical_sort_helper(mp_obj_t oin, mp_obj_t axis, uint8_t inplace) {
+    if(!mp_obj_is_type(oin, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("sort argument must be an ndarray"));
+    }
+
+    ndarray_obj_t *ndarray;
+    if(inplace == 1) {
+        ndarray = MP_OBJ_TO_PTR(oin);
+    } else {
+        ndarray = ndarray_copy_view(MP_OBJ_TO_PTR(oin));
+    }
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+
+    int8_t ax = 0;
+    if(axis == mp_const_none) {
+        // flatten the array
+        #if ULAB_MAX_DIMS > 1
+        for(uint8_t i=0; i < ULAB_MAX_DIMS - 1; i++) {
+            ndarray->shape[i] = 0;
+            ndarray->strides[i] = 0;
+        }
+        ndarray->shape[ULAB_MAX_DIMS - 1] = ndarray->len;
+        ndarray->strides[ULAB_MAX_DIMS - 1] = ndarray->itemsize;
+        ndarray->ndim = 1;
+        #endif
+    } else {
+        ax = tools_get_axis(axis, ndarray->ndim);
+    }
+
+    // zero-initialised scratch buffers, which are filled in by numerical_reduce_axes
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+    int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
+    memset(strides, 0, sizeof(int32_t)*ULAB_MAX_DIMS);
+    numerical_reduce_axes(ndarray, ax, shape, strides);
+    // from here on, ax is the position in the ULAB_MAX_DIMS-long shape/strides arrays
+    ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
+    // we work with the typed array, so re-scale the stride
+    int32_t increment = ndarray->strides[ax] / ndarray->itemsize;
+
+    uint8_t *array = (uint8_t *)ndarray->array;
+    // There is nothing to reorder on an axis of length zero, so the sort is
+    // skipped; HEAPSORT is not visible here, hence we do not rely on it
+    // tolerating a length of zero.
+    if(ndarray->shape[ax] > 0) {
+        if(ndarray->dtype == NDARRAY_UINT8) {
+            HEAPSORT(ndarray, uint8_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
+        } else if(ndarray->dtype == NDARRAY_INT8) {
+            HEAPSORT(ndarray, int8_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
+        } else if(ndarray->dtype == NDARRAY_UINT16) {
+            HEAPSORT(ndarray, uint16_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
+        } else if(ndarray->dtype == NDARRAY_INT16) {
+            HEAPSORT(ndarray, int16_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
+        } else {
+            HEAPSORT(ndarray, mp_float_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
+        }
+    }
+    if(inplace == 1) {
+        return mp_const_none;
+    } else {
+        return MP_OBJ_FROM_PTR(ndarray);
+    }
+}
+#endif /* ULAB_NUMPY_HAS_SORT | NDARRAY_HAS_SORT */
+
+#if ULAB_NUMPY_HAS_ALL
+// Python-facing entry point of `all`: the arguments are forwarded unchanged
+// to numerical_function with the NUMERICAL_ALL operation code, which in turn
+// hands NUMERICAL_ALL requests to numerical_all_any.
+mp_obj_t numerical_all(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    return numerical_function(n_args, pos_args, kw_args, NUMERICAL_ALL);
+}
+// function object accepting keyword arguments, with at least 1 positional argument
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_all_obj, 1, numerical_all);
+#endif
+
+#if ULAB_NUMPY_HAS_ANY
+// Python-facing entry point of `any`: the arguments are forwarded unchanged
+// to numerical_function with the NUMERICAL_ANY operation code, which in turn
+// hands NUMERICAL_ANY requests to numerical_all_any.
+mp_obj_t numerical_any(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    return numerical_function(n_args, pos_args, kw_args, NUMERICAL_ANY);
+}
+// function object accepting keyword arguments, with at least 1 positional argument
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_any_obj, 1, numerical_any);
+#endif
+
+#if ULAB_NUMPY_HAS_ARGMINMAX
+//| def argmax(array: _ArrayLike, *, axis: Optional[int] = None) -> int:
+//|     """Return the index of the maximum element of the 1D array"""
+//|     ...
+//|
+
+// Python-facing entry point of `argmax`: the arguments are forwarded unchanged
+// to numerical_function with the NUMERICAL_ARGMAX operation code. Argument
+// parsing, including the optional `axis`, happens in numerical_function.
+mp_obj_t numerical_argmax(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    return numerical_function(n_args, pos_args, kw_args, NUMERICAL_ARGMAX);
+}
+
+// function object accepting keyword arguments, with at least 1 positional argument
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_argmax_obj, 1, numerical_argmax);
+
+//| def argmin(array: _ArrayLike, *, axis: Optional[int] = None) -> int:
+//|     """Return the index of the minimum element of the 1D array"""
+//|     ...
+//|
+
+// Python-facing entry point of `argmin`: the arguments are forwarded unchanged
+// to numerical_function with the NUMERICAL_ARGMIN operation code. Argument
+// parsing, including the optional `axis`, happens in numerical_function.
+static mp_obj_t numerical_argmin(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    return numerical_function(n_args, pos_args, kw_args, NUMERICAL_ARGMIN);
+}
+
+// function object accepting keyword arguments, with at least 1 positional argument
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_argmin_obj, 1, numerical_argmin);
+#endif
+
+#if ULAB_NUMPY_HAS_ARGSORT
+//| def argsort(array: ulab.numpy.ndarray, *, axis: int = -1) -> ulab.numpy.ndarray:
+//|     """Returns an array which gives indices into the input array from least to greatest."""
+//|     ...
+//|
+
+// Returns the indices that would sort an ndarray along the given axis.
+//
+// Arguments (parsed from the Python call):
+//   positional 0 - the ndarray to inspect; anything else raises TypeError,
+//                  complex dtypes are rejected by COMPLEX_DTYPE_NOT_IMPLEMENTED
+//   axis         - keyword only; None raises NotImplementedError, any other
+//                  value is handed to tools_get_axis
+//
+// The result is a new NDARRAY_UINT16 array with the shape of the input, hence
+// ValueError is raised, if any axis is longer than 65535. The index array is
+// first filled with 0, 1, 2, ... along the sort axis, and is then permuted by
+// HEAP_ARGSORT. The element type that is passed to HEAP_ARGSORT follows the
+// dtype of the input, including its signedness, so that negative int8/int16
+// values are ordered before positive ones.
+mp_obj_t numerical_argsort(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("argsort argument must be an ndarray"));
+    }
+
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+    if(args[1].u_obj == mp_const_none) {
+        // bail out, though dense arrays could still be sorted
+        mp_raise_NotImplementedError(translate("argsort is not implemented for flattened arrays"));
+    }
+    // Since we are returning an NDARRAY_UINT16 array, bail out,
+    // if the axis is longer than what we can hold
+    for(uint8_t i=0; i < ULAB_MAX_DIMS; i++) {
+        if(ndarray->shape[i] > 65535) {
+            mp_raise_ValueError(translate("axis too long"));
+        }
+    }
+    int8_t ax = tools_get_axis(args[1].u_obj, ndarray->ndim);
+
+    // zero-initialised scratch buffers, which are filled in by numerical_reduce_axes
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+    int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
+    memset(strides, 0, sizeof(int32_t)*ULAB_MAX_DIMS);
+    numerical_reduce_axes(ndarray, ax, shape, strides);
+
+    // We could return an NDARRAY_UINT8 array, if all lengths are shorter than 256
+    ndarray_obj_t *indices = ndarray_new_ndarray(ndarray->ndim, ndarray->shape, NULL, NDARRAY_UINT16);
+    int32_t *istrides = m_new(int32_t, ULAB_MAX_DIMS);
+    memset(istrides, 0, sizeof(int32_t)*ULAB_MAX_DIMS);
+    numerical_reduce_axes(indices, ax, shape, istrides);
+    // the index array is walked through a uint16_t pointer, so convert bytes to elements
+    for(uint8_t i=0; i < ULAB_MAX_DIMS; i++) {
+        istrides[i] /= sizeof(uint16_t);
+    }
+
+    // from here on, ax is the position in the ULAB_MAX_DIMS-long shape/strides arrays
+    ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
+    // we work with the typed array, so re-scale the stride
+    int32_t increment = ndarray->strides[ax] / ndarray->itemsize;
+    uint16_t iincrement = indices->strides[ax] / sizeof(uint16_t);
+
+    uint8_t *array = (uint8_t *)ndarray->array;
+    uint16_t *iarray = (uint16_t *)indices->array;
+
+    // fill in the index values
+    #if ULAB_MAX_DIMS > 3
+    size_t j = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t k = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t l = 0;
+            do {
+            #endif
+            uint16_t m = 0;
+                do {
+                    *iarray = m++;
+                    iarray += iincrement;
+                } while(m < indices->shape[ax]);
+            #if ULAB_MAX_DIMS > 1
+                // rewind the sort axis, then step along the remaining axes
+                iarray -= iincrement * indices->shape[ax];
+                iarray += istrides[ULAB_MAX_DIMS - 1];
+                l++;
+            } while(l < shape[ULAB_MAX_DIMS - 1]);
+            iarray -= istrides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS - 1];
+            iarray += istrides[ULAB_MAX_DIMS - 2];
+            #endif
+        #if ULAB_MAX_DIMS > 2
+            k++;
+        } while(k < shape[ULAB_MAX_DIMS - 2]);
+        iarray -= istrides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS - 2];
+        iarray += istrides[ULAB_MAX_DIMS - 3];
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        j++;
+    } while(j < shape[ULAB_MAX_DIMS - 3]);
+    #endif
+    // reset the array
+    iarray = indices->array;
+
+    // the values must be compared with their own signedness, otherwise
+    // negative numbers would be ordered after the positive ones
+    if(ndarray->dtype == NDARRAY_UINT8) {
+        HEAP_ARGSORT(ndarray, uint8_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
+    } else if(ndarray->dtype == NDARRAY_INT8) {
+        HEAP_ARGSORT(ndarray, int8_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
+    } else if(ndarray->dtype == NDARRAY_UINT16) {
+        HEAP_ARGSORT(ndarray, uint16_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
+    } else if(ndarray->dtype == NDARRAY_INT16) {
+        HEAP_ARGSORT(ndarray, int16_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
+    } else {
+        HEAP_ARGSORT(ndarray, mp_float_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
+    }
+    return MP_OBJ_FROM_PTR(indices);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_argsort_obj, 1, numerical_argsort);
+#endif
+
+#if ULAB_NUMPY_HAS_CROSS
+//| def cross(a: ulab.numpy.ndarray, b: ulab.numpy.ndarray) -> ulab.numpy.ndarray:
+//|     """Return the cross product of two vectors of length 3"""
+//|     ...
+//|
+
+// Returns the cross product of two 1D ndarrays of length 3.
+//
+//   _a, _b - both must be ndarrays (TypeError otherwise), both one-dimensional
+//            with exactly 3 elements (ValueError otherwise); complex dtypes
+//            are rejected by COMPLEX_DTYPE_NOT_IMPLEMENTED
+//
+// The product is calculated in floating point, and is then stored in a new
+// linear array, whose dtype follows the upcasting rules listed below.
+// The elements are read through the strides of the inputs, so that views
+// (e.g., slices with a step) are handled in the same way as dense arrays.
+static mp_obj_t numerical_cross(mp_obj_t _a, mp_obj_t _b) {
+    if (!mp_obj_is_type(_a, &ulab_ndarray_type) || !mp_obj_is_type(_b, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("arguments must be ndarrays"));
+    }
+    ndarray_obj_t *a = MP_OBJ_TO_PTR(_a);
+    ndarray_obj_t *b = MP_OBJ_TO_PTR(_b);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(a->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(b->dtype)
+    if((a->ndim != 1) || (b->ndim != 1) || (a->len != b->len) || (a->len != 3)) {
+        mp_raise_ValueError(translate("cross is defined for 1D arrays of length 3"));
+    }
+
+    // fetch the three components of each operand as floats; the stride of a
+    // 1D array is kept in the last slot of the strides array, in bytes
+    mp_float_t (*get_a)(void *) = ndarray_get_float_function(a->dtype);
+    mp_float_t (*get_b)(void *) = ndarray_get_float_function(b->dtype);
+    uint8_t *a_bytes = (uint8_t *)a->array;
+    uint8_t *b_bytes = (uint8_t *)b->array;
+    mp_float_t av[3], bv[3];
+    for(uint8_t i=0; i < 3; i++) {
+        av[i] = get_a(a_bytes + i * a->strides[ULAB_MAX_DIMS - 1]);
+        bv[i] = get_b(b_bytes + i * b->strides[ULAB_MAX_DIMS - 1]);
+    }
+
+    // three values fit on the stack comfortably, no heap allocation is needed
+    mp_float_t results[3];
+    results[0] = av[1] * bv[2] - av[2] * bv[1];
+    results[1] = av[2] * bv[0] - av[0] * bv[2];
+    results[2] = av[0] * bv[1] - av[1] * bv[0];
+
+    /* The upcasting happens here with the rules
+
+        - if one of the operands is a float, the result is always float
+        - operation on identical types preserves type
+
+        uint8 + int8 => int16
+        uint8 + int16 => int16
+        uint8 + uint16 => uint16
+        int8 + int16 => int16
+        int8 + uint16 => uint16
+        uint16 + int16 => float
+
+    */
+
+    uint8_t dtype = NDARRAY_FLOAT;
+    if(a->dtype == b->dtype) {
+        dtype = a->dtype;
+    } else if(((a->dtype == NDARRAY_UINT8) && (b->dtype == NDARRAY_INT8)) || ((a->dtype == NDARRAY_INT8) && (b->dtype == NDARRAY_UINT8))) {
+        dtype = NDARRAY_INT16;
+    } else if(((a->dtype == NDARRAY_UINT8) && (b->dtype == NDARRAY_INT16)) || ((a->dtype == NDARRAY_INT16) && (b->dtype == NDARRAY_UINT8))) {
+        dtype = NDARRAY_INT16;
+    } else if(((a->dtype == NDARRAY_UINT8) && (b->dtype == NDARRAY_UINT16)) || ((a->dtype == NDARRAY_UINT16) && (b->dtype == NDARRAY_UINT8))) {
+        dtype = NDARRAY_UINT16;
+    } else if(((a->dtype == NDARRAY_INT8) && (b->dtype == NDARRAY_INT16)) || ((a->dtype == NDARRAY_INT16) && (b->dtype == NDARRAY_INT8))) {
+        dtype = NDARRAY_INT16;
+    } else if(((a->dtype == NDARRAY_INT8) && (b->dtype == NDARRAY_UINT16)) || ((a->dtype == NDARRAY_UINT16) && (b->dtype == NDARRAY_INT8))) {
+        dtype = NDARRAY_UINT16;
+    }
+
+    // Converting a float straight to a narrow integer type is undefined in C,
+    // if the value does not fit (a negative component stored in an unsigned
+    // array is enough for that). With integer inputs of at most 16 bits, the
+    // components always fit into an int64_t, so convert to that first, and
+    // narrow the integer afterwards.
+    ndarray_obj_t *ndarray = ndarray_new_linear_array(3, dtype);
+    if(dtype == NDARRAY_UINT8) {
+        uint8_t *array = (uint8_t *)ndarray->array;
+        for(uint8_t i=0; i < 3; i++) array[i] = (uint8_t)(int64_t)results[i];
+    } else if(dtype == NDARRAY_INT8) {
+        int8_t *array = (int8_t *)ndarray->array;
+        for(uint8_t i=0; i < 3; i++) array[i] = (int8_t)(int64_t)results[i];
+    } else if(dtype == NDARRAY_UINT16) {
+        uint16_t *array = (uint16_t *)ndarray->array;
+        for(uint8_t i=0; i < 3; i++) array[i] = (uint16_t)(int64_t)results[i];
+    } else if(dtype == NDARRAY_INT16) {
+        int16_t *array = (int16_t *)ndarray->array;
+        for(uint8_t i=0; i < 3; i++) array[i] = (int16_t)(int64_t)results[i];
+    } else {
+        mp_float_t *array = (mp_float_t *)ndarray->array;
+        for(uint8_t i=0; i < 3; i++) array[i] = results[i];
+    }
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(numerical_cross_obj, numerical_cross);
+
+#endif /* ULAB_NUMERICAL_HAS_CROSS */
+
+#if ULAB_NUMPY_HAS_DIFF
+//| def diff(array: ulab.numpy.ndarray, *, n: int = 1, axis: int = -1) -> ulab.numpy.ndarray:
+//|     """Return the numerical derivative of successive elements of the array, as
+//|        an array.  axis=None is not supported."""
+//|     ...
+//|
+
+// diff(array, *, n=1, axis=-1)
+//
+// Returns a new ndarray of the same dtype holding the n-th discrete difference
+// of `array` along `axis`; the result is n elements shorter along that axis.
+// Raises TypeError if the first argument is not an ndarray, and ValueError if
+// the axis is out of range, n is outside 0..9, or n exceeds the axis length.
+mp_obj_t numerical_diff(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        { MP_QSTR_n, MP_ARG_KW_ONLY | MP_ARG_INT, {.u_int = 1 } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_INT, {.u_int = -1 } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("diff argument must be an ndarray"));
+    }
+
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+
+    // validate the axis at full integer width, so that values such as 256
+    // cannot wrap into the valid range when narrowed to int8_t
+    mp_int_t axis = args[2].u_int;
+    if(axis < 0) axis += ndarray->ndim;
+
+    if((axis < 0) || (axis > ndarray->ndim - 1)) {
+        mp_raise_ValueError(translate("index out of range"));
+    }
+    int8_t ax = (int8_t)axis;
+
+    if((args[1].u_int < 0) || (args[1].u_int > 9)) {
+        mp_raise_ValueError(translate("differentiation order out of range"));
+    }
+    uint8_t N = (uint8_t)args[1].u_int;
+    // position of the requested axis in the right-aligned shape/strides arrays
+    uint8_t index = ULAB_MAX_DIMS - ndarray->ndim + ax;
+    if(N > ndarray->shape[index]) {
+        mp_raise_ValueError(translate("differentiation order out of range"));
+    }
+
+    // RUN_DIFF subtracts stencil[d] * a[k+d] from the accumulator, so the
+    // stencil must hold the negated coefficients of the forward difference,
+    // i.e. stencil[d] = (-1)^(N-d+1) * binomial(N, d). The leading term is
+    // therefore +1 for odd orders and -1 for even orders (including N == 0);
+    // the alternating recurrence below generates the remaining terms. The
+    // largest magnitude for N <= 9 is 126, which fits into int8_t.
+    int8_t *stencil = m_new(int8_t, N+1);
+    stencil[0] = (N & 0x01) ? 1 : -1;
+    for(uint8_t i=1; i < N+1; i++) {
+        stencil[i] = -stencil[i-1]*(N-i+1)/i;
+    }
+
+    // the result has the shape of the input, shortened by N along the axis
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+    for(uint8_t i=0; i < ULAB_MAX_DIMS; i++) {
+        shape[i] = ndarray->shape[i];
+        if(i == index) {
+            shape[i] -= N;
+        }
+    }
+    uint8_t *array = (uint8_t *)ndarray->array;
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim, shape, ndarray->dtype);
+    uint8_t *rarray = (uint8_t *)results->array;
+
+    memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+    int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
+    memset(strides, 0, sizeof(int32_t)*ULAB_MAX_DIMS);
+    numerical_reduce_axes(ndarray, ax, shape, strides);
+
+    // The RUN_DIFF loops are do-while loops and execute at least once, so they
+    // must not be entered when the result holds no elements (e.g. N equal to
+    // the axis length): they would read and write past the buffers.
+    if(results->len > 0) {
+        if(ndarray->dtype == NDARRAY_UINT8) {
+            RUN_DIFF(ndarray, uint8_t, array, results, rarray, shape, strides, index, stencil, N);
+        } else if(ndarray->dtype == NDARRAY_INT8) {
+            RUN_DIFF(ndarray, int8_t, array, results, rarray, shape, strides, index, stencil, N);
+        } else if(ndarray->dtype == NDARRAY_UINT16) {
+            RUN_DIFF(ndarray, uint16_t, array, results, rarray, shape, strides, index, stencil, N);
+        } else if(ndarray->dtype == NDARRAY_INT16) {
+            RUN_DIFF(ndarray, int16_t, array, results, rarray, shape, strides, index, stencil, N);
+        } else {
+            RUN_DIFF(ndarray, mp_float_t, array, results, rarray, shape, strides, index, stencil, N);
+        }
+    }
+    m_del(int8_t, stencil, N+1);
+    m_del(size_t, shape, ULAB_MAX_DIMS);
+    m_del(int32_t, strides, ULAB_MAX_DIMS);
+    return MP_OBJ_FROM_PTR(results);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_diff_obj, 1, numerical_diff);
+#endif
+
+#if ULAB_NUMPY_HAS_FLIP
+//| def flip(array: ulab.numpy.ndarray, *, axis: Optional[int] = None) -> ulab.numpy.ndarray:
+//|     """Returns a new array that reverses the order of the elements along the
+//|        given axis, or along all axes if axis is None."""
+//|     ...
+//|
+
+// flip(array, *, axis=None)
+//
+// With axis=None a new linear array of the same length and dtype is created,
+// filled through ndarray_copy_array, and then set up to be traversed
+// backwards. With an integer axis, an object created by ndarray_new_view is
+// returned whose stride along that axis is negated.
+// Raises TypeError if the first argument is not an ndarray, or if axis is
+// neither None nor an integer.
+mp_obj_t numerical_flip(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("flip argument must be an ndarray"));
+    }
+
+    ndarray_obj_t *results = NULL;
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
+    if(args[1].u_obj == mp_const_none) { // flip the flattened array
+        results = ndarray_new_linear_array(ndarray->len, ndarray->dtype);
+        ndarray_copy_array(ndarray, results, 0);
+        // start at the last element, and walk backwards with a negative stride
+        uint8_t *rarray = (uint8_t *)results->array;
+        rarray += (results->len - 1) * results->itemsize;
+        results->array = rarray;
+        results->strides[ULAB_MAX_DIMS - 1] = -results->strides[ULAB_MAX_DIMS - 1];
+    } else if(mp_obj_is_int(args[1].u_obj)){
+        int8_t ax = tools_get_axis(args[1].u_obj, ndarray->ndim);
+
+        // convert the axis into a position in the right-aligned shape/strides arrays
+        ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
+        // byte offset of the last element along the flipped axis
+        int32_t offset = (ndarray->shape[ax] - 1) * ndarray->strides[ax];
+        results = ndarray_new_view(ndarray, ndarray->ndim, ndarray->shape, ndarray->strides, offset);
+        results->strides[ax] = -results->strides[ax];
+    } else {
+        mp_raise_TypeError(translate("wrong axis index"));
+    }
+    // convert explicitly: mp_obj_t is not a plain pointer in every
+    // MicroPython object representation
+    return MP_OBJ_FROM_PTR(results);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_flip_obj, 1, numerical_flip);
+#endif
+
+#if ULAB_NUMPY_HAS_MINMAX
+//| def max(array: _ArrayLike, *, axis: Optional[int] = None) -> _float:
+//|     """Return the maximum element of the 1D array"""
+//|     ...
+//|
+
+// Python binding for max(): forwards the arguments unchanged to
+// numerical_function, selecting the NUMERICAL_MAX operation.
+mp_obj_t numerical_max(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    return numerical_function(n_args, pos_args, kw_args, NUMERICAL_MAX);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_max_obj, 1, numerical_max);
+#endif
+
+#if ULAB_NUMPY_HAS_MEAN
+//| def mean(array: _ArrayLike, *, axis: Optional[int] = None) -> _float:
+//|     """Return the mean of the array, as a number if axis is None, otherwise as an array."""
+//|     ...
+//|
+
+// Python binding for mean(): forwards the arguments unchanged to
+// numerical_function, selecting the NUMERICAL_MEAN operation.
+mp_obj_t numerical_mean(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    return numerical_function(n_args, pos_args, kw_args, NUMERICAL_MEAN);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_mean_obj, 1, numerical_mean);
+#endif
+
+#if ULAB_NUMPY_HAS_MEDIAN
+//| def median(array: ulab.numpy.ndarray, *, axis: Optional[int] = None) -> Union[_float, ulab.numpy.ndarray]:
+//|     """Find the median value in an array along the given axis, or along all axes if axis is None."""
+//|     ...
+//|
+
+// median(array, *, axis=None)
+//
+// Returns NaN (as a float) for an empty array. Otherwise the input is passed
+// to numerical_sort_helper, and the median is read off the returned array:
+// - axis=None, or a one-dimensional array: a single float is returned;
+// - integer axis: a float ndarray with one dimension less is returned,
+//   holding the median of every lane along that axis.
+// For an even number of elements, the two middle values are averaged.
+// Raises TypeError if the first argument is not an ndarray.
+mp_obj_t numerical_median(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("median argument must be an ndarray"));
+    }
+
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
+    if(ndarray->len == 0) {
+        return mp_obj_new_float(MICROPY_FLOAT_C_FUN(nan)(""));
+    }
+
+    // the helper's result is used as an object elsewhere in this file, so it
+    // is converted explicitly before being dereferenced
+    ndarray = MP_OBJ_TO_PTR(numerical_sort_helper(args[0].u_obj, args[1].u_obj, 0));
+
+    if((args[1].u_obj == mp_const_none) || (ndarray->ndim == 1)) {
+        // at this point, the array holding the sorted values should be flat
+        uint8_t *array = (uint8_t *)ndarray->array;
+        size_t len = ndarray->len;
+        // upper middle element
+        array += (len >> 1) * ndarray->itemsize;
+        mp_float_t median = ndarray_get_float_value(array, ndarray->dtype);
+        if(!(len & 0x01)) { // len is an even number
+            // average with the lower middle element
+            array -= ndarray->itemsize;
+            median += ndarray_get_float_value(array, ndarray->dtype);
+            median *= MICROPY_FLOAT_CONST(0.5);
+        }
+        return mp_obj_new_float(median);
+    }
+
+    int8_t ax = tools_get_axis(args[1].u_obj, ndarray->ndim);
+
+    // shape and strides of the axes that remain after removing `ax`
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+    int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
+    memset(strides, 0, sizeof(int32_t)*ULAB_MAX_DIMS);
+    numerical_reduce_axes(ndarray, ax, shape, strides);
+    // convert the axis into a position in the right-aligned shape/strides arrays
+    ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim-1, shape, NDARRAY_FLOAT);
+    mp_float_t *rarray = (mp_float_t *)results->array;
+
+    uint8_t *array = (uint8_t *)ndarray->array;
+
+    size_t len = ndarray->shape[ax];
+
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            size_t k = 0;
+            do {
+                // step to the upper middle element of the current lane
+                array += ndarray->strides[ax] * (len >> 1);
+                mp_float_t median = ndarray_get_float_value(array, ndarray->dtype);
+                if(!(len & 0x01)) { // len is an even number
+                    array -= ndarray->strides[ax];
+                    median += ndarray_get_float_value(array, ndarray->dtype);
+                    median *= MICROPY_FLOAT_CONST(0.5);
+                    array += ndarray->strides[ax];
+                }
+                // return to the beginning of the lane, then move on to the next one
+                array -= ndarray->strides[ax] * (len >> 1);
+                array += strides[ULAB_MAX_DIMS - 1];
+                *rarray = median;
+                rarray++;
+                k++;
+            } while(k < shape[ULAB_MAX_DIMS - 1]);
+        #if ULAB_MAX_DIMS > 2
+            array -= strides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS - 1];
+            array += strides[ULAB_MAX_DIMS - 2];
+            j++;
+        } while(j < shape[ULAB_MAX_DIMS - 2]);
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        array -= strides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS-2];
+        array += strides[ULAB_MAX_DIMS - 3];
+        i++;
+    } while(i < shape[ULAB_MAX_DIMS - 3]);
+    #endif
+
+    // release the scratch buffers, as numerical_diff does
+    m_del(size_t, shape, ULAB_MAX_DIMS);
+    m_del(int32_t, strides, ULAB_MAX_DIMS);
+    return MP_OBJ_FROM_PTR(results);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_median_obj, 1, numerical_median);
+#endif
+
+#if ULAB_NUMPY_HAS_MINMAX
+//| def min(array: _ArrayLike, *, axis: Optional[int] = None) -> _float:
+//|     """Return the minimum element of the 1D array"""
+//|     ...
+//|
+
+// Python binding for min(): forwards the arguments unchanged to
+// numerical_function, selecting the NUMERICAL_MIN operation.
+mp_obj_t numerical_min(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    return numerical_function(n_args, pos_args, kw_args, NUMERICAL_MIN);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_min_obj, 1, numerical_min);
+#endif
+
+#if ULAB_NUMPY_HAS_ROLL
+//| def roll(array: ulab.numpy.ndarray, distance: int, *, axis: Optional[int] = None) -> ulab.numpy.ndarray:
+//|     """Shift the content of a vector by the positions given as the second
+//|        argument. If the ``axis`` keyword is supplied, the shift is applied to
+//|        the given axis.  A new array is returned; the input array is not modified."""
+//|     ...
+//|
+
+// Maps a signed roll distance onto the equivalent shift to the right in the
+// range [0, len). `len` must not be zero.
+static size_t numerical_roll_offset(mp_int_t shift, size_t len) {
+    mp_int_t offset = shift % (mp_int_t)len;
+    if(offset < 0) {
+        // the remainder carries the sign of the dividend: a shift to the left
+        // by s is the same as a shift to the right by len - s
+        offset += (mp_int_t)len;
+    }
+    return (size_t)offset;
+}
+
+// roll(array, distance, *, axis=None)
+//
+// Returns a new dense ndarray with the shape and dtype of `array`, whose
+// elements are cyclically shifted by `distance` positions; the input is only
+// read. With axis=None the shift is applied to the elements in flattened
+// order, otherwise to every lane along the given axis. Positive distances
+// shift to the right, negative ones to the left, and any multiple of the
+// length (including zero) leaves the order unchanged.
+// Raises TypeError if the first argument is not an ndarray, or if axis is
+// neither None nor an integer.
+mp_obj_t numerical_roll(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none  } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("roll argument must be an ndarray"));
+    }
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
+    uint8_t *array = ndarray->array;
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim, ndarray->shape, ndarray->dtype);
+
+    // keep the full integer width; the distance is reduced modulo the length below
+    mp_int_t shift = mp_obj_get_int(args[1].u_obj);
+
+    // number of elements that can still be written before the write pointer
+    // has to wrap around to the beginning
+    size_t counter;
+    uint8_t *rarray = (uint8_t *)results->array;
+
+    if(args[2].u_obj == mp_const_none) { // roll the flattened array
+        if(results->len == 0) {
+            // nothing to move; this also avoids a modulo by zero, and the
+            // do-while loops below, which always execute at least once
+            return MP_OBJ_FROM_PTR(results);
+        }
+        size_t offset = numerical_roll_offset(shift, results->len);
+        // the first source element lands `offset` positions into the result
+        rarray += offset * results->itemsize;
+        counter = results->len - offset;
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        memcpy(rarray, array, ndarray->itemsize);
+                        rarray += results->itemsize;
+                        array += ndarray->strides[ULAB_MAX_DIMS - 1];
+                        l++;
+                        if(--counter == 0) {
+                            // reached the end of the result: wrap around
+                            rarray = results->array;
+                        }
+                    } while(l < ndarray->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    array -= ndarray->strides[ULAB_MAX_DIMS - 1] * ndarray->shape[ULAB_MAX_DIMS-1];
+                    array += ndarray->strides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < ndarray->shape[ULAB_MAX_DIMS - 2]);
+                #endif
+            #if ULAB_MAX_DIMS > 2
+                array -= ndarray->strides[ULAB_MAX_DIMS - 2] * ndarray->shape[ULAB_MAX_DIMS-2];
+                array += ndarray->strides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < ndarray->shape[ULAB_MAX_DIMS - 3]);
+            #endif
+        #if ULAB_MAX_DIMS > 3
+            array -= ndarray->strides[ULAB_MAX_DIMS - 3] * ndarray->shape[ULAB_MAX_DIMS-3];
+            array += ndarray->strides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < ndarray->shape[ULAB_MAX_DIMS - 4]);
+        #endif
+    } else if(mp_obj_is_int(args[2].u_obj)){
+        int8_t ax = tools_get_axis(args[2].u_obj, ndarray->ndim);
+
+        if(results->len == 0) {
+            // see the comment in the flattened branch
+            return MP_OBJ_FROM_PTR(results);
+        }
+
+        // shape and strides of the remaining axes, for the source and for the result
+        size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+        memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+        int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
+        memset(strides, 0, sizeof(int32_t)*ULAB_MAX_DIMS);
+        numerical_reduce_axes(ndarray, ax, shape, strides);
+
+        size_t *rshape = m_new(size_t, ULAB_MAX_DIMS);
+        memset(rshape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+        int32_t *rstrides = m_new(int32_t, ULAB_MAX_DIMS);
+        memset(rstrides, 0, sizeof(int32_t)*ULAB_MAX_DIMS);
+        numerical_reduce_axes(results, ax, rshape, rstrides);
+
+        // convert the axis into a position in the right-aligned shape/strides arrays
+        ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
+        uint8_t *_rarray;
+        size_t offset = numerical_roll_offset(shift, results->shape[ax]);
+
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    // remember the beginning of the lane for the wrap-around
+                    _rarray = rarray;
+                    rarray += (int32_t)offset * results->strides[ax];
+                    counter = results->shape[ax] - offset;
+                    do {
+                        memcpy(rarray, array, ndarray->itemsize);
+                        array += ndarray->strides[ax];
+                        rarray += results->strides[ax];
+                        if(--counter == 0) {
+                            rarray = _rarray;
+                        }
+                        l++;
+                    } while(l < ndarray->shape[ax]);
+                #if ULAB_MAX_DIMS > 1
+                    rarray = _rarray;
+                    rarray += rstrides[ULAB_MAX_DIMS - 1];
+                    array -= ndarray->strides[ax] * ndarray->shape[ax];
+                    array += strides[ULAB_MAX_DIMS - 1];
+                    k++;
+                } while(k < shape[ULAB_MAX_DIMS - 1]);
+                #endif
+            #if ULAB_MAX_DIMS > 2
+                rarray -= rstrides[ULAB_MAX_DIMS - 1] * rshape[ULAB_MAX_DIMS-1];
+                rarray += rstrides[ULAB_MAX_DIMS - 2];
+                array -= strides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS-1];
+                array += strides[ULAB_MAX_DIMS - 2];
+                j++;
+            } while(j < shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 3
+            rarray -= rstrides[ULAB_MAX_DIMS - 2] * rshape[ULAB_MAX_DIMS-2];
+            rarray += rstrides[ULAB_MAX_DIMS - 3];
+            array -= strides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS-2];
+            array += strides[ULAB_MAX_DIMS - 3];
+            i++;
+        } while(i < shape[ULAB_MAX_DIMS - 3]);
+        #endif
+
+        // release the scratch buffers, as numerical_diff does
+        m_del(size_t, shape, ULAB_MAX_DIMS);
+        m_del(int32_t, strides, ULAB_MAX_DIMS);
+        m_del(size_t, rshape, ULAB_MAX_DIMS);
+        m_del(int32_t, rstrides, ULAB_MAX_DIMS);
+    } else {
+        mp_raise_TypeError(translate("wrong axis index"));
+    }
+    // convert explicitly: mp_obj_t is not a plain pointer in every
+    // MicroPython object representation
+    return MP_OBJ_FROM_PTR(results);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_roll_obj, 2, numerical_roll);
+#endif
+
+#if ULAB_NUMPY_HAS_SORT
+//| def sort(array: ulab.numpy.ndarray, *, axis: int = -1) -> ulab.numpy.ndarray:
+//|     """Sort the array along the given axis, or along all axes if axis is None.
+//|        The array is modified in place."""
+//|     ...
+//|
+
+// sort(array, *, axis=None)
+//
+// Parses the arguments, and delegates the work to numerical_sort_helper with
+// the in-place flag cleared; whatever the helper produces is returned as is.
+mp_obj_t numerical_sort(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    // positions of the parsed arguments in the table below
+    enum { SORT_ARG_ARRAY = 0, SORT_ARG_AXIS = 1 };
+
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+
+    mp_arg_val_t parsed[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, parsed);
+
+    mp_obj_t source = parsed[SORT_ARG_ARRAY].u_obj;
+    mp_obj_t axis = parsed[SORT_ARG_AXIS].u_obj;
+
+    return numerical_sort_helper(source, axis, 0);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_sort_obj, 1, numerical_sort);
+#endif
+
+#if NDARRAY_HAS_SORT
+// method of an ndarray
+// ndarray.sort(*, axis=-1): method of an ndarray
+//
+// Parses the arguments, and delegates the work to numerical_sort_helper with
+// the in-place flag set; whatever the helper produces is returned as is.
+static mp_obj_t numerical_sort_inplace(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        // The argument is parsed as an object, hence the default has to be an
+        // object, too. Initialising the union through .u_int = -1 reads back
+        // as the small integer -1 only in those object representations, where
+        // the two encodings happen to share the same bit pattern.
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = MP_ROM_INT(-1) } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    return numerical_sort_helper(args[0].u_obj, args[1].u_obj, 1);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_sort_inplace_obj, 1, numerical_sort_inplace);
+#endif /* NDARRAY_HAS_SORT */
+
+#if ULAB_NUMPY_HAS_STD
+//| def std(array: _ArrayLike, *, axis: Optional[int] = None, ddof: int = 0) -> _float:
+//|     """Return the standard deviation of the array, as a number if axis is None, otherwise as an array."""
+//|     ...
+//|
+
+// std(array, axis=None, *, ddof=0)
+//
+// Validates the axis (None, 0, or 1), and then dispatches on the type of the
+// input: ndarrays go to numerical_sum_mean_std_ndarray, while tuples, lists,
+// and ranges go to numerical_sum_mean_std_iterable, in both cases with the
+// NUMERICAL_STD selector. Any other input type raises a TypeError.
+mp_obj_t numerical_std(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none } } ,
+        { MP_QSTR_axis, MP_ARG_OBJ, {.u_rom_obj = mp_const_none } },
+        { MP_QSTR_ddof, MP_ARG_KW_ONLY | MP_ARG_INT, {.u_int = 0} },
+    };
+
+    mp_arg_val_t parsed[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, parsed);
+
+    mp_obj_t source = parsed[0].u_obj;
+    mp_obj_t axis = parsed[1].u_obj;
+    size_t ddof = parsed[2].u_int;
+
+    if(axis != mp_const_none) {
+        // this seems to pass with False, and True...
+        mp_int_t requested = mp_obj_get_int(axis);
+        if((requested != 0) && (requested != 1)) {
+            mp_raise_ValueError(translate("axis must be None, or an integer"));
+        }
+    }
+
+    if(mp_obj_is_type(source, &ulab_ndarray_type)) {
+        ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(source);
+        return numerical_sum_mean_std_ndarray(ndarray, axis, NUMERICAL_STD, ddof);
+    } else if(mp_obj_is_type(source, &mp_type_tuple) || mp_obj_is_type(source, &mp_type_list) || mp_obj_is_type(source, &mp_type_range)) {
+        return numerical_sum_mean_std_iterable(source, NUMERICAL_STD, ddof);
+    } else {
+        mp_raise_TypeError(translate("input must be tuple, list, range, or ndarray"));
+    }
+    return mp_const_none;
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_std_obj, 1, numerical_std);
+#endif
+
+#if ULAB_NUMPY_HAS_SUM
+//| def sum(array: _ArrayLike, *, axis: Optional[int] = None) -> Union[_float, int, ulab.numpy.ndarray]:
+//|     """Return the sum of the array, as a number if axis is None, otherwise as an array."""
+//|     ...
+//|
+
+// Python binding for sum(): forwards the arguments unchanged to
+// numerical_function, selecting the NUMERICAL_SUM operation.
+mp_obj_t numerical_sum(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    return numerical_function(n_args, pos_args, kw_args, NUMERICAL_SUM);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(numerical_sum_obj, 1, numerical_sum);
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/numerical.h b/circuitpython/extmod/ulab/code/numpy/numerical.h
new file mode 100644
index 0000000..8d2971c
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/numerical.h
@@ -0,0 +1,652 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+*/
+
+#ifndef _NUMERICAL_
+#define _NUMERICAL_
+
+#include "../ulab.h"
+#include "../ndarray.h"
+
+// TODO: implement cumsum
+
+// Scans (ndarray)->shape[index] elements of the given type, starting at
+// `array` and stepping by (ndarray)->strides[index], while keeping track of
+// the extreme value and its position: the largest value for NUMERICAL_MAX and
+// NUMERICAL_ARGMAX, the smallest for every other `op`. Because the comparisons
+// are strict, the first occurrence wins in the case of ties.
+// For the ARG* operations the position, otherwise the value is copied to
+// `rarray` ((results)->itemsize bytes), and `rarray` is advanced by one item.
+// `array` is left pointing past the scanned lane; callers must rewind it.
+// The position and the loop counter are uint16_t.
+// NOTE(review): the memcpy reads (results)->itemsize bytes from a uint16_t or
+// from a `type` variable - confirm that the result dtype never has larger items.
+#define RUN_ARGMIN1(ndarray, type, array, results, rarray, index, op)\
+({\
+    uint16_t best_index = 0;\
+    type best_value = *((type *)(array));\
+    if(((op) == NUMERICAL_MAX) || ((op) == NUMERICAL_ARGMAX)) {\
+        for(uint16_t i=0; i < (ndarray)->shape[(index)]; i++) {\
+            if(*((type *)(array)) > best_value) {\
+                best_index = i;\
+                best_value = *((type *)(array));\
+            }\
+            (array) += (ndarray)->strides[(index)];\
+        }\
+    } else {\
+        for(uint16_t i=0; i < (ndarray)->shape[(index)]; i++) {\
+            if(*((type *)(array)) < best_value) {\
+                best_index = i;\
+                best_value = *((type *)(array));\
+            }\
+            (array) += (ndarray)->strides[(index)];\
+        }\
+    }\
+    if(((op) == NUMERICAL_ARGMAX) || ((op) == NUMERICAL_ARGMIN)) {\
+        memcpy((rarray), &best_index, (results)->itemsize);\
+    } else {\
+        memcpy((rarray), &best_value, (results)->itemsize);\
+    }\
+    (rarray) += (results)->itemsize;\
+})
+
+// Adds up (ss).shape[0] elements of the given type, starting at `array` and
+// stepping by (ss).strides[0]. The accumulator has the element type itself.
+// The sum is copied to `rarray` ((results)->itemsize bytes), and `rarray` is
+// advanced by one item. `array` is left pointing past the lane; callers must
+// rewind it.
+#define RUN_SUM1(type, array, results, rarray, ss)\
+({\
+    type sum = 0;\
+    for(size_t i=0; i < (ss).shape[0]; i++) {\
+        sum += *((type *)(array));\
+        (array) += (ss).strides[0];\
+    }\
+    memcpy((rarray), &sum, (results)->itemsize);\
+    (rarray) += (results)->itemsize;\
+})
+
+// The mean could be calculated by simply dividing the sum by
+// the number of elements, but that method is numerically unstable.
+// Instead, a running mean is updated with every element:
+// M_i = M_(i-1) + (x_i - M_(i-1)) / i.
+// Processes (ss).shape[0] elements, starting at `array` and stepping by
+// (ss).strides[0], then stores the mean in *rarray, and increments `rarray`.
+// `array` is left pointing past the lane; callers must rewind it.
+#define RUN_MEAN1(type, array, rarray, ss)\
+({\
+    mp_float_t M = 0.0;\
+    for(size_t i=0; i < (ss).shape[0]; i++) {\
+        mp_float_t value = (mp_float_t)(*(type *)(array));\
+        M = M + (value - M) / (mp_float_t)(i+1);\
+        (array) += (ss).strides[0];\
+    }\
+    *(rarray)++ = M;\
+})
+
+// Instead of the straightforward implementation of the definition,
+// we take the numerically stable Welford algorithm here
+// https://www.johndcook.com/blog/2008/09/26/comparing-three-methods-of-computing-standard-deviation/
+// M holds the running mean, and S accumulates the sum of squared deviations.
+// Processes (ss).shape[0] elements, starting at `array` and stepping by
+// (ss).strides[0], then stores sqrt(S / div) in *rarray, and increments
+// `rarray`. `array` is left pointing past the lane; callers must rewind it.
+#define RUN_STD1(type, array, rarray, ss, div)\
+({\
+    mp_float_t M = 0.0, m = 0.0, S = 0.0;\
+    for(size_t i=0; i < (ss).shape[0]; i++) {\
+        mp_float_t value = (mp_float_t)(*(type *)(array));\
+        m = M + (value - M) / (mp_float_t)(i+1);\
+        S = S + (value - M) * (value - m);\
+        M = m;\
+        (array) += (ss).strides[0];\
+    }\
+    *(rarray)++ = MICROPY_FLOAT_C_FUN(sqrt)(S / (div));\
+})
+
+// Combined running mean / standard deviation over one lane of (ss).shape[0]
+// elements, starting at `array` and stepping by (ss).strides[0]. The running
+// mean M is always updated; the sum of squared deviations S is accumulated
+// only if `isStd` is true. Stores sqrt(S / div) if `isStd` is true, otherwise
+// the mean, in *rarray, and increments `rarray`. `array` is left pointing past
+// the lane; callers must rewind it.
+#define RUN_MEAN_STD1(type, array, rarray, ss, div, isStd)\
+({\
+    mp_float_t M = 0.0, m = 0.0, S = 0.0;\
+    for(size_t i=0; i < (ss).shape[0]; i++) {\
+        mp_float_t value = (mp_float_t)(*(type *)(array));\
+        m = M + (value - M) / (mp_float_t)(i+1);\
+        if(isStd) {\
+            S += (value - M) * (value - m);\
+        }\
+        M = m;\
+        (array) += (ss).strides[0];\
+    }\
+    *(rarray)++ = isStd ? MICROPY_FLOAT_C_FUN(sqrt)(S / (div)) : M;\
+})
+
+// Applies an (N+1)-point stencil for every element of the last axis of
+// `results`: starting at the current position of `array`, N+1 source values
+// are visited by stepping (ndarray)->strides[index], and stencil[d] * value is
+// SUBTRACTED from an accumulator of the element type. The stencil must
+// therefore hold the negated weights of the desired linear combination.
+// After each output element, `array` moves by the stride of the last axis of
+// `ndarray`, and `rarray` by (results)->itemsize; neither pointer is rewound
+// here.
+#define RUN_DIFF1(ndarray, type, array, results, rarray, index, stencil, N)\
+({\
+    for(size_t i=0; i < (results)->shape[ULAB_MAX_DIMS - 1]; i++) {\
+        type sum = 0;\
+        uint8_t *source = (array);\
+        for(uint8_t d=0; d < (N)+1; d++) {\
+            sum -= (stencil)[d] * *((type *)source);\
+            source += (ndarray)->strides[(index)];\
+        }\
+        (array) += (ndarray)->strides[ULAB_MAX_DIMS - 1];\
+        *(type *)(rarray) = sum;\
+        (rarray) += (results)->itemsize;\
+    }\
+})
+
+// In-place heap sort (ascending) of N elements of the given type. `array` is
+// reinterpreted as a pointer to `type`, and element n lives at
+// _array[n*increment], i.e. `increment` is counted in elements, not in bytes.
+// While r > 0, the heap is being built by sifting down the nodes r-1, ..., 0;
+// afterwards, q shrinks the heap by one in every round: the last element of
+// the heap is taken out, the root (the maximum) is moved into its place, and
+// the element taken out is sifted down from the root.
+// NOTE(review): with N == 0, q-- wraps around before the q == 0 test -
+// confirm that callers never pass an empty lane.
+#define HEAPSORT1(type, array, increment, N)\
+({\
+    type *_array = (type *)array;\
+    type tmp;\
+    size_t c, q = (N), p, r = (N) >> 1;\
+    for (;;) {\
+        if (r > 0) {\
+            tmp = _array[(--r)*(increment)];\
+        } else {\
+            q--;\
+            if(q == 0) {\
+                break;\
+            }\
+            tmp = _array[q*(increment)];\
+            _array[q*(increment)] = _array[0];\
+        }\
+        p = r;\
+        c = r + r + 1;\
+        while (c < q) {\
+            if((c + 1 < q)  &&  (_array[(c+1)*(increment)] > _array[c*(increment)])) {\
+                c++;\
+            }\
+            if(_array[c*(increment)] > tmp) {\
+                _array[p*(increment)] = _array[c*(increment)];\
+                p = c;\
+                c = p + p + 1;\
+            } else {\
+                break;\
+            }\
+        }\
+        _array[p*(increment)] = tmp;\
+    }\
+})
+
+// Indirect heap sort: follows the same procedure as HEAPSORT1, but the data in
+// `array` are only read, and it is the entries of `iarray` that are permuted,
+// so that the data values they refer to end up in ascending order.
+// Data element n lives at _array[n*increment], index entry n at
+// (iarray)[n*iincrement]; both increments are counted in elements.
+// All counters and indices are uint16_t.
+// NOTE(review): `iarray` is presumably pre-filled with 0..N-1 by the caller,
+// and N is presumably limited to the uint16_t range - confirm at the call site.
+#define HEAP_ARGSORT1(type, array, increment, N, iarray, iincrement)\
+({\
+    type *_array = (type *)array;\
+    type tmp;\
+    uint16_t itmp, c, q = (N), p, r = (N) >> 1;\
+    for (;;) {\
+        if (r > 0) {\
+            r--;\
+            itmp = (iarray)[r*(iincrement)];\
+            tmp = _array[itmp*(increment)];\
+        } else {\
+            q--;\
+            if(q == 0) {\
+                break;\
+            }\
+            itmp = (iarray)[q*(iincrement)];\
+            tmp = _array[itmp*(increment)];\
+            (iarray)[q*(iincrement)] = (iarray)[0];\
+        }\
+        p = r;\
+        c = r + r + 1;\
+        while (c < q) {\
+            if((c + 1 < q)  &&  (_array[(iarray)[(c+1)*(iincrement)]*(increment)] > _array[(iarray)[c*(iincrement)]*(increment)])) {\
+                c++;\
+            }\
+            if(_array[(iarray)[c*(iincrement)]*(increment)] > tmp) {\
+                (iarray)[p*(iincrement)] = (iarray)[c*(iincrement)];\
+                p = c;\
+                c = p + p + 1;\
+            } else {\
+                break;\
+            }\
+        }\
+        (iarray)[p*(iincrement)] = itmp;\
+    }\
+})
+
+#if ULAB_MAX_DIMS == 1
+// ULAB_MAX_DIMS == 1: there is only a single lane, so the drivers simply
+// forward to the corresponding single-lane macros.
+#define RUN_SUM(type, array, results, rarray, ss) do {\
+    RUN_SUM1(type, (array), (results), (rarray), (ss));\
+} while(0)
+
+#define RUN_MEAN(type, array, rarray, ss) do {\
+    RUN_MEAN1(type, (array), (rarray), (ss));\
+} while(0)
+
+// ULAB_MAX_DIMS == 1: standard deviation of the single lane. RUN_STD1 takes
+// (type, array, rarray, ss, div); there is no `results` argument to pass on.
+#define RUN_STD(type, array, rarray, ss, div) do {\
+    RUN_STD1(type, (array), (rarray), (ss), (div));\
+} while(0)
+
+// ULAB_MAX_DIMS == 1: the remaining drivers forward to the single-lane macros,
+// too. The shape, strides, and istrides arguments (and index for the sorting
+// macros) are accepted for the sake of a uniform signature, and are not used.
+#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do {\
+    RUN_MEAN_STD1(type, (array), (rarray), (ss), (div), (isStd));\
+} while(0)
+
+#define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do {\
+    RUN_ARGMIN1((ndarray), type, (array), (results), (rarray), (index), (op));\
+} while(0)
+
+#define RUN_DIFF(ndarray, type, array, results, rarray, shape, strides, index, stencil, N) do {\
+    RUN_DIFF1((ndarray), type, (array), (results), (rarray), (index), (stencil), (N));\
+} while(0)
+
+#define HEAPSORT(ndarray, type, array, shape, strides, index, increment, N) do {\
+    HEAPSORT1(type, (array), (increment), (N));\
+} while(0)
+
+#define HEAP_ARGSORT(ndarray, type, array, shape, strides, index, increment, N, iarray, istrides, iincrement) do {\
+    HEAP_ARGSORT1(type, (array), (increment), (N), (iarray), (iincrement));\
+} while(0)
+
+#endif
+
+#if ULAB_MAX_DIMS == 2
+// ULAB_MAX_DIMS == 2: the reduction drivers run the single-lane macro once for
+// each of the (ss).shape[ULAB_MAX_DIMS - 1] lanes. After each lane, `array` is
+// rewound by the distance the single-lane macro has covered
+// ((ss).strides[0] * (ss).shape[0]), and is then moved to the beginning of the
+// next lane by (ss).strides[ULAB_MAX_DIMS - 1]. Since these are do-while
+// loops, the body is executed at least once.
+#define RUN_SUM(type, array, results, rarray, ss) do {\
+    size_t l = 0;\
+    do {\
+        RUN_SUM1(type, (array), (results), (rarray), (ss));\
+        (array) -= (ss).strides[0] * (ss).shape[0];\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+} while(0)
+
+#define RUN_MEAN(type, array, rarray, ss) do {\
+    size_t l = 0;\
+    do {\
+        RUN_MEAN1(type, (array), (rarray), (ss));\
+        (array) -= (ss).strides[0] * (ss).shape[0];\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+} while(0)
+
+#define RUN_STD(type, array, rarray, ss, div) do {\
+    size_t l = 0;\
+    do {\
+        RUN_STD1(type, (array), (rarray), (ss), (div));\
+        (array) -= (ss).strides[0] * (ss).shape[0];\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+} while(0)
+
+#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do {\
+    size_t l = 0;\
+    do {\
+        RUN_MEAN_STD1(type, (array), (rarray), (ss), (div), (isStd));\
+        (array) -= (ss).strides[0] * (ss).shape[0];\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+} while(0)
+
+
+// ULAB_MAX_DIMS == 2: runs RUN_ARGMIN1 once for each of the
+// (shape)[ULAB_MAX_DIMS - 1] lanes. After each lane, `array` is rewound by the
+// length of the scanned axis of `ndarray`, and is then moved to the next lane
+// by (strides)[ULAB_MAX_DIMS - 1]. The body is executed at least once.
+#define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do {\
+    size_t l = 0;\
+    do {\
+        RUN_ARGMIN1((ndarray), type, (array), (results), (rarray), (index), (op));\
+        (array) -= (ndarray)->strides[(index)] * (ndarray)->shape[(index)];\
+        (array) += (strides)[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (shape)[ULAB_MAX_DIMS - 1]);\
+} while(0)
+
+// ULAB_MAX_DIMS == 2: runs RUN_DIFF1 once for each of the
+// (results)->shape[ULAB_MAX_DIMS - 2] rows of the output. After each row, both
+// `array` and `rarray` are rewound by the distance RUN_DIFF1 has covered along
+// the last axis, and are then moved to the next row by the stride of the
+// second-to-last axis of `ndarray` and `results`, respectively. The `shape`
+// and `strides` arguments are not used. The body is executed at least once,
+// even if `results` holds no elements.
+#define RUN_DIFF(ndarray, type, array, results, rarray, shape, strides, index, stencil, N) do {\
+    size_t l = 0;\
+    do {\
+        RUN_DIFF1((ndarray), type, (array), (results), (rarray), (index), (stencil), (N));\
+        (array) -= (ndarray)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
+        (array) += (ndarray)->strides[ULAB_MAX_DIMS - 2];\
+        (rarray) -= (results)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
+        (rarray) += (results)->strides[ULAB_MAX_DIMS - 2];\
+        l++;\
+    } while(l < (results)->shape[ULAB_MAX_DIMS - 2]);\
+} while(0)
+
+#define HEAPSORT(ndarray, type, array, shape, strides, index, increment, N) do { /* repeats HEAPSORT1 for successive positions of (array); the ndarray and index parameters are not referenced in the expansion */\
+    size_t l = 0;\
+    do { /* do/while: the body runs once even if the bound below is 0 */\
+        HEAPSORT1(type, (array), (increment), (N));\
+        (array) += (strides)[ULAB_MAX_DIMS - 1]; /* no rewind here, unlike the RUN_* macros; presumably HEAPSORT1 leaves (array) unchanged -- TODO confirm */\
+        l++;\
+    } while(l < (shape)[ULAB_MAX_DIMS - 1]);\
+} while(0)
+
+#define HEAP_ARGSORT(ndarray, type, array, shape, strides, index, increment, N, iarray, istrides, iincrement) do { /* repeats HEAP_ARGSORT1, stepping the data pointer (array) and the index pointer (iarray) together; ndarray and index are not referenced */\
+    size_t l = 0;\
+    do { /* do/while: the body runs once even if the bound below is 0 */\
+        HEAP_ARGSORT1(type, (array), (increment), (N), (iarray), (iincrement));\
+        (array) += (strides)[ULAB_MAX_DIMS - 1]; /* data pointer: one stride forward */\
+        (iarray) += (istrides)[ULAB_MAX_DIMS - 1]; /* index pointer: one stride forward, using its own stride table */\
+        l++;\
+    } while(l < (shape)[ULAB_MAX_DIMS - 1]);\
+} while(0)
+
+#endif
+
+#if ULAB_MAX_DIMS == 3
+#define RUN_SUM(type, array, results, rarray, ss) do { /* ULAB_MAX_DIMS == 3 variant: two nested loops around RUN_SUM1; (array) is modified in place */\
+    size_t k = 0;\
+    do { /* outer loop, bounded by (ss).shape[ULAB_MAX_DIMS - 2]; runs at least once */\
+        size_t l = 0;\
+        do { /* inner loop, bounded by (ss).shape[ULAB_MAX_DIMS - 1]; runs at least once */\
+            RUN_SUM1(type, (array), (results), (rarray), (ss));\
+            (array) -= (ss).strides[0] * (ss).shape[0]; /* presumably undoes the advance RUN_SUM1 applied to (array) -- TODO confirm */\
+            (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+            l++;\
+        } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1]; /* take back stride * extent of the inner loop */\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 2]; /* one stride forward for the outer loop */\
+        k++;\
+    } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
+} while(0)
+
+#define RUN_MEAN(type, array, rarray, ss) do { /* ULAB_MAX_DIMS == 3 variant: two nested loops around RUN_MEAN1; (array) is modified in place */\
+    size_t k = 0;\
+    do { /* outer loop, bounded by (ss).shape[ULAB_MAX_DIMS - 2]; runs at least once */\
+        size_t l = 0;\
+        do { /* inner loop, bounded by (ss).shape[ULAB_MAX_DIMS - 1]; runs at least once */\
+            RUN_MEAN1(type, (array), (rarray), (ss));\
+            (array) -= (ss).strides[0] * (ss).shape[0]; /* presumably undoes the advance RUN_MEAN1 applied to (array) -- TODO confirm */\
+            (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+            l++;\
+        } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1]; /* take back stride * extent of the inner loop */\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 2]; /* one stride forward for the outer loop */\
+        k++;\
+    } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
+} while(0)
+
+#define RUN_STD(type, array, rarray, ss, div) do { /* ULAB_MAX_DIMS == 3 variant: two nested loops around RUN_STD1; (array) is modified in place */\
+    size_t k = 0;\
+    do { /* outer loop, bounded by (ss).shape[ULAB_MAX_DIMS - 2]; runs at least once */\
+        size_t l = 0;\
+        do { /* inner loop, bounded by (ss).shape[ULAB_MAX_DIMS - 1]; runs at least once */\
+            RUN_STD1(type, (array), (rarray), (ss), (div));\
+            (array) -= (ss).strides[0] * (ss).shape[0]; /* presumably undoes the advance RUN_STD1 applied to (array) -- TODO confirm */\
+            (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+            l++;\
+        } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1]; /* take back stride * extent of the inner loop */\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 2]; /* one stride forward for the outer loop */\
+        k++;\
+    } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
+} while(0)
+
+#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do { /* ULAB_MAX_DIMS == 3 variant: two nested loops around RUN_MEAN_STD1; (array) is modified in place */\
+    size_t k = 0;\
+    do { /* outer loop, bounded by (ss).shape[ULAB_MAX_DIMS - 2]; runs at least once */\
+        size_t l = 0;\
+        do { /* inner loop, bounded by (ss).shape[ULAB_MAX_DIMS - 1]; runs at least once */\
+            RUN_MEAN_STD1(type, (array), (rarray), (ss), (div), (isStd));\
+            (array) -= (ss).strides[0] * (ss).shape[0]; /* presumably undoes the advance RUN_MEAN_STD1 applied to (array) -- TODO confirm */\
+            (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+            l++;\
+        } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1]; /* take back stride * extent of the inner loop */\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 2]; /* one stride forward for the outer loop */\
+        k++;\
+    } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
+} while(0)
+
+#define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do { /* ULAB_MAX_DIMS == 3 variant: two nested loops around RUN_ARGMIN1; bounds and steps come from (shape)/(strides) */\
+    size_t k = 0;\
+    do { /* outer loop, bounded by (shape)[ULAB_MAX_DIMS - 2]; runs at least once */\
+        size_t l = 0;\
+        do { /* inner loop, bounded by (shape)[ULAB_MAX_DIMS - 1]; runs at least once */\
+            RUN_ARGMIN1((ndarray), type, (array), (results), (rarray), (index), (op));\
+            (array) -= (ndarray)->strides[(index)] * (ndarray)->shape[(index)]; /* presumably undoes the walk RUN_ARGMIN1 made along (index) -- TODO confirm */\
+            (array) += (strides)[ULAB_MAX_DIMS - 1];\
+            l++;\
+        } while(l < (shape)[ULAB_MAX_DIMS - 1]);\
+        (array) -= (strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1]; /* take back stride * extent of the inner loop */\
+        (array) += (strides)[ULAB_MAX_DIMS - 2]; /* one stride forward for the outer loop */\
+        k++;\
+    } while(k < (shape)[ULAB_MAX_DIMS - 2]);\
+} while(0)
+
+#define RUN_DIFF(ndarray, type, array, results, rarray, shape, strides, index, stencil, N) do { /* ULAB_MAX_DIMS == 3 variant: two nested loops around RUN_DIFF1, moving both (array) and (rarray); (strides) is not referenced */\
+    size_t k = 0;\
+    do { /* outer loop, bounded by (shape)[ULAB_MAX_DIMS - 3]; runs at least once */\
+        size_t l = 0;\
+        do { /* inner loop; runs at least once */\
+            RUN_DIFF1((ndarray), type, (array), (results), (rarray), (index), (stencil), (N));\
+            (array) -= (ndarray)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1]; /* input stride times results extent; presumably matches the advance made by RUN_DIFF1 -- TODO confirm */\
+			(array) += (ndarray)->strides[ULAB_MAX_DIMS - 2];\
+            (rarray) -= (results)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
+            (rarray) += (results)->strides[ULAB_MAX_DIMS - 2];\
+            l++;\
+        } while(l < (shape)[ULAB_MAX_DIMS - 2]); /* NOTE(review): bound reads (shape) while the rewinds below use (results)->shape; the single-loop variant bounds on (results)->shape -- confirm both agree at the call site */\
+        (array) -= (ndarray)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS-2]; /* take back the inner loop's steps on the input pointer */\
+        (array) += (ndarray)->strides[ULAB_MAX_DIMS - 3];\
+        (rarray) -= (results)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2]; /* and on the output pointer */\
+        (rarray) += (results)->strides[ULAB_MAX_DIMS - 3];\
+        k++;\
+    } while(k < (shape)[ULAB_MAX_DIMS - 3]);\
+} while(0)
+
+#define HEAPSORT(ndarray, type, array, shape, strides, index, increment, N) do { /* ULAB_MAX_DIMS == 3 variant: two nested loops around HEAPSORT1; the ndarray and index parameters are not referenced in the expansion */\
+    size_t k = 0;\
+    do { /* outer loop, bounded by (shape)[ULAB_MAX_DIMS - 2]; runs at least once */\
+        size_t l = 0;\
+        do { /* inner loop, bounded by (shape)[ULAB_MAX_DIMS - 1]; runs at least once */\
+            HEAPSORT1(type, (array), (increment), (N));\
+            (array) += (strides)[ULAB_MAX_DIMS - 1]; /* no per-call rewind; presumably HEAPSORT1 leaves (array) unchanged -- TODO confirm */\
+            l++;\
+        } while(l < (shape)[ULAB_MAX_DIMS - 1]);\
+        (array) -= (strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1]; /* take back stride * extent of the inner loop */\
+        (array) += (strides)[ULAB_MAX_DIMS - 2]; /* one stride forward for the outer loop */\
+        k++;\
+    } while(k < (shape)[ULAB_MAX_DIMS - 2]);\
+} while(0)
+
+#define HEAP_ARGSORT(ndarray, type, array, shape, strides, index, increment, N, iarray, istrides, iincrement) do { /* ULAB_MAX_DIMS == 3 variant: two nested loops around HEAP_ARGSORT1; data pointer (array) and index pointer (iarray) move in lockstep; ndarray and index are not referenced */\
+    size_t k = 0;\
+    do { /* outer loop, bounded by (shape)[ULAB_MAX_DIMS - 2]; runs at least once */\
+        size_t l = 0;\
+        do { /* inner loop, bounded by (shape)[ULAB_MAX_DIMS - 1]; runs at least once */\
+            HEAP_ARGSORT1(type, (array), (increment), (N), (iarray), (iincrement));\
+            (array) += (strides)[ULAB_MAX_DIMS - 1];\
+            (iarray) += (istrides)[ULAB_MAX_DIMS - 1];\
+            l++;\
+        } while(l < (shape)[ULAB_MAX_DIMS - 1]);\
+        (iarray) -= (istrides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1]; /* index pointer: take back the inner loop's steps; both pointers share the (shape) extents */\
+        (iarray) += (istrides)[ULAB_MAX_DIMS - 2];\
+        (array) -= (strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1]; /* data pointer: same rewind with its own strides */\
+        (array) += (strides)[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (shape)[ULAB_MAX_DIMS - 2]);\
+} while(0)
+
+#endif
+
+#if ULAB_MAX_DIMS == 4
+#define RUN_SUM(type, array, results, rarray, ss) do { /* ULAB_MAX_DIMS == 4 variant: three nested loops around RUN_SUM1; (array) is modified in place */\
+    size_t j = 0;\
+    do { /* outermost loop, bounded by (ss).shape[ULAB_MAX_DIMS - 3]; runs at least once */\
+        size_t k = 0;\
+        do { /* middle loop, bounded by (ss).shape[ULAB_MAX_DIMS - 2]; runs at least once */\
+            size_t l = 0;\
+            do { /* innermost loop, bounded by (ss).shape[ULAB_MAX_DIMS - 1]; runs at least once */\
+                RUN_SUM1(type, (array), (results), (rarray), (ss));\
+                (array) -= (ss).strides[0] * (ss).shape[0]; /* presumably undoes the advance RUN_SUM1 applied to (array) -- TODO confirm */\
+                (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+                l++;\
+            } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+            (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1]; /* take back stride * extent of the innermost loop */\
+            (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
+            k++;\
+        } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2]; /* take back stride * extent of the middle loop */\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\
+} while(0)
+
+#define RUN_MEAN(type, array, rarray, ss) do { /* ULAB_MAX_DIMS == 4 variant: three nested loops around RUN_MEAN1; (array) is modified in place */\
+    size_t j = 0;\
+    do { /* outermost loop, bounded by (ss).shape[ULAB_MAX_DIMS - 3]; runs at least once */\
+        size_t k = 0;\
+        do { /* middle loop, bounded by (ss).shape[ULAB_MAX_DIMS - 2]; runs at least once */\
+            size_t l = 0;\
+            do { /* innermost loop, bounded by (ss).shape[ULAB_MAX_DIMS - 1]; runs at least once */\
+                RUN_MEAN1(type, (array), (rarray), (ss));\
+                (array) -= (ss).strides[0] * (ss).shape[0]; /* presumably undoes the advance RUN_MEAN1 applied to (array) -- TODO confirm */\
+                (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+                l++;\
+            } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+            (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1]; /* take back stride * extent of the innermost loop */\
+            (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
+            k++;\
+        } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2]; /* take back stride * extent of the middle loop */\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\
+} while(0)
+
+#define RUN_STD(type, array, rarray, ss, div) do { /* ULAB_MAX_DIMS == 4 variant: three nested loops around RUN_STD1; (array) is modified in place */\
+    size_t j = 0;\
+    do { /* outermost loop, bounded by (ss).shape[ULAB_MAX_DIMS - 3]; runs at least once */\
+        size_t k = 0;\
+        do { /* middle loop, bounded by (ss).shape[ULAB_MAX_DIMS - 2]; runs at least once */\
+            size_t l = 0;\
+            do { /* innermost loop, bounded by (ss).shape[ULAB_MAX_DIMS - 1]; runs at least once */\
+                RUN_STD1(type, (array), (rarray), (ss), (div));\
+                (array) -= (ss).strides[0] * (ss).shape[0]; /* presumably undoes the advance RUN_STD1 applied to (array) -- TODO confirm */\
+                (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+                l++;\
+            } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+            (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1]; /* take back stride * extent of the innermost loop */\
+            (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
+            k++;\
+        } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2]; /* take back stride * extent of the middle loop */\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\
+} while(0)
+
+#define RUN_MEAN_STD(type, array, rarray, ss, div, isStd) do { /* ULAB_MAX_DIMS == 4 variant: three nested loops around RUN_MEAN_STD1; (array) is modified in place */\
+    size_t j = 0;\
+    do { /* outermost loop, bounded by (ss).shape[ULAB_MAX_DIMS - 3]; runs at least once */\
+        size_t k = 0;\
+        do { /* middle loop, bounded by (ss).shape[ULAB_MAX_DIMS - 2]; runs at least once */\
+            size_t l = 0;\
+            do { /* innermost loop, bounded by (ss).shape[ULAB_MAX_DIMS - 1]; runs at least once */\
+                RUN_MEAN_STD1(type, (array), (rarray), (ss), (div), (isStd));\
+                (array) -= (ss).strides[0] * (ss).shape[0]; /* presumably undoes the advance RUN_MEAN_STD1 applied to (array) -- TODO confirm */\
+                (array) += (ss).strides[ULAB_MAX_DIMS - 1];\
+                l++;\
+            } while(l < (ss).shape[ULAB_MAX_DIMS - 1]);\
+            (array) -= (ss).strides[ULAB_MAX_DIMS - 1] * (ss).shape[ULAB_MAX_DIMS - 1]; /* take back stride * extent of the innermost loop */\
+            (array) += (ss).strides[ULAB_MAX_DIMS - 2];\
+            k++;\
+        } while(k < (ss).shape[ULAB_MAX_DIMS - 2]);\
+        (array) -= (ss).strides[ULAB_MAX_DIMS - 2] * (ss).shape[ULAB_MAX_DIMS - 2]; /* take back stride * extent of the middle loop */\
+        (array) += (ss).strides[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (ss).shape[ULAB_MAX_DIMS - 3]);\
+} while(0)
+
+#define RUN_ARGMIN(ndarray, type, array, results, rarray, shape, strides, index, op) do { /* ULAB_MAX_DIMS == 4 variant: three nested loops around RUN_ARGMIN1; bounds and steps come from (shape)/(strides) */\
+    size_t j = 0;\
+    do { /* outermost loop, bounded by (shape)[ULAB_MAX_DIMS - 3]; runs at least once */\
+        size_t k = 0;\
+        do { /* middle loop, bounded by (shape)[ULAB_MAX_DIMS - 2]; runs at least once */\
+            size_t l = 0;\
+            do { /* innermost loop, bounded by (shape)[ULAB_MAX_DIMS - 1]; runs at least once */\
+                RUN_ARGMIN1((ndarray), type, (array), (results), (rarray), (index), (op));\
+                (array) -= (ndarray)->strides[(index)] * (ndarray)->shape[(index)]; /* presumably undoes the walk RUN_ARGMIN1 made along (index) -- TODO confirm */\
+                (array) += (strides)[ULAB_MAX_DIMS - 1];\
+                l++;\
+            } while(l < (shape)[ULAB_MAX_DIMS - 1]);\
+            (array) -= (strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1]; /* take back stride * extent of the innermost loop */\
+            (array) += (strides)[ULAB_MAX_DIMS - 2];\
+            k++;\
+        } while(k < (shape)[ULAB_MAX_DIMS - 2]);\
+        (array) -= (strides)[ULAB_MAX_DIMS - 2] * (shape)[ULAB_MAX_DIMS-2]; /* take back stride * extent of the middle loop */\
+        (array) += (strides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (shape)[ULAB_MAX_DIMS - 3]);\
+} while(0)
+
+#define RUN_DIFF(ndarray, type, array, results, rarray, shape, strides, index, stencil, N) do { /* ULAB_MAX_DIMS == 4 variant: three nested loops around RUN_DIFF1, moving both (array) and (rarray) */\
+    size_t j = 0;\
+    do { /* outermost loop, bounded by (shape)[ULAB_MAX_DIMS - 4]; runs at least once */\
+        size_t k = 0;\
+        do { /* middle loop, bounded by (shape)[ULAB_MAX_DIMS - 3]; runs at least once */\
+            size_t l = 0;\
+            do { /* innermost loop, bounded by (shape)[ULAB_MAX_DIMS - 2]; runs at least once */\
+                RUN_DIFF1((ndarray), type, (array), (results), (rarray), (index), (stencil), (N));\
+                (array) -= (ndarray)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1]; /* input stride times results extent; presumably matches the advance made by RUN_DIFF1 -- TODO confirm */\
+                (array) += (ndarray)->strides[ULAB_MAX_DIMS - 2];\
+                (rarray) -= (results)->strides[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
+                (rarray) += (results)->strides[ULAB_MAX_DIMS - 2];\
+                l++;\
+            } while(l < (shape)[ULAB_MAX_DIMS - 2]);\
+            (array) -= (strides)[ULAB_MAX_DIMS - 2] * (shape)[ULAB_MAX_DIMS-2]; /* NOTE(review): this rewind uses (strides)/(shape) although the matching step above used (ndarray)->strides; the ULAB_MAX_DIMS == 3 variant uses (ndarray)->strides and (results)->shape here -- confirm these are equivalent at the call site */\
+            (array) += (strides)[ULAB_MAX_DIMS - 3];\
+            (rarray) -= (results)->strides[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2]; /* output pointer: take back the innermost loop's steps */\
+            (rarray) += (results)->strides[ULAB_MAX_DIMS - 3];\
+            k++;\
+        } while(k < (shape)[ULAB_MAX_DIMS - 3]);\
+        (array) -= (strides)[ULAB_MAX_DIMS - 3] * (shape)[ULAB_MAX_DIMS-3]; /* take back the middle loop's steps on the input pointer */\
+        (array) += (strides)[ULAB_MAX_DIMS - 4];\
+        (rarray) -= (results)->strides[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3]; /* and on the output pointer */\
+        (rarray) += (results)->strides[ULAB_MAX_DIMS - 4];\
+        j++;\
+    } while(j < (shape)[ULAB_MAX_DIMS - 4]);\
+} while(0)
+
+#define HEAPSORT(ndarray, type, array, shape, strides, index, increment, N) do { /* ULAB_MAX_DIMS == 4 variant: three nested loops around HEAPSORT1; the ndarray and index parameters are not referenced in the expansion */\
+    size_t j = 0;\
+    do { /* outermost loop, bounded by (shape)[ULAB_MAX_DIMS - 3]; runs at least once */\
+        size_t k = 0;\
+        do { /* middle loop, bounded by (shape)[ULAB_MAX_DIMS - 2]; runs at least once */\
+            size_t l = 0;\
+            do { /* innermost loop, bounded by (shape)[ULAB_MAX_DIMS - 1]; runs at least once */\
+                HEAPSORT1(type, (array), (increment), (N));\
+                (array) += (strides)[ULAB_MAX_DIMS - 1]; /* no per-call rewind; presumably HEAPSORT1 leaves (array) unchanged -- TODO confirm */\
+                l++;\
+            } while(l < (shape)[ULAB_MAX_DIMS - 1]);\
+            (array) -= (strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1]; /* take back stride * extent of the innermost loop */\
+            (array) += (strides)[ULAB_MAX_DIMS - 2];\
+            k++;\
+        } while(k < (shape)[ULAB_MAX_DIMS - 2]);\
+        (array) -= (strides)[ULAB_MAX_DIMS - 2] * (shape)[ULAB_MAX_DIMS-2]; /* take back stride * extent of the middle loop */\
+        (array) += (strides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (shape)[ULAB_MAX_DIMS - 3]);\
+} while(0)
+
+#define HEAP_ARGSORT(ndarray, type, array, shape, strides, index, increment, N, iarray, istrides, iincrement) do { /* ULAB_MAX_DIMS == 4 variant: three nested loops around HEAP_ARGSORT1; data pointer (array) and index pointer (iarray) move in lockstep; ndarray and index are not referenced */\
+    size_t j = 0;\
+    do { /* outermost loop, bounded by (shape)[ULAB_MAX_DIMS - 3]; runs at least once */\
+        size_t k = 0;\
+        do { /* middle loop, bounded by (shape)[ULAB_MAX_DIMS - 2]; runs at least once */\
+            size_t l = 0;\
+            do { /* innermost loop, bounded by (shape)[ULAB_MAX_DIMS - 1]; runs at least once */\
+                HEAP_ARGSORT1(type, (array), (increment), (N), (iarray), (iincrement));\
+                (array) += (strides)[ULAB_MAX_DIMS - 1];\
+                (iarray) += (istrides)[ULAB_MAX_DIMS - 1];\
+                l++;\
+            } while(l < (shape)[ULAB_MAX_DIMS - 1]);\
+            (iarray) -= (istrides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1]; /* index pointer: take back the innermost loop's steps; both pointers share the (shape) extents */\
+            (iarray) += (istrides)[ULAB_MAX_DIMS - 2];\
+            (array) -= (strides)[ULAB_MAX_DIMS - 1] * (shape)[ULAB_MAX_DIMS-1]; /* data pointer: same rewind with its own strides */\
+            (array) += (strides)[ULAB_MAX_DIMS - 2];\
+            k++;\
+        } while(k < (shape)[ULAB_MAX_DIMS - 2]);\
+        (iarray) -= (istrides)[ULAB_MAX_DIMS - 2] * (shape)[ULAB_MAX_DIMS-2]; /* index pointer: take back the middle loop's steps */\
+        (iarray) += (istrides)[ULAB_MAX_DIMS - 3];\
+        (array) -= (strides)[ULAB_MAX_DIMS - 2] * (shape)[ULAB_MAX_DIMS-2]; /* data pointer: same rewind */\
+        (array) += (strides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (shape)[ULAB_MAX_DIMS - 3]);\
+} while(0)
+
+#endif
+
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_all_obj);           // function objects of this module, listed alphabetically
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_any_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_argmax_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_argmin_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_argsort_obj);
+MP_DECLARE_CONST_FUN_OBJ_2(numerical_cross_obj);          // the only entry declared with the _2 macro instead of _KW
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_diff_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_flip_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_max_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_mean_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_median_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_min_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_roll_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_sort_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_sort_inplace_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_std_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(numerical_sum_obj);
+
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/numpy.c b/circuitpython/extmod/ulab/code/numpy/numpy.c
new file mode 100644
index 0000000..ebd171d
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/numpy.c
@@ -0,0 +1,383 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020 Jeff Epler for Adafruit Industries
+ *               2020 Scott Shawcroft for Adafruit Industries
+ *               2020-2022 Zoltán Vörös
+ *               2020 Taku Fukada
+*/
+
+#include <math.h>
+#include <string.h>
+#include "py/runtime.h"
+
+#include "numpy.h"
+#include "approx.h"
+#include "carray/carray.h"
+#include "compare.h"
+#include "create.h"
+#include "fft/fft.h"
+#include "filter.h"
+#include "linalg/linalg.h"
+#include "numerical.h"
+#include "stats.h"
+#include "transform.h"
+#include "poly.h"
+#include "vector.h"
+
+//| """Compatibility layer for numpy"""
+//|
+
+//| class ndarray: ...
+
+//| def get_printoptions() -> Dict[str, int]:
+//|     """Get printing options"""
+//|     ...
+//|
+//| def set_printoptions(threshold: Optional[int] = None, edgeitems: Optional[int] = None) -> None:
+//|     """Set printing options"""
+//|     ...
+//|
+//| def ndinfo(array: ulab.numpy.ndarray) -> None:
+//|     ...
+//|
+//| def array(
+//|     values: Union[ndarray, Iterable[Union[_float, _bool, Iterable[Any]]]],
+//|     *,
+//|     dtype: _DType = ulab.numpy.float
+//| ) -> ulab.numpy.ndarray:
+//|     """alternate constructor function for `ulab.numpy.ndarray`. Mirrors numpy.array"""
+//|     ...
+
+// math constants
+#if ULAB_NUMPY_HAS_E // ulab_const_float_e: the value stored under the name e in the module table
+#if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C // 0x402df854 is the single-precision bit pattern of e; the low two bits are replaced by 0b10 and 0x80800000 is added (presumably the REPR_C float encoding -- confirm against py/obj.h)
+#define ulab_const_float_e MP_ROM_PTR((mp_obj_t)(((0x402df854 & ~3) | 2) + 0x80800000))
+#elif MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D // 0x4005bf0a8b145769 is the double-precision bit pattern of e, offset by 0x8004000000000000 (presumably the REPR_D float encoding -- confirm)
+#define ulab_const_float_e {((mp_obj_t)((uint64_t)0x4005bf0a8b145769 + 0x8004000000000000))}
+#else // remaining representations: a statically allocated float object holding MP_E, referenced by pointer
+mp_obj_float_t ulab_const_float_e_obj = {{&mp_type_float}, MP_E};
+#define ulab_const_float_e MP_ROM_PTR(&ulab_const_float_e_obj)
+#endif
+#endif /* ULAB_NUMPY_HAS_E */
+
+#if ULAB_NUMPY_HAS_INF // numpy_const_float_inf: the value stored under the name inf in the module table
+#if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C // 0x7f800002 is the single-precision +infinity pattern 0x7f800000 with bit 1 already set; 0x80800000 is added as for the other constants (presumably the REPR_C float encoding -- confirm)
+#define numpy_const_float_inf MP_ROM_PTR((mp_obj_t)(0x7f800002 + 0x80800000))
+#elif MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D // 0x7ff0000000000000 is the double-precision +infinity pattern, offset by 0x8004000000000000 (presumably the REPR_D float encoding -- confirm)
+#define numpy_const_float_inf {((mp_obj_t)((uint64_t)0x7ff0000000000000 + 0x8004000000000000))}
+#else // remaining representations: a statically allocated float object holding INFINITY, referenced by pointer
+mp_obj_float_t numpy_const_float_inf_obj = {{&mp_type_float}, (mp_float_t)INFINITY};
+#define numpy_const_float_inf MP_ROM_PTR(&numpy_const_float_inf_obj)
+#endif
+#endif /* ULAB_NUMPY_HAS_INF */
+
+#if ULAB_NUMPY_HAS_NAN // numpy_const_float_nan: the value stored under the name nan in the module table
+#if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C // 0x7fc00002 is the single-precision NaN pattern 0x7fc00000 with bit 1 already set; 0x80800000 is added as for the other constants (presumably the REPR_C float encoding -- confirm)
+#define numpy_const_float_nan MP_ROM_PTR((mp_obj_t)(0x7fc00002 + 0x80800000))
+#elif MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D // 0x7ff8000000000000 is a double-precision NaN pattern, offset by 0x8004000000000000 (presumably the REPR_D float encoding -- confirm)
+#define numpy_const_float_nan {((mp_obj_t)((uint64_t)0x7ff8000000000000 + 0x8004000000000000))}
+#else // remaining representations: a statically allocated float object holding NAN, referenced by pointer
+mp_obj_float_t numpy_const_float_nan_obj = {{&mp_type_float}, (mp_float_t)NAN};
+#define numpy_const_float_nan MP_ROM_PTR(&numpy_const_float_nan_obj)
+#endif
+#endif /* ULAB_NUMPY_HAS_NAN */
+
+#if ULAB_NUMPY_HAS_PI // ulab_const_float_pi: the value stored under the name pi in the module table
+#if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C // 0x40490fdb is the single-precision bit pattern of pi; the low two bits are replaced by 0b10 and 0x80800000 is added (presumably the REPR_C float encoding -- confirm against py/obj.h)
+#define ulab_const_float_pi MP_ROM_PTR((mp_obj_t)(((0x40490fdb & ~3) | 2) + 0x80800000))
+#elif MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D // 0x400921fb54442d18 is the double-precision bit pattern of pi, offset by 0x8004000000000000 (presumably the REPR_D float encoding -- confirm)
+#define ulab_const_float_pi {((mp_obj_t)((uint64_t)0x400921fb54442d18 + 0x8004000000000000))}
+#else // remaining representations: a statically allocated float object holding MP_PI, referenced by pointer
+mp_obj_float_t ulab_const_float_pi_obj = {{&mp_type_float}, MP_PI};
+#define ulab_const_float_pi MP_ROM_PTR(&ulab_const_float_pi_obj)
+#endif
+#endif /* ULAB_NUMPY_HAS_PI */
+
+static const mp_rom_map_elem_t ulab_numpy_globals_table[] = { // name -> object table of the numpy module; every entry uses the MP_ROM_* initializers
+    { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_numpy) },
+    { MP_ROM_QSTR(MP_QSTR_ndarray), MP_ROM_PTR(&ulab_ndarray_type) },
+    { MP_ROM_QSTR(MP_QSTR_array), MP_ROM_PTR(&ndarray_array_constructor_obj) },
+    #if ULAB_NUMPY_HAS_FROMBUFFER
+        { MP_ROM_QSTR(MP_QSTR_frombuffer), MP_ROM_PTR(&create_frombuffer_obj) },
+    #endif
+    // math constants; the macros defined earlier in this file already expand to ROM initializers
+    #if ULAB_NUMPY_HAS_E
+        { MP_ROM_QSTR(MP_QSTR_e), ulab_const_float_e },
+    #endif
+    #if ULAB_NUMPY_HAS_INF
+        { MP_ROM_QSTR(MP_QSTR_inf), numpy_const_float_inf },
+    #endif
+    #if ULAB_NUMPY_HAS_NAN
+        { MP_ROM_QSTR(MP_QSTR_nan), numpy_const_float_nan },
+    #endif
+    #if ULAB_NUMPY_HAS_PI
+        { MP_ROM_QSTR(MP_QSTR_pi), ulab_const_float_pi },
+    #endif
+    // class constants, always included
+    { MP_ROM_QSTR(MP_QSTR_bool), MP_ROM_INT(NDARRAY_BOOL) },
+    { MP_ROM_QSTR(MP_QSTR_uint8), MP_ROM_INT(NDARRAY_UINT8) },
+    { MP_ROM_QSTR(MP_QSTR_int8), MP_ROM_INT(NDARRAY_INT8) },
+    { MP_ROM_QSTR(MP_QSTR_uint16), MP_ROM_INT(NDARRAY_UINT16) },
+    { MP_ROM_QSTR(MP_QSTR_int16), MP_ROM_INT(NDARRAY_INT16) },
+    { MP_ROM_QSTR(MP_QSTR_float), MP_ROM_INT(NDARRAY_FLOAT) },
+    #if ULAB_SUPPORTS_COMPLEX
+        { MP_ROM_QSTR(MP_QSTR_complex), MP_ROM_INT(NDARRAY_COMPLEX) },
+    #endif
+    // modules of numpy
+    #if ULAB_NUMPY_HAS_FFT_MODULE
+        { MP_ROM_QSTR(MP_QSTR_fft), MP_ROM_PTR(&ulab_fft_module) },
+    #endif
+    #if ULAB_NUMPY_HAS_LINALG_MODULE
+        { MP_ROM_QSTR(MP_QSTR_linalg), MP_ROM_PTR(&ulab_linalg_module) },
+    #endif
+    #if ULAB_HAS_PRINTOPTIONS
+        { MP_ROM_QSTR(MP_QSTR_set_printoptions), MP_ROM_PTR(&ndarray_set_printoptions_obj) },
+        { MP_ROM_QSTR(MP_QSTR_get_printoptions), MP_ROM_PTR(&ndarray_get_printoptions_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_NDINFO
+        { MP_ROM_QSTR(MP_QSTR_ndinfo), MP_ROM_PTR(&ndarray_info_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ARANGE
+        { MP_ROM_QSTR(MP_QSTR_arange), MP_ROM_PTR(&create_arange_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_COMPRESS
+        { MP_ROM_QSTR(MP_QSTR_compress), MP_ROM_PTR(&transform_compress_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_CONCATENATE
+        { MP_ROM_QSTR(MP_QSTR_concatenate), MP_ROM_PTR(&create_concatenate_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_DIAG
+        #if ULAB_MAX_DIMS > 1
+            { MP_ROM_QSTR(MP_QSTR_diag), MP_ROM_PTR(&create_diag_obj) },
+        #endif
+    #endif
+    #if ULAB_NUMPY_HAS_EMPTY
+        { MP_ROM_QSTR(MP_QSTR_empty), MP_ROM_PTR(&create_zeros_obj) }, // empty is bound to the same function object as zeros
+    #endif
+    #if ULAB_MAX_DIMS > 1
+        #if ULAB_NUMPY_HAS_EYE
+            { MP_ROM_QSTR(MP_QSTR_eye), MP_ROM_PTR(&create_eye_obj) },
+        #endif
+    #endif /* ULAB_MAX_DIMS */
+    // functions of the approx sub-module
+    #if ULAB_NUMPY_HAS_INTERP
+        { MP_ROM_QSTR(MP_QSTR_interp), MP_ROM_PTR(&approx_interp_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_TRAPZ
+        { MP_ROM_QSTR(MP_QSTR_trapz), MP_ROM_PTR(&approx_trapz_obj) },
+    #endif
+    // functions of the create sub-module
+    #if ULAB_NUMPY_HAS_FULL
+        { MP_ROM_QSTR(MP_QSTR_full), MP_ROM_PTR(&create_full_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_LINSPACE
+        { MP_ROM_QSTR(MP_QSTR_linspace), MP_ROM_PTR(&create_linspace_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_LOGSPACE
+        { MP_ROM_QSTR(MP_QSTR_logspace), MP_ROM_PTR(&create_logspace_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ONES
+        { MP_ROM_QSTR(MP_QSTR_ones), MP_ROM_PTR(&create_ones_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ZEROS
+        { MP_ROM_QSTR(MP_QSTR_zeros), MP_ROM_PTR(&create_zeros_obj) },
+    #endif
+    // functions of the compare sub-module
+    #if ULAB_NUMPY_HAS_CLIP
+        { MP_ROM_QSTR(MP_QSTR_clip), MP_ROM_PTR(&compare_clip_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_EQUAL
+        { MP_ROM_QSTR(MP_QSTR_equal), MP_ROM_PTR(&compare_equal_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_NOTEQUAL
+        { MP_ROM_QSTR(MP_QSTR_not_equal), MP_ROM_PTR(&compare_not_equal_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ISFINITE
+        { MP_ROM_QSTR(MP_QSTR_isfinite), MP_ROM_PTR(&compare_isfinite_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ISINF
+        { MP_ROM_QSTR(MP_QSTR_isinf), MP_ROM_PTR(&compare_isinf_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_MAXIMUM
+        { MP_ROM_QSTR(MP_QSTR_maximum), MP_ROM_PTR(&compare_maximum_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_MINIMUM
+        { MP_ROM_QSTR(MP_QSTR_minimum), MP_ROM_PTR(&compare_minimum_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_WHERE
+        { MP_ROM_QSTR(MP_QSTR_where), MP_ROM_PTR(&compare_where_obj) },
+    #endif
+    // functions of the filter sub-module
+    #if ULAB_NUMPY_HAS_CONVOLVE
+        { MP_ROM_QSTR(MP_QSTR_convolve), MP_ROM_PTR(&filter_convolve_obj) },
+    #endif
+    // functions of the numerical sub-module (dot and trace below come from transform and stats)
+    #if ULAB_NUMPY_HAS_ALL
+        { MP_ROM_QSTR(MP_QSTR_all), MP_ROM_PTR(&numerical_all_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ANY
+        { MP_ROM_QSTR(MP_QSTR_any), MP_ROM_PTR(&numerical_any_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ARGMINMAX
+        { MP_ROM_QSTR(MP_QSTR_argmax), MP_ROM_PTR(&numerical_argmax_obj) },
+        { MP_ROM_QSTR(MP_QSTR_argmin), MP_ROM_PTR(&numerical_argmin_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ARGSORT
+        { MP_ROM_QSTR(MP_QSTR_argsort), MP_ROM_PTR(&numerical_argsort_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_CROSS
+        { MP_ROM_QSTR(MP_QSTR_cross), MP_ROM_PTR(&numerical_cross_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_DIFF
+        { MP_ROM_QSTR(MP_QSTR_diff), MP_ROM_PTR(&numerical_diff_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_DOT
+        #if ULAB_MAX_DIMS > 1
+            { MP_ROM_QSTR(MP_QSTR_dot), MP_ROM_PTR(&transform_dot_obj) },
+        #endif
+    #endif
+    #if ULAB_NUMPY_HAS_TRACE
+        #if ULAB_MAX_DIMS > 1
+            { MP_ROM_QSTR(MP_QSTR_trace), MP_ROM_PTR(&stats_trace_obj) },
+        #endif
+    #endif
+    #if ULAB_NUMPY_HAS_FLIP
+        { MP_ROM_QSTR(MP_QSTR_flip), MP_ROM_PTR(&numerical_flip_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_MINMAX
+        { MP_ROM_QSTR(MP_QSTR_max), MP_ROM_PTR(&numerical_max_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_MEAN
+        { MP_ROM_QSTR(MP_QSTR_mean), MP_ROM_PTR(&numerical_mean_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_MEDIAN
+        { MP_ROM_QSTR(MP_QSTR_median), MP_ROM_PTR(&numerical_median_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_MINMAX
+        { MP_ROM_QSTR(MP_QSTR_min), MP_ROM_PTR(&numerical_min_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ROLL
+        { MP_ROM_QSTR(MP_QSTR_roll), MP_ROM_PTR(&numerical_roll_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_SORT
+        { MP_ROM_QSTR(MP_QSTR_sort), MP_ROM_PTR(&numerical_sort_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_STD
+        { MP_ROM_QSTR(MP_QSTR_std), MP_ROM_PTR(&numerical_std_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_SUM
+        { MP_ROM_QSTR(MP_QSTR_sum), MP_ROM_PTR(&numerical_sum_obj) },
+    #endif
+    // functions of the poly sub-module
+    #if ULAB_NUMPY_HAS_POLYFIT
+        { MP_ROM_QSTR(MP_QSTR_polyfit), MP_ROM_PTR(&poly_polyfit_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_POLYVAL
+        { MP_ROM_QSTR(MP_QSTR_polyval), MP_ROM_PTR(&poly_polyval_obj) },
+    #endif
+    // functions of the vector sub-module
+    #if ULAB_NUMPY_HAS_ACOS
+        { MP_ROM_QSTR(MP_QSTR_acos), MP_ROM_PTR(&vector_acos_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ACOSH
+        { MP_ROM_QSTR(MP_QSTR_acosh), MP_ROM_PTR(&vector_acosh_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ARCTAN2
+        { MP_ROM_QSTR(MP_QSTR_arctan2), MP_ROM_PTR(&vector_arctan2_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_AROUND
+        { MP_ROM_QSTR(MP_QSTR_around), MP_ROM_PTR(&vector_around_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ASIN
+        { MP_ROM_QSTR(MP_QSTR_asin), MP_ROM_PTR(&vector_asin_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ASINH
+        { MP_ROM_QSTR(MP_QSTR_asinh), MP_ROM_PTR(&vector_asinh_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ATAN
+        { MP_ROM_QSTR(MP_QSTR_atan), MP_ROM_PTR(&vector_atan_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_ATANH
+        { MP_ROM_QSTR(MP_QSTR_atanh), MP_ROM_PTR(&vector_atanh_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_CEIL
+        { MP_ROM_QSTR(MP_QSTR_ceil), MP_ROM_PTR(&vector_ceil_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_COS
+        { MP_ROM_QSTR(MP_QSTR_cos), MP_ROM_PTR(&vector_cos_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_COSH
+        { MP_ROM_QSTR(MP_QSTR_cosh), MP_ROM_PTR(&vector_cosh_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_DEGREES
+        { MP_ROM_QSTR(MP_QSTR_degrees), MP_ROM_PTR(&vector_degrees_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_EXP
+        { MP_ROM_QSTR(MP_QSTR_exp), MP_ROM_PTR(&vector_exp_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_EXPM1
+        { MP_ROM_QSTR(MP_QSTR_expm1), MP_ROM_PTR(&vector_expm1_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_FLOOR
+        { MP_ROM_QSTR(MP_QSTR_floor), MP_ROM_PTR(&vector_floor_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_LOG
+        { MP_ROM_QSTR(MP_QSTR_log), MP_ROM_PTR(&vector_log_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_LOG10
+        { MP_ROM_QSTR(MP_QSTR_log10), MP_ROM_PTR(&vector_log10_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_LOG2
+        { MP_ROM_QSTR(MP_QSTR_log2), MP_ROM_PTR(&vector_log2_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_RADIANS
+        { MP_ROM_QSTR(MP_QSTR_radians), MP_ROM_PTR(&vector_radians_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_SIN
+        { MP_ROM_QSTR(MP_QSTR_sin), MP_ROM_PTR(&vector_sin_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_SINH
+        { MP_ROM_QSTR(MP_QSTR_sinh), MP_ROM_PTR(&vector_sinh_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_SQRT
+        { MP_ROM_QSTR(MP_QSTR_sqrt), MP_ROM_PTR(&vector_sqrt_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_TAN
+        { MP_ROM_QSTR(MP_QSTR_tan), MP_ROM_PTR(&vector_tan_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_TANH
+        { MP_ROM_QSTR(MP_QSTR_tanh), MP_ROM_PTR(&vector_tanh_obj) },
+    #endif
+    #if ULAB_NUMPY_HAS_VECTORIZE
+        { MP_ROM_QSTR(MP_QSTR_vectorize), MP_ROM_PTR(&vector_vectorize_obj) },
+    #endif
+    #if ULAB_SUPPORTS_COMPLEX
+        #if ULAB_NUMPY_HAS_REAL
+            { MP_ROM_QSTR(MP_QSTR_real), MP_ROM_PTR(&carray_real_obj) },
+        #endif
+        #if ULAB_NUMPY_HAS_IMAG
+            { MP_ROM_QSTR(MP_QSTR_imag), MP_ROM_PTR(&carray_imag_obj) },
+        #endif
+        #if ULAB_NUMPY_HAS_CONJUGATE
+            { MP_ROM_QSTR(MP_QSTR_conjugate), MP_ROM_PTR(&carray_conjugate_obj) },
+        #endif
+        #if ULAB_NUMPY_HAS_SORT_COMPLEX
+            { MP_ROM_QSTR(MP_QSTR_sort_complex), MP_ROM_PTR(&carray_sort_complex_obj) },
+        #endif
+    #endif
+};
+
+static MP_DEFINE_CONST_DICT(mp_module_ulab_numpy_globals, ulab_numpy_globals_table); // file-local dict built from ulab_numpy_globals_table
+// The module object itself; numpy.h declares it extern so other translation units can reference it.
+const mp_obj_module_t ulab_numpy_module = {
+    .base = { &mp_type_module },
+    .globals = (mp_obj_dict_t*)&mp_module_ulab_numpy_globals, // cast to the non-const pointer type of the field; the dict is presumably const -- confirm against MP_DEFINE_CONST_DICT
+};
+// Registration under the qstr ulab_dot_numpy, conditional on MODULE_ULAB_ENABLED && CIRCUITPY_ULAB. NOTE(review): this line is presumably scraped textually by the build tooling, so keep its form unchanged.
+MP_REGISTER_MODULE(MP_QSTR_ulab_dot_numpy, ulab_numpy_module, MODULE_ULAB_ENABLED && CIRCUITPY_ULAB);
diff --git a/circuitpython/extmod/ulab/code/numpy/numpy.h b/circuitpython/extmod/ulab/code/numpy/numpy.h
new file mode 100644
index 0000000..f1348f3
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/numpy.h
@@ -0,0 +1,21 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020-2021 Zoltán Vörös
+ *               
+*/
+
+#ifndef _NUMPY_ // include guard. NOTE(review): identifiers starting with an underscore and an uppercase letter are reserved in C; rename only together with the project's other guards
+#define _NUMPY_
+// Headers needed for the declaration below.
+#include "../ulab.h"
+#include "../ndarray.h"
+// The numpy module object, defined in numpy.c.
+extern const mp_obj_module_t ulab_numpy_module;
+
+#endif /* _NUMPY_ */
diff --git a/circuitpython/extmod/ulab/code/numpy/poly.c b/circuitpython/extmod/ulab/code/numpy/poly.c
new file mode 100644
index 0000000..97ee5c7
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/poly.c
@@ -0,0 +1,250 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+ *               2020 Jeff Epler for Adafruit Industries
+ *               2020 Scott Shawcroft for Adafruit Industries
+ *               2020 Taku Fukada
+*/
+
+#include "py/obj.h"
+#include "py/runtime.h"
+#include "py/objarray.h"
+
+#include "../ulab.h"
+#include "linalg/linalg_tools.h"
+#include "../ulab_tools.h"
+#include "carray/carray_tools.h"
+#include "poly.h"
+
+#if ULAB_NUMPY_HAS_POLYFIT
+// polyfit(x, y, deg) or polyfit(y, deg): least-squares fit of a polynomial of degree deg, computed from the normal equations. Returns deg+1 float coefficients, leading coefficient first. If x is omitted, x = 0, 1, 2, ... is assumed.
+mp_obj_t poly_polyfit(size_t n_args, const mp_obj_t *args) {
+    if(!ndarray_object_is_array_like(args[0])) {
+        mp_raise_ValueError(translate("input data must be an iterable"));
+    }
+    #if ULAB_SUPPORTS_COMPLEX
+    if(mp_obj_is_type(args[0], &ulab_ndarray_type)) {
+        ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0]);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+    }
+    #endif
+    size_t lenx = 0, leny = 0;
+    size_t deg = 0; // size_t: the degree is not truncated, and deg+1 cannot wrap around in the loop conditions below
+    mp_float_t *x, *XT, *y, *prod;
+    // Raises ValueError, if an input is not array-like, if x and y differ in length, if there are fewer than deg+1 data points, or if X^T * X is singular.
+    if(n_args == 2) { // only the y values are supplied
+        // TODO: this is actually not enough: the first argument can very well be a matrix,
+        // in which case we are between the rock and a hard place
+        leny = (size_t)mp_obj_get_int(mp_obj_len_maybe(args[0]));
+        deg = (size_t)mp_obj_get_int(args[1]); // a negative degree wraps to a huge value, and is rejected by the check below
+        if(leny <= deg) { // deg+1 coefficients need at least deg+1 data points; x must also be able to hold deg+1 values later
+            mp_raise_ValueError(translate("more degrees of freedom than data points"));
+        }
+        lenx = leny;
+        x = m_new(mp_float_t, lenx); // assume uniformly spaced data points
+        for(size_t i=0; i < lenx; i++) {
+            x[i] = i;
+        }
+        y = m_new(mp_float_t, leny);
+        fill_array_iterable(y, args[0]);
+    } else /* n_args == 3 */ {
+        if(!ndarray_object_is_array_like(args[1])) {
+            mp_raise_ValueError(translate("input data must be an iterable"));
+        }
+        lenx = (size_t)mp_obj_get_int(mp_obj_len_maybe(args[0]));
+        leny = (size_t)mp_obj_get_int(mp_obj_len_maybe(args[1]));
+        if(lenx != leny) {
+            mp_raise_ValueError(translate("input vectors must be of equal length"));
+        }
+        deg = (size_t)mp_obj_get_int(args[2]); // negative degrees are rejected by the check below, as in the two-argument case
+        if(leny <= deg) { // deg+1 coefficients need at least deg+1 data points
+            mp_raise_ValueError(translate("more degrees of freedom than data points"));
+        }
+        x = m_new(mp_float_t, lenx);
+        fill_array_iterable(x, args[0]);
+        y = m_new(mp_float_t, leny);
+        fill_array_iterable(y, args[1]);
+    }
+
+    // one could probably express X as a function of XT,
+    // and thereby save RAM, because X is used only in the product
+    XT = m_new(mp_float_t, (deg+1)*leny); // XT is a matrix of shape (deg+1, len) (rows, columns)
+    for(size_t i=0; i < leny; i++) { // column index
+        XT[i+0*lenx] = 1.0; // top row
+        for(size_t j=1; j < deg+1; j++) { // row index; row j holds x^j
+            XT[i+j*leny] = XT[i+(j-1)*leny]*x[i];
+        }
+    }
+
+    prod = m_new(mp_float_t, (deg+1)*(deg+1)); // the product matrix is of shape (deg+1, deg+1)
+    mp_float_t sum;
+    for(size_t i=0; i < deg+1; i++) { // column index
+        for(size_t j=0; j < deg+1; j++) { // row index
+            sum = 0.0;
+            for(size_t k=0; k < lenx; k++) {
+                // (j, k) * (k, i)
+                // Note that the second matrix is simply the transpose of the first:
+                // X(k, i) = XT(i, k) = XT[i*lenx+k]
+                sum += XT[j*lenx+k]*XT[i*lenx+k]; // X[k*(deg+1)+i];
+            }
+            prod[j*(deg+1)+i] = sum;
+        }
+    }
+    if(!linalg_invert_matrix(prod, deg+1)) {
+        // Although X was a Vandermonde matrix, whose inverse is guaranteed to exist,
+        // we bail out here, if prod couldn't be inverted: if the values in x are not all
+        // distinct, prod is singular
+        m_del(mp_float_t, XT, (deg+1)*lenx);
+        m_del(mp_float_t, x, lenx);
+        m_del(mp_float_t, y, leny);
+        m_del(mp_float_t, prod, (deg+1)*(deg+1));
+        mp_raise_ValueError(translate("could not invert Vandermonde matrix"));
+    }
+    // at this point, we have the inverse of X^T * X
+    // y is a column vector; x is free now, we can use it for storing intermediate values (lenx > deg, so deg+1 values fit)
+    for(size_t i=0; i < deg+1; i++) { // row index
+        sum = 0.0;
+        for(size_t j=0; j < lenx; j++) { // column index
+            sum += XT[i*lenx+j]*y[j];
+        }
+        x[i] = sum;
+    }
+    // XT is no longer needed
+    m_del(mp_float_t, XT, (deg+1)*leny);
+
+    ndarray_obj_t *beta = ndarray_new_linear_array(deg+1, NDARRAY_FLOAT);
+    mp_float_t *betav = (mp_float_t *)beta->array;
+    // x[0..(deg+1)] contains now the product X^T * y; we can get rid of y
+    m_del(mp_float_t, y, leny); // the element type must match the allocation, otherwise the wrong size is released
+
+    // now, we calculate beta, i.e., we apply prod = (X^T * X)^(-1) on x = X^T * y; x is a column vector now
+    for(size_t i=0; i < deg+1; i++) {
+        sum = 0.0;
+        for(size_t j=0; j < deg+1; j++) {
+            sum += prod[i*(deg+1)+j]*x[j];
+        }
+        betav[i] = sum;
+    }
+    m_del(mp_float_t, x, lenx);
+    m_del(mp_float_t, prod, (deg+1)*(deg+1));
+    for(size_t i=0; i < (deg+1)/2; i++) {
+        // We have to reverse the array, for the leading coefficient comes first.
+        SWAP(mp_float_t, betav[i], betav[deg-i]);
+    }
+    return MP_OBJ_FROM_PTR(beta);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(poly_polyfit_obj, 2, 3, poly_polyfit);
+#endif
+
+#if ULAB_NUMPY_HAS_POLYVAL
+// polyval(p, x): evaluates the polynomial with coefficients p (leading coefficient first) at each x by Horner's scheme; always returns a float ndarray. Raises TypeError, if either argument is not array-like.
+mp_obj_t poly_polyval(mp_obj_t o_p, mp_obj_t o_x) {
+    if(!ndarray_object_is_array_like(o_p) || !ndarray_object_is_array_like(o_x)) {
+        mp_raise_TypeError(translate("inputs are not iterable"));
+    }
+    #if ULAB_SUPPORTS_COMPLEX
+    ndarray_obj_t *input;
+    if(mp_obj_is_type(o_p, &ulab_ndarray_type)) {
+        input = MP_OBJ_TO_PTR(o_p);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(input->dtype)
+    }
+    if(mp_obj_is_type(o_x, &ulab_ndarray_type)) {
+        input = MP_OBJ_TO_PTR(o_x);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(input->dtype)
+    }
+    #endif
+    // p had better be a one-dimensional standard iterable
+    size_t plen = (size_t)mp_obj_get_int(mp_obj_len_maybe(o_p)); // size_t: an 8-bit length was truncated for 256 or more coefficients, and the buffer below overflowed
+    mp_float_t *p = m_new(mp_float_t, plen);
+    mp_obj_iter_buf_t p_buf;
+    mp_obj_t p_item, p_iterable = mp_getiter(o_p, &p_buf);
+    size_t n = 0; // running index into p
+    while((p_item = mp_iternext(p_iterable)) != MP_OBJ_STOP_ITERATION) {
+        p[n] = mp_obj_get_float(p_item);
+        n++;
+    }
+
+    // the result is always an ndarray of type float, irrespective of the types
+    // of the coefficients and the independent variable
+    ndarray_obj_t *ndarray;
+    if(mp_obj_is_type(o_x, &ulab_ndarray_type)) {
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(o_x);
+        uint8_t *sarray = (uint8_t *)source->array;
+        ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT);
+        mp_float_t *array = (mp_float_t *)ndarray->array;
+
+        mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype);
+
+        // TODO: these loops are really nothing, but the re-implementation of
+        // ITERATE_VECTOR from vectorise.c. We could pass a function pointer here
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        mp_float_t y = (plen > 0) ? p[0] : MICROPY_FLOAT_CONST(0.0); // an empty polynomial evaluates to 0, and p[0] is not read
+                        mp_float_t _x = func(sarray);
+                        for(size_t m=1; m < plen; m++) { // Horner's scheme; m starts at 1, so plen == 0 cannot underflow
+                            y *= _x;
+                            y += p[m];
+                        }
+                        *array++ = y;
+                        sarray += source->strides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                    sarray += source->strides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+                #endif
+            #if ULAB_MAX_DIMS > 2
+                sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                sarray += source->strides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+            #endif
+        #if ULAB_MAX_DIMS > 3
+            sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+            sarray += source->strides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+        #endif
+    } else {
+        // o_x had better be a one-dimensional standard iterable
+        ndarray = ndarray_new_linear_array(mp_obj_get_int(mp_obj_len_maybe(o_x)), NDARRAY_FLOAT);
+        mp_float_t *array = (mp_float_t *)ndarray->array;
+        mp_obj_iter_buf_t x_buf;
+        mp_obj_t x_item, x_iterable = mp_getiter(o_x, &x_buf);
+        while ((x_item = mp_iternext(x_iterable)) != MP_OBJ_STOP_ITERATION) {
+            mp_float_t _x = mp_obj_get_float(x_item);
+            mp_float_t y = (plen > 0) ? p[0] : MICROPY_FLOAT_CONST(0.0); // an empty polynomial evaluates to 0
+            for(size_t j=1; j < plen; j++) { // Horner's scheme, as in the ndarray branch
+                y *= _x;
+                y += p[j];
+            }
+            *array++ = y;
+        }
+    }
+    m_del(mp_float_t, p, plen);
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(poly_polyval_obj, poly_polyval);
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/poly.h b/circuitpython/extmod/ulab/code/numpy/poly.h
new file mode 100644
index 0000000..59cb9f5
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/poly.h
@@ -0,0 +1,21 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+*/
+
+#ifndef _POLY_
+#define _POLY_
+
+#include "../ulab.h"
+#include "../ndarray.h"
+// Function objects defined in poly.c: polyfit accepts 2 or 3 arguments, polyval exactly 2.
+MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(poly_polyfit_obj);
+MP_DECLARE_CONST_FUN_OBJ_2(poly_polyval_obj);
+
+#endif /* _POLY_ */
diff --git a/circuitpython/extmod/ulab/code/numpy/stats.c b/circuitpython/extmod/ulab/code/numpy/stats.c
new file mode 100644
index 0000000..2d34889
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/stats.c
@@ -0,0 +1,54 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+ *               2020 Scott Shawcroft for Adafruit Industries
+ *               2020 Roberto Colistete Jr.
+ *               2020 Taku Fukada
+ *
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+#include "py/misc.h"
+
+#include "../ulab.h"
+#include "../ulab_tools.h"
+#include "carray/carray_tools.h"
+#include "stats.h"
+
+#if ULAB_MAX_DIMS > 1
+#if ULAB_NUMPY_HAS_TRACE
+
+//| def trace(m: ulab.numpy.ndarray) -> _float:
+//|     """
+//|     :param m: a square matrix
+//|
+//|     Compute the trace of the matrix, the sum of its diagonal elements."""
+//|     ...
+//|
+
+static mp_obj_t stats_trace(mp_obj_t oin) {
+    ndarray_obj_t *matrix = tools_object_is_square(oin);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(matrix->dtype)
+    // one step along the diagonal advances by one row stride plus one column stride (both in bytes)
+    const int32_t diagonal_step = matrix->strides[ULAB_MAX_DIMS - 1] + matrix->strides[ULAB_MAX_DIMS - 2];
+    mp_float_t total = 0.0;
+    for(size_t n=0; n < matrix->shape[ULAB_MAX_DIMS - 1]; n++) {
+        int32_t offset = n * diagonal_step;
+        total += ndarray_get_float_index(matrix->array, matrix->dtype, offset/matrix->itemsize);
+    }
+    // float arrays produce a float, all other dtypes an integer
+    return (matrix->dtype == NDARRAY_FLOAT) ? mp_obj_new_float(total) : mp_obj_new_int_from_float(total);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(stats_trace_obj, stats_trace);
+#endif
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/stats.h b/circuitpython/extmod/ulab/code/numpy/stats.h
new file mode 100644
index 0000000..62bba9f
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/stats.h
@@ -0,0 +1,20 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+*/
+
+#ifndef _STATS_
+#define _STATS_
+
+#include "../ulab.h"
+#include "../ndarray.h"
+// Function object of trace(); stats.c defines it only if ULAB_MAX_DIMS > 1 and ULAB_NUMPY_HAS_TRACE are set.
+MP_DECLARE_CONST_FUN_OBJ_1(stats_trace_obj);
+
+#endif /* _STATS_ */
diff --git a/circuitpython/extmod/ulab/code/numpy/transform.c b/circuitpython/extmod/ulab/code/numpy/transform.c
new file mode 100644
index 0000000..f0e3e70
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/transform.c
@@ -0,0 +1,224 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+ *
+*/
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+#include "py/misc.h"
+
+#include "../ulab.h"
+#include "../ulab_tools.h"
+#include "carray/carray_tools.h"
+#include "transform.h"
+
+#if ULAB_NUMPY_HAS_COMPRESS
+// compress(condition, a, *, axis=None): returns a new ndarray holding those items of a, for which the matching entry
+// of condition is true. With axis=None, a is read as if it were flat, otherwise whole slices are selected along axis.
+static mp_obj_t transform_compress(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    mp_obj_t condition = args[0].u_obj;
+    if(!mp_obj_is_type(args[1].u_obj, &ulab_ndarray_type)) { // the pointer cast below is valid for ndarrays only
+        mp_raise_TypeError(translate("wrong input type"));
+    }
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[1].u_obj);
+    uint8_t *array = (uint8_t *)ndarray->array;
+    mp_obj_t axis = args[2].u_obj;
+    size_t len = MP_OBJ_SMALL_INT_VALUE(mp_obj_len_maybe(condition));
+    int8_t ax, shift_ax = 0; // shift_ax: position of axis in the ULAB_MAX_DIMS long shape and strides arrays
+    if(axis != mp_const_none) {
+        ax = tools_get_axis(axis, ndarray->ndim);
+        shift_ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
+    }
+
+    if(((axis == mp_const_none) && (len != ndarray->len)) ||
+        ((axis != mp_const_none) && (len != ndarray->shape[shift_ax]))) {
+        mp_raise_ValueError(translate("wrong length of condition array"));
+    }
+
+    // first pass over condition: count the true entries, so that the result can be sized
+    size_t true_count = 0;
+    mp_obj_iter_buf_t iter_buf;
+    mp_obj_t item, iterable = mp_getiter(condition, &iter_buf);
+    while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
+        if(mp_obj_is_true(item)) {
+            true_count++;
+        }
+    }
+    iterable = mp_getiter(condition, &iter_buf); // restart the iteration for the copying pass
+
+    ndarray_obj_t *result = NULL;
+    uint8_t *rarray = NULL;
+
+    // working copies of shapes and strides: the selected axis is swapped into the last position below
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    memcpy(shape, ndarray->shape, ULAB_MAX_DIMS * sizeof(size_t));
+    size_t *rshape = m_new(size_t, ULAB_MAX_DIMS);
+    memcpy(rshape, ndarray->shape, ULAB_MAX_DIMS * sizeof(size_t));
+    int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
+    memcpy(strides, ndarray->strides, ULAB_MAX_DIMS * sizeof(int32_t));
+    int32_t *rstrides = m_new(int32_t, ULAB_MAX_DIMS);
+
+    if(axis == mp_const_none) {
+        result = ndarray_new_linear_array(true_count, ndarray->dtype);
+        rarray = (uint8_t *)result->array;
+        memset(rstrides, 0, ULAB_MAX_DIMS * sizeof(int32_t)); // every entry is read in the loops below, so all of them must be cleared
+        rstrides[ULAB_MAX_DIMS - 1] = ndarray->itemsize;
+        rshape[ULAB_MAX_DIMS - 1] = 0; // with zero outer strides and zero rewind length, rarray simply advances linearly
+    } else {
+        rshape[shift_ax] = true_count;
+        // the result has the shape of the source, except along axis
+        result = ndarray_new_dense_ndarray(ndarray->ndim, rshape, ndarray->dtype);
+        rarray = (uint8_t *)result->array;
+        // make the selected axis the innermost loop of both the source and the result
+        SWAP(size_t, shape[shift_ax], shape[ULAB_MAX_DIMS - 1]);
+        SWAP(size_t, rshape[shift_ax], rshape[ULAB_MAX_DIMS - 1]);
+        SWAP(int32_t, strides[shift_ax], strides[ULAB_MAX_DIMS - 1]);
+
+        memcpy(rstrides, result->strides, ULAB_MAX_DIMS * sizeof(int32_t));
+        SWAP(int32_t, rstrides[shift_ax], rstrides[ULAB_MAX_DIMS - 1]);
+    }
+
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                if(axis != mp_const_none) { // condition is re-read for each run along the selected axis
+                    iterable = mp_getiter(condition, &iter_buf);
+                }
+                do {
+                    item = mp_iternext(iterable);
+                    if(mp_obj_is_true(item)) {
+                        memcpy(rarray, array, ndarray->itemsize);
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                    }
+                    array += strides[ULAB_MAX_DIMS - 1];
+                    l++;
+                } while(l < shape[ULAB_MAX_DIMS - 1]);
+            #if ULAB_MAX_DIMS > 1
+                array -= strides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS - 1];
+                array += strides[ULAB_MAX_DIMS - 2];
+                rarray -= rstrides[ULAB_MAX_DIMS - 1] * rshape[ULAB_MAX_DIMS - 1];
+                rarray += rstrides[ULAB_MAX_DIMS - 2];
+                k++;
+            } while(k < shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 2
+            array -= strides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS - 2];
+            array += strides[ULAB_MAX_DIMS - 3];
+            rarray -= rstrides[ULAB_MAX_DIMS - 2] * rshape[ULAB_MAX_DIMS - 2];
+            rarray += rstrides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j < shape[ULAB_MAX_DIMS - 3]);
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        array -= strides[ULAB_MAX_DIMS - 3] * shape[ULAB_MAX_DIMS - 3];
+        array += strides[ULAB_MAX_DIMS - 4];
+        rarray -= rstrides[ULAB_MAX_DIMS - 3] * rshape[ULAB_MAX_DIMS - 3]; // rewind the third-to-last axis, mirroring the source pointer
+        rarray += rstrides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i < shape[ULAB_MAX_DIMS - 4]);
+    #endif
+
+    return MP_OBJ_FROM_PTR(result);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(transform_compress_obj, 2, transform_compress);
+#endif /* ULAB_NUMPY_HAS_COMPRESS */
+
+#if ULAB_MAX_DIMS > 1
+#if ULAB_NUMPY_HAS_DOT
+//| def dot(m1: ulab.numpy.ndarray, m2: ulab.numpy.ndarray) -> Union[ulab.numpy.ndarray, _float]:
+//|    """
+//|    :param ~ulab.numpy.ndarray m1: a matrix, or a vector
+//|    :param ~ulab.numpy.ndarray m2: a matrix, or a vector
+//|
+//|    Computes the product of two matrices, or two vectors. In the latter case, the inner product is returned."""
+//|    ...
+//|
+// Returns a float ndarray, or a float scalar for two vectors. Raises TypeError for non-ndarray arguments, and ValueError, if the inner dimensions differ.
+mp_obj_t transform_dot(mp_obj_t _m1, mp_obj_t _m2) {
+    // TODO: should the results be upcast?
+    // This implements 2D operations only!
+    if(!mp_obj_is_type(_m1, &ulab_ndarray_type) || !mp_obj_is_type(_m2, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("arguments must be ndarrays"));
+    }
+    ndarray_obj_t *m1 = MP_OBJ_TO_PTR(_m1);
+    ndarray_obj_t *m2 = MP_OBJ_TO_PTR(_m2);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(m1->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(m2->dtype)
+
+    uint8_t *array1 = (uint8_t *)m1->array;
+    uint8_t *array2 = (uint8_t *)m2->array;
+
+    mp_float_t (*func1)(void *) = ndarray_get_float_function(m1->dtype);
+    mp_float_t (*func2)(void *) = ndarray_get_float_function(m2->dtype);
+
+    if(m1->shape[ULAB_MAX_DIMS - 1] != m2->shape[ULAB_MAX_DIMS - m2->ndim]) { // last axis of m1 against first axis of m2
+        mp_raise_ValueError(translate("dimensions do not match"));
+    }
+    uint8_t ndim = MIN(m1->ndim, m2->ndim);
+    size_t shape1 = m1->ndim == 2 ? m1->shape[ULAB_MAX_DIMS - m1->ndim] : 1; // number of rows of m1
+    size_t shape2 = m2->ndim == 2 ? m2->shape[ULAB_MAX_DIMS - 1] : 1; // number of columns of m2
+
+    size_t *shape = NULL;
+    if(ndim == 2) { // matrix times matrix -> matrix
+        shape = ndarray_shape_vector(0, 0, shape1, shape2);
+    } else { // matrix times vector -> shape1 entries, vector times matrix -> shape2 entries, vector times vector -> 1 entry
+        shape = ndarray_shape_vector(0, 0, 0, shape1 * shape2); // at least one factor is 1 here; the loops below write shape1 * shape2 values
+    }
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
+    mp_float_t *rarray = (mp_float_t *)results->array;
+
+    for(size_t i=0; i < shape1; i++) { // rows of m1
+        for(size_t j=0; j < shape2; j++) { // columns of m2
+            mp_float_t dot = 0.0;
+            for(size_t k=0; k < m1->shape[ULAB_MAX_DIMS - 1]; k++) {
+                // (i, k) * (k, j)
+                dot += func1(array1) * func2(array2);
+                array1 += m1->strides[ULAB_MAX_DIMS - 1];
+                array2 += m2->strides[ULAB_MAX_DIMS - m2->ndim];
+            }
+            *rarray++ = dot;
+            array1 -= m1->strides[ULAB_MAX_DIMS - 1] * m1->shape[ULAB_MAX_DIMS - 1]; // back to the beginning of row i
+            array2 -= m2->strides[ULAB_MAX_DIMS - m2->ndim] * m2->shape[ULAB_MAX_DIMS - m2->ndim]; // back to the top of column j
+            array2 += m2->strides[ULAB_MAX_DIMS - 1]; // on to the next column
+        }
+        array1 += m1->strides[ULAB_MAX_DIMS - m1->ndim]; // on to the next row
+        array2 = m2->array;
+    }
+    if((m1->ndim * m2->ndim) == 1) { // return a scalar, if product of two vectors
+        return mp_obj_new_float(*(--rarray));
+    } else {
+        return MP_OBJ_FROM_PTR(results);
+    }
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(transform_dot_obj, transform_dot);
+#endif
+#endif
+\ No newline at end of file
diff --git a/circuitpython/extmod/ulab/code/numpy/transform.h b/circuitpython/extmod/ulab/code/numpy/transform.h
new file mode 100644
index 0000000..039dcea
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/transform.h
@@ -0,0 +1,29 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+ *
+*/
+
+#ifndef _TRANSFORM_
+#define _TRANSFORM_
+
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+#include "py/misc.h"
+
+#include "../ulab.h"
+#include "../ulab_tools.h"
+#include "transform.h" // NOTE(review): the header includes itself; the include guard makes this a no-op, but it is probably unintended
+// Function objects defined in transform.c: compress takes keyword arguments, dot exactly two positional ones.
+MP_DECLARE_CONST_FUN_OBJ_KW(transform_compress_obj);
+MP_DECLARE_CONST_FUN_OBJ_2(transform_dot_obj);
+
+#endif /* _TRANSFORM_ */
diff --git a/circuitpython/extmod/ulab/code/numpy/vector.c b/circuitpython/extmod/ulab/code/numpy/vector.c
new file mode 100644
index 0000000..97ab66d
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/vector.c
@@ -0,0 +1,844 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+ *               2020 Jeff Epler for Adafruit Industries
+ *               2020 Scott Shawcroft for Adafruit Industries
+ *               2020 Taku Fukada
+*/
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "py/runtime.h"
+#include "py/binary.h"
+#include "py/obj.h"
+#include "py/objarray.h"
+
+#include "../ulab.h"
+#include "../ulab_tools.h"
+#include "carray/carray_tools.h"
+#include "vector.h"
+
+//| """Element-by-element functions
+//|
+//| These functions can operate on numbers, 1-D iterables, and arrays of 1 to 4 dimensions by
+//| applying the function to every element in the array.  This is typically
+//| much more efficient than expressing the same operation as a Python loop."""
+//|
+
+static mp_obj_t vector_generic_vector(mp_obj_t o_in, mp_float_t (*f)(mp_float_t)) { // applies f to a scalar, or to every element of o_in
+    // Return a single value, if o_in is not iterable
+    if(mp_obj_is_float(o_in) || mp_obj_is_int(o_in)) {
+        return mp_obj_new_float(f(mp_obj_get_float(o_in)));
+    }
+    ndarray_obj_t *ndarray = NULL;
+    if(mp_obj_is_type(o_in, &ulab_ndarray_type)) {
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(o_in);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(source->dtype)
+        uint8_t *sarray = (uint8_t *)source->array; // byte pointer, because the source is traversed by its strides
+        ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT); // the result is always a float array of the source's shape
+        mp_float_t *array = (mp_float_t *)ndarray->array;
+        // Two compile-time variants follow: a single loop nest reading through a function pointer, or one macro expansion per dtype.
+        #if ULAB_VECTORISE_USES_FUN_POINTER
+
+            mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype); // presumably reads one item of the given dtype as a float
+
+            #if ULAB_MAX_DIMS > 3
+            size_t i = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 2
+                size_t j = 0;
+                do {
+                #endif
+                    #if ULAB_MAX_DIMS > 1
+                    size_t k = 0;
+                    do {
+                    #endif
+                        size_t l = 0;
+                        do {
+                            mp_float_t value = func(sarray);
+                            *array++ = f(value); // the result is written contiguously
+                            sarray += source->strides[ULAB_MAX_DIMS - 1];
+                            l++;
+                        } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+                    #if ULAB_MAX_DIMS > 1
+                        sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; // rewind the last axis
+                        sarray += source->strides[ULAB_MAX_DIMS - 2]; // and step along the next one
+                        k++;
+                    } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+                    #endif /* ULAB_MAX_DIMS > 1 */
+                #if ULAB_MAX_DIMS > 2
+                    sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                    sarray += source->strides[ULAB_MAX_DIMS - 3];
+                    j++;
+                } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+                #endif /* ULAB_MAX_DIMS > 2 */
+            #if ULAB_MAX_DIMS > 3
+                sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+                sarray += source->strides[ULAB_MAX_DIMS - 4];
+                i++;
+            } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+            #endif /* ULAB_MAX_DIMS > 3 */
+        #else
+        if(source->dtype == NDARRAY_UINT8) { // ITERATE_VECTOR is defined elsewhere; presumably it expands to the same loop nest with a typed read, and picks up f from this scope
+            ITERATE_VECTOR(uint8_t, array, source, sarray);
+        } else if(source->dtype == NDARRAY_INT8) {
+            ITERATE_VECTOR(int8_t, array, source, sarray);
+        } else if(source->dtype == NDARRAY_UINT16) {
+            ITERATE_VECTOR(uint16_t, array, source, sarray);
+        } else if(source->dtype == NDARRAY_INT16) {
+            ITERATE_VECTOR(int16_t, array, source, sarray);
+        } else {
+            ITERATE_VECTOR(mp_float_t, array, source, sarray);
+        }
+        #endif /* ULAB_VECTORISE_USES_FUN_POINTER */
+    } else {
+        ndarray = ndarray_from_mp_obj(o_in, 0); // converts any other input to an ndarray
+        mp_float_t *narray = (mp_float_t *)ndarray->array; // assumes the conversion yields float items -- TODO confirm
+        for(size_t i = 0; i < ndarray->len; i++) {
+            *narray = f(*narray); // f is applied in place on the converted array
+            narray++;
+        }
+    }
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+#if ULAB_NUMPY_HAS_ACOS
+//| def acos(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the inverse cosine function"""
+//|    ...
+//|
+// MATH_FUN_1 is defined outside of this file section; presumably it expands to the definition of vector_acos, a wrapper around the C acos function.
+MATH_FUN_1(acos, acos);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_acos_obj, vector_acos);
+#endif
+
+#if ULAB_NUMPY_HAS_ACOSH
+//| def acosh(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the inverse hyperbolic cosine function"""
+//|    ...
+//|
+// MATH_FUN_1 is defined outside of this file section; presumably it expands to the definition of vector_acosh, a wrapper around the C acosh function.
+MATH_FUN_1(acosh, acosh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_acosh_obj, vector_acosh);
+#endif
+
+#if ULAB_NUMPY_HAS_ASIN
+//| def asin(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the inverse sine function"""
+//|    ...
+//|
+// MATH_FUN_1 is defined outside of this file section; presumably it expands to the definition of vector_asin, a wrapper around the C asin function.
+MATH_FUN_1(asin, asin);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_asin_obj, vector_asin);
+#endif
+
+#if ULAB_NUMPY_HAS_ASINH
+//| def asinh(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the inverse hyperbolic sine function"""
+//|    ...
+//|
+// MATH_FUN_1 is defined outside of this file section; presumably it expands to the definition of vector_asinh, a wrapper around the C asinh function.
+MATH_FUN_1(asinh, asinh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_asinh_obj, vector_asinh);
+#endif
+
+#if ULAB_NUMPY_HAS_AROUND
+//| def around(a: _ArrayLike, *, decimals: int = 0) -> ulab.numpy.ndarray:
+//|    """Returns a new float array in which each element is rounded to
+//|       ``decimals`` places."""
+//|    ...
+//|
+// NOTE(review): the stub above advertises _ArrayLike, but the code below raises TypeError for anything that is not an ndarray.
+mp_obj_t vector_around(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} },
+        { MP_QSTR_decimals, MP_ARG_KW_ONLY | MP_ARG_INT, {.u_int = 0 } }
+    };
+    // one required positional argument, and the keyword-only integer decimals, defaulting to 0
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("first argument must be an ndarray"));
+    }
+    int8_t n = args[1].u_int; // NOTE(review): decimals is narrowed to 8 bits here, values outside the int8_t range are not preserved
+    mp_float_t mul = MICROPY_FLOAT_C_FUN(pow)(10.0, n); // scale factor 10^decimals
+    ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0].u_obj);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(source->dtype)
+    ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT); // the result is always a float array of the source's shape
+    mp_float_t *narray = (mp_float_t *)ndarray->array;
+    uint8_t *sarray = (uint8_t *)source->array; // byte pointer, because the source is traversed by its strides
+    // func presumably reads one item of the source's dtype as a float
+    mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype);
+    // loop nest over up to four axes; the outer levels exist only, if ULAB_MAX_DIMS is large enough
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                do {
+                    mp_float_t f = func(sarray);
+                    *narray++ = MICROPY_FLOAT_C_FUN(round)(f * mul) / mul; // scale up, round to the nearest integer, scale back
+                    sarray += source->strides[ULAB_MAX_DIMS - 1];
+                    l++;
+                } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+            #if ULAB_MAX_DIMS > 1
+                sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; // rewind the last axis
+                sarray += source->strides[ULAB_MAX_DIMS - 2]; // and step along the next one
+                k++;
+            } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 2
+            sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+            sarray += source->strides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+        sarray += source->strides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+    #endif
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(vector_around_obj, 1, vector_around);
+#endif
+
+#if ULAB_NUMPY_HAS_ATAN
+//| def atan(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the inverse tangent function; the return values are in the
+//|       range [-pi/2,pi/2]."""
+//|    ...
+//|
+// MATH_FUN_1 is defined outside of this file section; presumably it expands to the definition of vector_atan, a wrapper around the C atan function.
+MATH_FUN_1(atan, atan);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_atan_obj, vector_atan);
+#endif
+
+#if ULAB_NUMPY_HAS_ARCTAN2
+//| def arctan2(ya: _ArrayLike, xa: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the inverse tangent function of y/x; the return values are in
+//|       the range [-pi, pi]."""
+//|    ...
+//|
+
+// Element-wise two-argument inverse tangent, atan2(y, x), with broadcasting.
+//
+// Both arguments are converted with ndarray_from_mp_obj, then broadcast against
+// each other. The result is a new dense NDARRAY_FLOAT array with the broadcast
+// shape.
+//
+// Raises ValueError if the two operands cannot be broadcast together.
+// Complex inputs are passed to COMPLEX_DTYPE_NOT_IMPLEMENTED (presumably
+// rejected there; the macro is defined elsewhere).
+mp_obj_t vector_arctan2(mp_obj_t y, mp_obj_t x) {
+    ndarray_obj_t *ndarray_x = ndarray_from_mp_obj(x, 0);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray_x->dtype)
+
+    ndarray_obj_t *ndarray_y = ndarray_from_mp_obj(y, 0);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray_y->dtype)
+
+    uint8_t ndim = 0;
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    int32_t *xstrides = m_new(int32_t, ULAB_MAX_DIMS);
+    int32_t *ystrides = m_new(int32_t, ULAB_MAX_DIMS);
+    if(!ndarray_can_broadcast(ndarray_x, ndarray_y, &ndim, shape, xstrides, ystrides)) {
+        // mp_raise_ValueError does not return, so the scratch buffers have to
+        // be released before the exception is raised, not after
+        m_del(size_t, shape, ULAB_MAX_DIMS);
+        m_del(int32_t, xstrides, ULAB_MAX_DIMS);
+        m_del(int32_t, ystrides, ULAB_MAX_DIMS);
+        mp_raise_ValueError(translate("operands could not be broadcast together"));
+    }
+
+    uint8_t *xarray = (uint8_t *)ndarray_x->array;
+    uint8_t *yarray = (uint8_t *)ndarray_y->array;
+
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
+    mp_float_t *rarray = (mp_float_t *)results->array;
+
+    // per-dtype readers that return the element at the given address as mp_float_t
+    mp_float_t (*funcx)(void *) = ndarray_get_float_function(ndarray_x->dtype);
+    mp_float_t (*funcy)(void *) = ndarray_get_float_function(ndarray_y->dtype);
+
+    // One do/while level per compiled-in dimension. After each inner run the
+    // input pointers are rewound along the finished axis and advanced by one
+    // step along the next outer axis, using the broadcast strides.
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                do {
+                    mp_float_t _x = funcx(xarray);
+                    mp_float_t _y = funcy(yarray);
+                    *rarray++ = MICROPY_FLOAT_C_FUN(atan2)(_y, _x);
+                    xarray += xstrides[ULAB_MAX_DIMS - 1];
+                    yarray += ystrides[ULAB_MAX_DIMS - 1];
+                    l++;
+                } while(l < results->shape[ULAB_MAX_DIMS - 1]);
+            #if ULAB_MAX_DIMS > 1
+                xarray -= xstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                xarray += xstrides[ULAB_MAX_DIMS - 2];
+                yarray -= ystrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                yarray += ystrides[ULAB_MAX_DIMS - 2];
+                k++;
+            } while(k < results->shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 2
+            xarray -= xstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+            xarray += xstrides[ULAB_MAX_DIMS - 3];
+            yarray -= ystrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+            yarray += ystrides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j < results->shape[ULAB_MAX_DIMS - 3]);
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        xarray -= xstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+        xarray += xstrides[ULAB_MAX_DIMS - 4];
+        yarray -= ystrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+        yarray += ystrides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i < results->shape[ULAB_MAX_DIMS - 4]);
+    #endif
+
+    // the broadcast strides are only needed by the loops above
+    m_del(int32_t, xstrides, ULAB_MAX_DIMS);
+    m_del(int32_t, ystrides, ULAB_MAX_DIMS);
+    // NOTE(review): shape is left to the garbage collector; confirm that
+    // ndarray_new_dense_ndarray copies it before adding an m_del for it here
+
+    return MP_OBJ_FROM_PTR(results);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(vector_arctan2_obj, vector_arctan2);
+#endif /* ULAB_NUMPY_HAS_ARCTAN2 */
+
+#if ULAB_NUMPY_HAS_ATANH
+//| def atanh(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the inverse hyperbolic tangent function"""
+//|    ...
+//|
+
+MATH_FUN_1(atanh, atanh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_atanh_obj, vector_atanh);
+#endif
+
+#if ULAB_NUMPY_HAS_CEIL
+//| def ceil(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Rounds numbers up to the next whole number"""
+//|    ...
+//|
+
+MATH_FUN_1(ceil, ceil);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_ceil_obj, vector_ceil);
+#endif
+
+#if ULAB_NUMPY_HAS_COS
+//| def cos(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the cosine function"""
+//|    ...
+//|
+
+MATH_FUN_1(cos, cos);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_cos_obj, vector_cos);
+#endif
+
+#if ULAB_NUMPY_HAS_COSH
+//| def cosh(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the hyperbolic cosine function"""
+//|    ...
+//|
+
+MATH_FUN_1(cosh, cosh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_cosh_obj, vector_cosh);
+#endif
+
+#if ULAB_NUMPY_HAS_DEGREES
+//| def degrees(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Converts angles from radians to degrees"""
+//|    ...
+//|
+
+// Scalar kernel for degrees(): converts one angle from radians to degrees
+// by computing (value * 180) / pi.
+static mp_float_t vector_degrees_(mp_float_t value) {
+    const mp_float_t scaled = value * MICROPY_FLOAT_CONST(180.0);
+    return scaled / MP_PI;
+}
+
+// Python-facing degrees(): hands x_obj and the scalar kernel vector_degrees_
+// to vector_generic_vector and returns whatever that produces.
+static mp_obj_t vector_degrees(mp_obj_t x_obj) {
+    mp_obj_t converted = vector_generic_vector(x_obj, vector_degrees_);
+    return converted;
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(vector_degrees_obj, vector_degrees);
+#endif
+
+#if ULAB_SCIPY_SPECIAL_HAS_ERF
+//| def erf(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the error function, which has applications in statistics"""
+//|    ...
+//|
+
+MATH_FUN_1(erf, erf);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_erf_obj, vector_erf);
+#endif
+
+#if ULAB_SCIPY_SPECIAL_HAS_ERFC
+//| def erfc(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the complementary error function, which has applications in statistics"""
+//|    ...
+//|
+
+MATH_FUN_1(erfc, erfc);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_erfc_obj, vector_erfc);
+#endif
+
+#if ULAB_NUMPY_HAS_EXP
+//| def exp(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the exponential function."""
+//|    ...
+//|
+
+// Python-facing exp().
+//
+// When ULAB_SUPPORTS_COMPLEX is enabled, complex scalars and complex ndarrays
+// are handled here with exp(a + ib) = e^a * (cos(b) + i*sin(b)). A complex
+// scalar yields a complex scalar; a complex ndarray yields a new dense complex
+// ndarray with the shape of the source. Every other input is forwarded to
+// vector_generic_vector with the real-valued exp function.
+static mp_obj_t vector_exp(mp_obj_t o_in) {
+    #if ULAB_SUPPORTS_COMPLEX
+    if(mp_obj_is_type(o_in, &mp_type_complex)) {
+        mp_float_t real, imag;
+        mp_obj_get_complex(o_in, &real, &imag);
+        mp_float_t exp_real = MICROPY_FLOAT_C_FUN(exp)(real);
+        return mp_obj_new_complex(exp_real * MICROPY_FLOAT_C_FUN(cos)(imag), exp_real * MICROPY_FLOAT_C_FUN(sin)(imag));
+    } else if(mp_obj_is_type(o_in, &ulab_ndarray_type)) {
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(o_in);
+        if(source->dtype == NDARRAY_COMPLEX) {
+            uint8_t *sarray = (uint8_t *)source->array;
+            ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_COMPLEX);
+            mp_float_t *array = (mp_float_t *)ndarray->array;
+            // the imaginary part is read one mp_float_t after the real part
+            uint8_t itemsize = sizeof(mp_float_t);
+
+            // One do/while level per compiled-in dimension; sarray is walked
+            // with the source strides, array is written sequentially.
+            #if ULAB_MAX_DIMS > 3
+            size_t i = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 2
+                size_t j = 0;
+                do {
+                #endif
+                    #if ULAB_MAX_DIMS > 1
+                    size_t k = 0;
+                    do {
+                    #endif
+                        size_t l = 0;
+                        do {
+                            mp_float_t real = *(mp_float_t *)sarray;
+                            mp_float_t imag = *(mp_float_t *)(sarray + itemsize);
+                            mp_float_t exp_real = MICROPY_FLOAT_C_FUN(exp)(real);
+                            // output is stored as consecutive (real, imaginary) pairs
+                            *array++ = exp_real * MICROPY_FLOAT_C_FUN(cos)(imag);
+                            *array++ = exp_real * MICROPY_FLOAT_C_FUN(sin)(imag);
+                            sarray += source->strides[ULAB_MAX_DIMS - 1];
+                            l++;
+                        } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+                    #if ULAB_MAX_DIMS > 1
+                        // rewind the finished axis, then step along the next outer one
+                        sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                        sarray += source->strides[ULAB_MAX_DIMS - 2];
+                        k++;
+                    } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+                    #endif /* ULAB_MAX_DIMS > 1 */
+                #if ULAB_MAX_DIMS > 2
+                    sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                    sarray += source->strides[ULAB_MAX_DIMS - 3];
+                    j++;
+                } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+                #endif /* ULAB_MAX_DIMS > 2 */
+            #if ULAB_MAX_DIMS > 3
+                sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+                sarray += source->strides[ULAB_MAX_DIMS - 4];
+                i++;
+            } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+            #endif /* ULAB_MAX_DIMS > 3 */
+            return MP_OBJ_FROM_PTR(ndarray);
+        }
+    }
+    #endif
+    // real-valued input (or complex support compiled out)
+    return vector_generic_vector(o_in, MICROPY_FLOAT_C_FUN(exp));
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(vector_exp_obj, vector_exp);
+#endif
+
+#if ULAB_NUMPY_HAS_EXPM1
+//| def expm1(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes $e^x-1$.  In certain applications, using this function preserves numeric accuracy better than the `exp` function."""
+//|    ...
+//|
+
+MATH_FUN_1(expm1, expm1);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_expm1_obj, vector_expm1);
+#endif
+
+#if ULAB_NUMPY_HAS_FLOOR
+//| def floor(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Rounds numbers down to the next whole number"""
+//|    ...
+//|
+
+MATH_FUN_1(floor, floor);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_floor_obj, vector_floor);
+#endif
+
+#if ULAB_SCIPY_SPECIAL_HAS_GAMMA
+//| def gamma(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the gamma function"""
+//|    ...
+//|
+
+MATH_FUN_1(gamma, tgamma);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_gamma_obj, vector_gamma);
+#endif
+
+#if ULAB_SCIPY_SPECIAL_HAS_GAMMALN
+//| def lgamma(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the natural log of the gamma function"""
+//|    ...
+//|
+
+MATH_FUN_1(lgamma, lgamma);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_lgamma_obj, vector_lgamma);
+#endif
+
+#if ULAB_NUMPY_HAS_LOG
+//| def log(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the natural log"""
+//|    ...
+//|
+
+MATH_FUN_1(log, log);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_log_obj, vector_log);
+#endif
+
+#if ULAB_NUMPY_HAS_LOG10
+//| def log10(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the log base 10"""
+//|    ...
+//|
+
+MATH_FUN_1(log10, log10);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_log10_obj, vector_log10);
+#endif
+
+#if ULAB_NUMPY_HAS_LOG2
+//| def log2(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the log base 2"""
+//|    ...
+//|
+
+MATH_FUN_1(log2, log2);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_log2_obj, vector_log2);
+#endif
+
+#if ULAB_NUMPY_HAS_RADIANS
+//| def radians(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Converts angles from degrees to radians"""
+//|    ...
+//|
+
+// Scalar kernel for radians(): converts one angle from degrees to radians
+// by computing (value * pi) / 180.
+static mp_float_t vector_radians_(mp_float_t value) {
+    const mp_float_t scaled = value * MP_PI;
+    return scaled / MICROPY_FLOAT_CONST(180.0);
+}
+
+// Python-facing radians(): hands x_obj and the scalar kernel vector_radians_
+// to vector_generic_vector and returns whatever that produces.
+static mp_obj_t vector_radians(mp_obj_t x_obj) {
+    mp_obj_t converted = vector_generic_vector(x_obj, vector_radians_);
+    return converted;
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(vector_radians_obj, vector_radians);
+#endif
+
+#if ULAB_NUMPY_HAS_SIN
+//| def sin(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the sine function"""
+//|    ...
+//|
+
+MATH_FUN_1(sin, sin);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_sin_obj, vector_sin);
+#endif
+
+#if ULAB_NUMPY_HAS_SINH
+//| def sinh(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the hyperbolic sine"""
+//|    ...
+//|
+
+MATH_FUN_1(sinh, sinh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_sinh_obj, vector_sinh);
+#endif
+
+
+#if ULAB_NUMPY_HAS_SQRT
+//| def sqrt(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the square root"""
+//|    ...
+//|
+
+#if ULAB_SUPPORTS_COMPLEX
+// Python-facing sqrt() for builds with ULAB_SUPPORTS_COMPLEX.
+//
+// Arguments: one required positional input and a keyword-only `dtype`
+// (default NDARRAY_FLOAT) that selects the type of the result.
+//
+//  - dtype other than float or complex: TypeError
+//  - complex scalar input: complex scalar result, computed in polar form
+//  - complex ndarray with dtype=float: TypeError
+//  - ndarray with dtype=complex: a new dense complex ndarray; the source must
+//    be complex or float, any other source dtype raises TypeError
+//  - everything else: forwarded to vector_generic_vector with the real sqrt
+//
+// NOTE(review): a non-ndarray, non-complex input combined with dtype=complex
+// also reaches the final vector_generic_vector call, so the requested dtype is
+// not applied on that path; confirm this is intended.
+mp_obj_t vector_sqrt(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(NDARRAY_FLOAT) } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    mp_obj_t o_in = args[0].u_obj;
+    uint8_t dtype = mp_obj_get_int(args[1].u_obj);
+    if((dtype != NDARRAY_FLOAT) && (dtype != NDARRAY_COMPLEX)) {
+        mp_raise_TypeError(translate("dtype must be float, or complex"));
+    }
+
+    if(mp_obj_is_type(o_in, &mp_type_complex)) {
+        // polar form: root of the modulus, half of the argument
+        mp_float_t real, imag;
+        mp_obj_get_complex(o_in, &real, &imag);
+        mp_float_t sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(real * real + imag * imag);
+        sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(sqrt_abs);
+        mp_float_t theta = MICROPY_FLOAT_CONST(0.5) * MICROPY_FLOAT_C_FUN(atan2)(imag, real);
+        return mp_obj_new_complex(sqrt_abs * MICROPY_FLOAT_C_FUN(cos)(theta), sqrt_abs * MICROPY_FLOAT_C_FUN(sin)(theta));
+    } else if(mp_obj_is_type(o_in, &ulab_ndarray_type)) {
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(o_in);
+        if((source->dtype == NDARRAY_COMPLEX) && (dtype == NDARRAY_FLOAT)) {
+            mp_raise_TypeError(translate("can't convert complex to float"));
+        }
+
+        if(dtype == NDARRAY_COMPLEX) {
+            if(source->dtype == NDARRAY_COMPLEX) {
+                // complex -> complex: same polar-form computation, element by element
+                uint8_t *sarray = (uint8_t *)source->array;
+                ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_COMPLEX);
+                mp_float_t *array = (mp_float_t *)ndarray->array;
+                // the imaginary part is read one mp_float_t after the real part
+                uint8_t itemsize = sizeof(mp_float_t);
+
+                #if ULAB_MAX_DIMS > 3
+                size_t i = 0;
+                do {
+                #endif
+                    #if ULAB_MAX_DIMS > 2
+                    size_t j = 0;
+                    do {
+                    #endif
+                        #if ULAB_MAX_DIMS > 1
+                        size_t k = 0;
+                        do {
+                        #endif
+                            size_t l = 0;
+                            do {
+                                mp_float_t real = *(mp_float_t *)sarray;
+                                mp_float_t imag = *(mp_float_t *)(sarray + itemsize);
+                                mp_float_t sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(real * real + imag * imag);
+                                sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(sqrt_abs);
+                                mp_float_t theta = MICROPY_FLOAT_CONST(0.5) * MICROPY_FLOAT_C_FUN(atan2)(imag, real);
+                                *array++ = sqrt_abs * MICROPY_FLOAT_C_FUN(cos)(theta);
+                                *array++ = sqrt_abs * MICROPY_FLOAT_C_FUN(sin)(theta);
+                                sarray += source->strides[ULAB_MAX_DIMS - 1];
+                                l++;
+                            } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+                        #if ULAB_MAX_DIMS > 1
+                            // rewind the finished axis, then step along the next outer one
+                            sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                            sarray += source->strides[ULAB_MAX_DIMS - 2];
+                            k++;
+                        } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+                        #endif /* ULAB_MAX_DIMS > 1 */
+                    #if ULAB_MAX_DIMS > 2
+                        sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                        sarray += source->strides[ULAB_MAX_DIMS - 3];
+                        j++;
+                    } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+                    #endif /* ULAB_MAX_DIMS > 2 */
+                #if ULAB_MAX_DIMS > 3
+                    sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+                    sarray += source->strides[ULAB_MAX_DIMS - 4];
+                    i++;
+                } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+                #endif /* ULAB_MAX_DIMS > 3 */
+                return MP_OBJ_FROM_PTR(ndarray);
+            } else if(source->dtype == NDARRAY_FLOAT) {
+                // float -> complex: non-negative values give a real root,
+                // negative values give a purely imaginary one
+                uint8_t *sarray = (uint8_t *)source->array;
+                ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_COMPLEX);
+                mp_float_t *array = (mp_float_t *)ndarray->array;
+
+                #if ULAB_MAX_DIMS > 3
+                size_t i = 0;
+                do {
+                #endif
+                    #if ULAB_MAX_DIMS > 2
+                    size_t j = 0;
+                    do {
+                    #endif
+                        #if ULAB_MAX_DIMS > 1
+                        size_t k = 0;
+                        do {
+                        #endif
+                            size_t l = 0;
+                            do {
+                                mp_float_t value = *(mp_float_t *)sarray;
+                                // Only one component is written per element; the other is
+                                // skipped. NOTE(review): this relies on the new array being
+                                // zero-filled by ndarray_new_dense_ndarray; confirm.
+                                if(value >= MICROPY_FLOAT_CONST(0.0)) {
+                                    *array++ = MICROPY_FLOAT_C_FUN(sqrt)(value);
+                                    array++;
+                                } else {
+                                    array++;
+                                    *array++ = MICROPY_FLOAT_C_FUN(sqrt)(-value);
+                                }
+                                sarray += source->strides[ULAB_MAX_DIMS - 1];
+                                l++;
+                            } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+                        #if ULAB_MAX_DIMS > 1
+                            sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                            sarray += source->strides[ULAB_MAX_DIMS - 2];
+                            k++;
+                        } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+                        #endif /* ULAB_MAX_DIMS > 1 */
+                    #if ULAB_MAX_DIMS > 2
+                        sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                        sarray += source->strides[ULAB_MAX_DIMS - 3];
+                        j++;
+                    } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+                    #endif /* ULAB_MAX_DIMS > 2 */
+                #if ULAB_MAX_DIMS > 3
+                    sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+                    sarray += source->strides[ULAB_MAX_DIMS - 4];
+                    i++;
+                } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+                #endif /* ULAB_MAX_DIMS > 3 */
+                return MP_OBJ_FROM_PTR(ndarray);
+            } else {
+                mp_raise_TypeError(translate("input dtype must be float or complex"));
+            }
+        }
+    }
+    // real-valued path: dtype is float, or the input is neither a complex scalar nor an ndarray
+    return vector_generic_vector(o_in, MICROPY_FLOAT_C_FUN(sqrt));
+}
+MP_DEFINE_CONST_FUN_OBJ_KW(vector_sqrt_obj, 1, vector_sqrt);
+#else
+MATH_FUN_1(sqrt, sqrt);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_sqrt_obj, vector_sqrt);
+#endif /* ULAB_SUPPORTS_COMPLEX */
+
+#endif /* ULAB_NUMPY_HAS_SQRT */
+
+#if ULAB_NUMPY_HAS_TAN
+//| def tan(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the tangent"""
+//|    ...
+//|
+
+MATH_FUN_1(tan, tan);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_tan_obj, vector_tan);
+#endif
+
+#if ULAB_NUMPY_HAS_TANH
+//| def tanh(a: _ArrayLike) -> ulab.numpy.ndarray:
+//|    """Computes the hyperbolic tangent"""
+//|    ...
+//|
+
+MATH_FUN_1(tanh, tanh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_tanh_obj, vector_tanh);
+#endif
+
+#if ULAB_NUMPY_HAS_VECTORIZE
+// Call slot of a vectorized function object: applies the wrapped callable
+// (self->fun) to the items of args[0] one at a time and stores the results in
+// a new ndarray of dtype self->otypes.
+//
+//  - ndarray input: the result has the ndim and shape of the input; the dtype
+//    is first passed to COMPLEX_DTYPE_NOT_IMPLEMENTED (presumably rejecting
+//    complex arrays; the macro is defined elsewhere)
+//  - tuple, list, or range: the result is a linear array with one entry per item
+//  - int or float scalar: the result is a one-element linear array
+//
+// Raises TypeError when called without a positional argument, and ValueError
+// for any other kind of input. Keyword arguments and extra positional
+// arguments are ignored.
+static mp_obj_t vector_vectorized_function_call(mp_obj_t self_in, size_t n_args, size_t n_kw, const mp_obj_t *args) {
+    (void) n_kw;
+    // args[0] is read unconditionally below, so a call without arguments
+    // must be rejected before touching the argument array
+    if(n_args < 1) {
+        mp_raise_TypeError(translate("wrong number of arguments"));
+    }
+    vectorized_function_obj_t *self = MP_OBJ_TO_PTR(self_in);
+    mp_obj_t avalue[1];
+    mp_obj_t fvalue;
+    if(mp_obj_is_type(args[0], &ulab_ndarray_type)) {
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0]);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(source->dtype)
+        ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, self->otypes);
+        // NOTE(review): elements are fetched by linear index from source->array;
+        // confirm this is correct for non-contiguous views
+        for(size_t i=0; i < source->len; i++) {
+            avalue[0] = mp_binary_get_val_array(source->dtype, source->array, i);
+            fvalue = self->type->MP_TYPE_CALL(self->fun, 1, 0, avalue);
+            ndarray_set_value(self->otypes, ndarray->array, i, fvalue);
+        }
+        return MP_OBJ_FROM_PTR(ndarray);
+    } else if(mp_obj_is_type(args[0], &mp_type_tuple) || mp_obj_is_type(args[0], &mp_type_list) ||
+        mp_obj_is_type(args[0], &mp_type_range)) { // i.e., the input is a generic iterable
+        size_t len = (size_t)mp_obj_get_int(mp_obj_len_maybe(args[0]));
+        ndarray_obj_t *ndarray = ndarray_new_linear_array(len, self->otypes);
+        mp_obj_iter_buf_t iter_buf;
+        mp_obj_t iterable = mp_getiter(args[0], &iter_buf);
+        size_t i=0;
+        while ((avalue[0] = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
+            fvalue = self->type->MP_TYPE_CALL(self->fun, 1, 0, avalue);
+            ndarray_set_value(self->otypes, ndarray->array, i, fvalue);
+            i++;
+        }
+        return MP_OBJ_FROM_PTR(ndarray);
+    } else if(mp_obj_is_int(args[0]) || mp_obj_is_float(args[0])) {
+        ndarray_obj_t *ndarray = ndarray_new_linear_array(1, self->otypes);
+        // args already points at the single scalar, so it is passed through as is
+        fvalue = self->type->MP_TYPE_CALL(self->fun, 1, 0, args);
+        ndarray_set_value(self->otypes, ndarray->array, 0, fvalue);
+        return MP_OBJ_FROM_PTR(ndarray);
+    } else {
+        mp_raise_ValueError(translate("wrong input type"));
+    }
+    // not reached; keeps compilers that do not see the raise as noreturn quiet
+    return mp_const_none;
+}
+
+// Type object assigned to the objects created by vector_vectorize. It has an
+// empty name and populates only the call slot, which dispatches to
+// vector_vectorized_function_call.
+const mp_obj_type_t vector_function_type = {
+    { &mp_type_type },
+    .flags = MP_TYPE_FLAG_EXTENDED,
+    .name = MP_QSTR_,
+    MP_TYPE_EXTENDED_FIELDS(
+    .call = vector_vectorized_function_call,
+    )
+};
+
+//| def vectorize(
+//|     f: Union[Callable[[int], _float], Callable[[_float], _float]],
+//|     *,
+//|     otypes: Optional[_DType] = None
+//| ) -> Callable[[_ArrayLike], ulab.numpy.ndarray]:
+//|    """
+//|    :param callable f: The function to wrap
+//|    :param otypes: The dtype of the values returned by the function.  None is interpreted to mean the return value is float.
+//|
+//|    Wrap a Python function ``f`` so that it can be applied to arrays.
+//|    The callable must return only values of the types specified by ``otypes``, or the result is undefined."""
+//|    ...
+//|
+
+// Python-facing vectorize(): wraps a callable in a vectorized function object.
+//
+// Arguments: one required positional callable and a keyword-only `otypes`
+// (default None). None selects NDARRAY_FLOAT; otherwise `otypes` must be an
+// integer equal to one of NDARRAY_FLOAT, NDARRAY_UINT8, NDARRAY_INT8,
+// NDARRAY_UINT16 or NDARRAY_INT16.
+//
+// Raises TypeError if the first argument has no call slot, and ValueError
+// for an unsupported `otypes`.
+static mp_obj_t vector_vectorize(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} },
+        { MP_QSTR_otypes, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} }
+    };
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    const mp_obj_type_t *type = mp_obj_get_type(args[0].u_obj);
+    if(mp_type_get_call_slot(type) == NULL) {
+        mp_raise_TypeError(translate("first argument must be a callable"));
+    }
+    mp_obj_t _otypes = args[1].u_obj;
+    // TODO: is defaulting to float for otypes=None what numpy does?
+    uint8_t otypes = NDARRAY_FLOAT;
+    if(_otypes != mp_const_none) {
+        if(!mp_obj_is_int(_otypes)) {
+            mp_raise_ValueError(translate("wrong output type"));
+        }
+        // Validate at full integer width: narrowing to uint8_t first would let
+        // out-of-range values (e.g. a valid dtype code + 256) alias a valid dtype.
+        mp_int_t requested = mp_obj_get_int(_otypes);
+        if(requested != NDARRAY_FLOAT && requested != NDARRAY_UINT8 && requested != NDARRAY_INT8 &&
+            requested != NDARRAY_UINT16 && requested != NDARRAY_INT16) {
+                mp_raise_ValueError(translate("wrong output type"));
+        }
+        otypes = (uint8_t)requested;
+    }
+    vectorized_function_obj_t *function = m_new_obj(vectorized_function_obj_t);
+    function->base.type = &vector_function_type;
+    function->otypes = otypes;
+    function->fun = args[0].u_obj;
+    // remember the callable's type so the call slot can be reached at call time
+    function->type = type;
+    return MP_OBJ_FROM_PTR(function);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(vector_vectorize_obj, 1, vector_vectorize);
+#endif
diff --git a/circuitpython/extmod/ulab/code/numpy/vector.h b/circuitpython/extmod/ulab/code/numpy/vector.h
new file mode 100644
index 0000000..ea38b0f
--- /dev/null
+++ b/circuitpython/extmod/ulab/code/numpy/vector.h
@@ -0,0 +1,161 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2019-2021 Zoltán Vörös
+*/
+
+#ifndef _VECTOR_
+#define _VECTOR_
+
+#include "../ulab.h"
+#include "../ndarray.h"
+
+// Declarations of the function objects that expose the vector_* routines.
+// vector_sqrt_obj is declared as a keyword-capable function only when
+// ULAB_SUPPORTS_COMPLEX is set; otherwise it is a one-argument function.
+MP_DECLARE_CONST_FUN_OBJ_1(vector_acos_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_acosh_obj);
+MP_DECLARE_CONST_FUN_OBJ_2(vector_arctan2_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(vector_around_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_asin_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_asinh_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_atan_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_atanh_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_ceil_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_cos_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_cosh_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_degrees_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_erf_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_erfc_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_exp_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_expm1_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_floor_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_gamma_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_lgamma_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_log_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_log10_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_log2_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_radians_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_sin_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_sinh_obj);
+#if ULAB_SUPPORTS_COMPLEX
+MP_DECLARE_CONST_FUN_OBJ_KW(vector_sqrt_obj);
+#else
+MP_DECLARE_CONST_FUN_OBJ_1(vector_sqrt_obj);
+#endif
+MP_DECLARE_CONST_FUN_OBJ_1(vector_tan_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_tanh_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(vector_vectorize_obj);
+
+// Instance layout of the objects created by vectorize().
+typedef struct _vectorized_function_obj_t {
+    mp_obj_base_t base;
+    // dtype code of the array produced when the object is called
+    uint8_t otypes;
+    // the wrapped Python callable
+    mp_obj_t fun;
+    // type of `fun`, kept so that its call slot can be invoked directly
+    const mp_obj_type_t *type;
+} vectorized_function_obj_t;
+
+#if ULAB_HAS_FUNCTION_ITERATOR
+// Function-iterator variant of ITERATE_VECTOR (a GNU statement expression).
+// Expects a callable named `f` to be in scope at the expansion site.
+//
+// For each element of `source`, reads a `type` value at `sarray`, stores
+// f(value) to *array, then advances `array` by `shift` and `sarray` by the
+// stride of the last axis. After every run along the last axis,
+// ndarray_rewind_array repositions `sarray` using the coordinate buffer.
+// Both `array` and `sarray` are modified by the expansion.
+//
+// Unlike the fixed-dimension variants below, this one takes a fifth `shift`
+// parameter. The coordinate buffer from ndarray_new_coords is not released here.
+// NOTE(review): the outer bound divides by shape[ULAB_MAX_DIMS - 1]; confirm
+// callers never pass an array whose last axis has length 0.
+#define ITERATE_VECTOR(type, array, source, sarray, shift)\
+({\
+    size_t *scoords = ndarray_new_coords((source)->ndim);\
+    for(size_t i=0; i < (source)->len/(source)->shape[ULAB_MAX_DIMS -1]; i++) {\
+        for(size_t l=0; l < (source)->shape[ULAB_MAX_DIMS - 1]; l++) {\
+            *(array) = f(*((type *)(sarray)));\
+            (array) += (shift);\
+            (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\
+        }\
+        ndarray_rewind_array((source)->ndim, sarray, (source)->shape, (source)->strides, scoords);\
+    }\
+})
+
+#else
+
+#if ULAB_MAX_DIMS == 4
+// Four-dimensional ITERATE_VECTOR. Expects a callable named `f` to be in scope
+// at the expansion site.
+//
+// Walks `source` with four nested do/while loops (outermost axis first). For
+// each element it reads a `type` value at `sarray`, writes f(value) through
+// `array`, and post-increments `array`. After each inner loop `sarray` is
+// rewound along the finished axis and advanced one stride along the next
+// outer axis. Both `array` and `sarray` are modified by the expansion.
+//
+// Because the loops are do/while, each body runs at least once even if the
+// corresponding shape entry is 0.
+#define ITERATE_VECTOR(type, array, source, sarray) do {\
+    size_t i=0;\
+    do {\
+        size_t j = 0;\
+        do {\
+            size_t k = 0;\
+            do {\
+                size_t l = 0;\
+                do {\
+                    *(array)++ = f(*((type *)(sarray)));\
+                    (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\
+                    l++;\
+                } while(l < (source)->shape[ULAB_MAX_DIMS-1]);\
+                (sarray) -= (source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS-1];\
+                (sarray) += (source)->strides[ULAB_MAX_DIMS - 2];\
+                k++;\
+            } while(k < (source)->shape[ULAB_MAX_DIMS-2]);\
+            (sarray) -= (source)->strides[ULAB_MAX_DIMS - 2] * (source)->shape[ULAB_MAX_DIMS-2];\
+            (sarray) += (source)->strides[ULAB_MAX_DIMS - 3];\
+            j++;\
+        } while(j < (source)->shape[ULAB_MAX_DIMS-3]);\
+        (sarray) -= (source)->strides[ULAB_MAX_DIMS - 3] * (source)->shape[ULAB_MAX_DIMS-3];\
+        (sarray) += (source)->strides[ULAB_MAX_DIMS - 4];\
+        i++;\
+    } while(i < (source)->shape[ULAB_MAX_DIMS-4]);\
+} while(0)
+#endif /* ULAB_MAX_DIMS == 4 */
+
+#if ULAB_MAX_DIMS == 3
+// Three-dimensional ITERATE_VECTOR. Expects a callable named `f` to be in
+// scope at the expansion site.
+//
+// Walks `source` with three nested do/while loops (outermost axis first),
+// writing f(value) for every `type` element read at `sarray` through the
+// post-incremented `array`. After each inner loop `sarray` is rewound along
+// the finished axis and advanced one stride along the next outer axis.
+// Both `array` and `sarray` are modified by the expansion; each loop body runs
+// at least once.
+#define ITERATE_VECTOR(type, array, source, sarray) do {\
+    size_t j = 0;\
+    do {\
+        size_t k = 0;\
+        do {\
+            size_t l = 0;\
+            do {\
+                *(array)++ = f(*((type *)(sarray)));\
+                (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\
+                l++;\
+            } while(l < (source)->shape[ULAB_MAX_DIMS-1]);\
+            (sarray) -= (source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS-1];\
+            (sarray) += (source)->strides[ULAB_MAX_DIMS - 2];\
+            k++;\
+        } while(k < (source)->shape[ULAB_MAX_DIMS-2]);\
+        (sarray) -= (source)->strides[ULAB_MAX_DIMS - 2] * (source)->shape[ULAB_MAX_DIMS-2];\
+        (sarray) += (source)->strides[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (source)->shape[ULAB_MAX_DIMS-3]);\
+} while(0)
+#endif /* ULAB_MAX_DIMS == 3 */
+
+#if ULAB_MAX_DIMS == 2
+// Two-dimensional ITERATE_VECTOR. Expects a callable named `f` to be in scope
+// at the expansion site.
+//
+// Walks `source` with two nested do/while loops (outer axis first), writing
+// f(value) for every `type` element read at `sarray` through the
+// post-incremented `array`. After each row `sarray` is rewound along the last
+// axis and advanced one stride along the outer axis. Both `array` and `sarray`
+// are modified by the expansion; each loop body runs at least once.
+#define ITERATE_VECTOR(type, array, source, sarray) do {\
+    size_t k = 0;\
+    do {\
+        size_t l = 0;\
+        do {\
+            *(array)++ = f(*((type *)(sarray)));\
+            (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\
+            l++;\
+        } while(l < (source)->shape[ULAB_MAX_DIMS-1]);\
+        (sarray) -= (source)->strides[ULAB_MAX_DIMS - 1] * (source)->shape[ULAB_MAX_DIMS-1];\
+        (sarray) += (source)->strides[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (source)->shape[ULAB_MAX_DIMS-2]);\
+} while(0)
+#endif /* ULAB_MAX_DIMS == 2 */
+
+#if ULAB_MAX_DIMS == 1
+// One-dimensional ITERATE_VECTOR. Expects a callable named `f` to be in scope
+// at the expansion site.
+//
+// Reads `type` elements at `sarray`, stepping by the stride of the only axis,
+// and writes f(value) through the post-incremented `array`. Both `array` and
+// `sarray` are modified by the expansion; the body runs at least once.
+#define ITERATE_VECTOR(type, array, source, sarray) do {\
+    size_t l = 0;\
+    do {\
+        *(array)++ = f(*((type *)(sarray)));\
+        (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (source)->shape[ULAB_MAX_DIMS-1]);\
+} while(0)
+#endif /* ULAB_MAX_DIMS == 1 */
+#endif /* ULAB_HAS_FUNCTION_ITERATOR */
+
+// Defines a static one-argument function vector_<py_name> that forwards its
+// argument to vector_generic_vector together with the C math function
+// selected by MICROPY_FLOAT_C_FUN(c_name).
+#define MATH_FUN_1(py_name, c_name) \
+    static mp_obj_t vector_ ## py_name(mp_obj_t x_obj) { \
+        return vector_generic_vector(x_obj, MICROPY_FLOAT_C_FUN(c_name)); \
+}
+
+#endif /* _VECTOR_ */