diff --git a/apps/code/script_template.h b/apps/code/script_template.h
index 6ca117f48..c3f0f5b99 100644
--- a/apps/code/script_template.h
+++ b/apps/code/script_template.h
@@ -10,7 +10,7 @@ public:
   constexpr ScriptTemplate(const char * name, const char * value) : m_name(name), m_value(value) {}
   static const ScriptTemplate * Empty();
   const char * name() const { return m_name; }
-  const char * content() const { return m_value; + Script::StatusSize();}
+  const char * content() const { return m_value;}
   const char * value() const { return m_value; }
 private:
   const char * m_name;
diff --git a/apps/code/variable_box_controller.cpp b/apps/code/variable_box_controller.cpp
index fcaf42bc5..ac39f9986 100644
--- a/apps/code/variable_box_controller.cpp
+++ b/apps/code/variable_box_controller.cpp
@@ -823,7 +823,7 @@ bool VariableBoxController::importationSourceIsModule(const char * sourceName, c
     return true;
   }
   // The sourceName might be a module that is not in the toolbox
-  return mp_module_get(qstr_from_str(sourceName)) != MP_OBJ_NULL;
+  return mp_module_get_loaded_or_builtin(qstr_from_str(sourceName)) != MP_OBJ_NULL;
 }
 
 bool VariableBoxController::importationSourceIsScript(const char * sourceName, const char * * scriptFullName, Script * retrievedScript) {
diff --git a/python/Makefile b/python/Makefile
index 95b95bbab..c9942fb02 100644
--- a/python/Makefile
+++ b/python/Makefile
@@ -248,4 +248,5 @@ tests_src += $(addprefix python/test/,\
   time.cpp \
   turtle.cpp \
   matplotlib.cpp \
+  ulab.cpp \
 )
diff --git a/python/port/genhdr/moduledefs.h b/python/port/genhdr/moduledefs.h
index c86562df8..a130c2a67 100644
--- a/python/port/genhdr/moduledefs.h
+++ b/python/port/genhdr/moduledefs.h
@@ -1,13 +1,112 @@
-// Automatically generated by makemoduledefs.py.
+/* In the standard MicroPython build system, this file is autogenerated from the
+ * rest of the sources. We maintain it manually here because some modules are not
+ * included by the build system, so the MicroPython part must be updated by hand.
+ *
+ * How to update this file with a new MicroPython release
+ * - Get a clean copy of MicroPython
+ * - Copy our mpconfigport.h over the "bare-arm" port of MicroPython
+ * - "make" the bare-arm port of MicroPython (don't worry if it doesn't finish)
+ * - "cat build/genhdr/moduledefs.h".
+ * - Insert the result below in the MicroPython section,
+ *   until the definition of MICROPY_REGISTERED_MODULES
+ * - copy the MICROPY_REGISTERED_MODULES section at the end of this file,
+ *   /!\ this section is present twice in the file, so you need to copy it twice
+ *   Keep the Upsilon part when copying the MICROPY_REGISTERED_MODULES section
+*/
 
-#if (MICROPY_PY_ARRAY)
-    extern const struct _mp_obj_module_t mp_module_uarray;
-    #define MODULE_DEF_MP_QSTR_UARRAY { MP_ROM_QSTR(MP_QSTR_uarray), MP_ROM_PTR(&mp_module_uarray) },
-#else
-    #define MODULE_DEF_MP_QSTR_UARRAY
-#endif
+// MicroPython part
 
+extern const struct _mp_obj_module_t mp_module___main__;
+#undef MODULE_DEF_MP_QSTR___MAIN__
+#define MODULE_DEF_MP_QSTR___MAIN__ { MP_ROM_QSTR(MP_QSTR___main__), MP_ROM_PTR(&mp_module___main__) },
+
+extern const struct _mp_obj_module_t mp_module_builtins;
+#undef MODULE_DEF_MP_QSTR_BUILTINS
+#define MODULE_DEF_MP_QSTR_BUILTINS { MP_ROM_QSTR(MP_QSTR_builtins), MP_ROM_PTR(&mp_module_builtins) },
+
+extern const struct _mp_obj_module_t mp_module_cmath;
+#undef MODULE_DEF_MP_QSTR_CMATH
+#define MODULE_DEF_MP_QSTR_CMATH { MP_ROM_QSTR(MP_QSTR_cmath), MP_ROM_PTR(&mp_module_cmath) },
+
+extern const struct _mp_obj_module_t mp_module_math;
+#undef MODULE_DEF_MP_QSTR_MATH
+#define MODULE_DEF_MP_QSTR_MATH { MP_ROM_QSTR(MP_QSTR_math), MP_ROM_PTR(&mp_module_math) },
+
+extern const struct _mp_obj_module_t mp_module_micropython;
+#undef MODULE_DEF_MP_QSTR_MICROPYTHON
+#define MODULE_DEF_MP_QSTR_MICROPYTHON { MP_ROM_QSTR(MP_QSTR_micropython), MP_ROM_PTR(&mp_module_micropython) },
+
+extern const struct _mp_obj_module_t mp_module_urandom;
+#undef MODULE_DEF_MP_QSTR_URANDOM
+#define MODULE_DEF_MP_QSTR_URANDOM { MP_ROM_QSTR(MP_QSTR_urandom), MP_ROM_PTR(&mp_module_urandom) },
+
+// Upsilon's modules part
+
+extern const struct _mp_obj_module_t modion_module;
+#undef MODULE_DEF_MP_QSTR_ION
+#define MODULE_DEF_MP_QSTR_ION { MP_ROM_QSTR(MP_QSTR_ion), MP_ROM_PTR(&modion_module) },
+
+extern const struct _mp_obj_module_t modkandinsky_module;
+#undef MODULE_DEF_MP_QSTR_KANDINSKY
+#define MODULE_DEF_MP_QSTR_KANDINSKY { MP_ROM_QSTR(MP_QSTR_kandinsky), MP_ROM_PTR(&modkandinsky_module) },
+
+extern const struct _mp_obj_module_t modmatplotlib_module;
+#undef MODULE_DEF_MP_QSTR_MATPLOTLIB
+#define MODULE_DEF_MP_QSTR_MATPLOTLIB { MP_ROM_QSTR(MP_QSTR_matplotlib), MP_ROM_PTR(&modmatplotlib_module) },
+
+extern const struct _mp_obj_module_t modpyplot_module;
+#undef MODULE_DEF_MP_QSTR_PYPLOT
+#define MODULE_DEF_MP_QSTR_PYPLOT { MP_ROM_QSTR(MP_QSTR_matplotlib_dot_pyplot), MP_ROM_PTR(&modpyplot_module) },
+
+extern const struct _mp_obj_module_t modtime_module;
+#undef MODULE_DEF_MP_QSTR_TIME
+#define MODULE_DEF_MP_QSTR_TIME { MP_ROM_QSTR(MP_QSTR_time), MP_ROM_PTR(&modtime_module) },
+
+extern const struct _mp_obj_module_t modos_module;
+#undef MODULE_DEF_MP_QSTR_OS
+#define MODULE_DEF_MP_QSTR_OS { MP_ROM_QSTR(MP_QSTR_os), MP_ROM_PTR(&modos_module) },
+
+extern const struct _mp_obj_module_t modturtle_module;
+#undef MODULE_DEF_MP_QSTR_TURTLE
+#define MODULE_DEF_MP_QSTR_TURTLE { MP_ROM_QSTR(MP_QSTR_turtle), MP_ROM_PTR(&modturtle_module) },
+
+#if !defined(INCLUDE_ULAB)
 
 #define MICROPY_REGISTERED_MODULES \
-    MODULE_DEF_MP_QSTR_UARRAY \
+    MODULE_DEF_MP_QSTR_BUILTINS \
+    MODULE_DEF_MP_QSTR_CMATH \
+    MODULE_DEF_MP_QSTR_MATH \
+    MODULE_DEF_MP_QSTR_MICROPYTHON \
+    MODULE_DEF_MP_QSTR_URANDOM \
+    MODULE_DEF_MP_QSTR___MAIN__ \
+/* Upsilon's modules part */ \
+    MODULE_DEF_MP_QSTR_ION \
+    MODULE_DEF_MP_QSTR_KANDINSKY \
+    MODULE_DEF_MP_QSTR_MATPLOTLIB \
+    MODULE_DEF_MP_QSTR_PYPLOT \
+    MODULE_DEF_MP_QSTR_TIME \
+    MODULE_DEF_MP_QSTR_OS \
+    MODULE_DEF_MP_QSTR_TURTLE
+#else
+extern const struct _mp_obj_module_t ulab_user_cmodule;
+#undef MODULE_DEF_MP_QSTR_ULAB
+#define MODULE_DEF_MP_QSTR_ULAB { MP_ROM_QSTR(MP_QSTR_ulab), MP_ROM_PTR(&ulab_user_cmodule) },
+
+#define MICROPY_REGISTERED_MODULES \
+    MODULE_DEF_MP_QSTR_BUILTINS \
+    MODULE_DEF_MP_QSTR_CMATH \
+    MODULE_DEF_MP_QSTR_MATH \
+    MODULE_DEF_MP_QSTR_MICROPYTHON \
+    MODULE_DEF_MP_QSTR_URANDOM \
+    MODULE_DEF_MP_QSTR___MAIN__ \
+/* Upsilon's modules part */ \
+    MODULE_DEF_MP_QSTR_ION \
+    MODULE_DEF_MP_QSTR_KANDINSKY \
+    MODULE_DEF_MP_QSTR_MATPLOTLIB \
+    MODULE_DEF_MP_QSTR_PYPLOT \
+    MODULE_DEF_MP_QSTR_TIME \
+    MODULE_DEF_MP_QSTR_OS \
+    MODULE_DEF_MP_QSTR_TURTLE \
+    MODULE_DEF_MP_QSTR_ULAB
+#endif
 // MICROPY_REGISTERED_MODULES
diff --git a/python/port/genhdr/mpversion.h b/python/port/genhdr/mpversion.h
new file mode 100644
index 000000000..c247b0f79
--- /dev/null
+++ b/python/port/genhdr/mpversion.h
@@ -0,0 +1,4 @@
+// This file was generated by py/makeversionhdr.py
+#define MICROPY_GIT_TAG "v1.19.1"
+#define MICROPY_GIT_HASH "9b486340d"
+#define MICROPY_BUILD_DATE "2022-06-22"
diff --git a/python/port/genhdr/qstrdefs.in.h b/python/port/genhdr/qstrdefs.in.h
index b3d03ab3a..3fae92b55 100644
--- a/python/port/genhdr/qstrdefs.in.h
+++ b/python/port/genhdr/qstrdefs.in.h
@@ -74,6 +74,7 @@ Q(__call__)
 Q(__class__)
 Q(__contains__)
 Q(__delitem__)
+Q(__dir__)
 Q(__divmod__)
 Q(__enter__)
 Q(__eq__)
@@ -101,9 +102,7 @@ Q(__mod__)
 Q(__module__)
 Q(__mul__)
 Q(__name__)
-#if __EMSCRIPTEN__
 Q(__ne__)
-#endif
 Q(__neg__)
 Q(__new__)
 Q(__next__)
@@ -142,7 +141,6 @@ Q(all)
 Q(any)
 Q(append)
 Q(args)
-Q(argv)
 Q(asin)
 Q(asinh)
 Q(atan)
@@ -154,7 +152,6 @@ Q(bound_method)
 Q(builtins)
 Q(bytearray)
 Q(bytecode)
-Q(byteorder)
 Q(bytes)
 Q(callable)
 Q(ceil)
@@ -192,7 +189,6 @@ Q(erfc)
 Q(errno)
 Q(eval)
 Q(exec)
-Q(exit)
 Q(exp)
 Q(expm1)
 Q(extend)
@@ -223,14 +219,12 @@ Q(heap_unlock)
 Q(hex)
 Q(id)
 Q(imag)
-Q(implementation)
 Q(index)
 Q(input)
 Q(insert)
 Q(int)
 Q(intersection)
 Q(intersection_update)
-Q(ion)
 Q(isalpha)
 Q(isdigit)
 Q(isdisjoint)
@@ -248,7 +242,6 @@ Q(items)
 Q(iter)
 Q(iterator)
 Q(join)
-Q(kandinsky)
 Q(kbd_intr)
 Q(key)
 Q(keys)
@@ -265,23 +258,18 @@ Q(lower)
 Q(lstrip)
 Q(map)
 Q(math)
-Q(matplotlib)
-Q(matplotlib.pyplot)
 Q(max)
 Q(maximum_space_recursion_space_depth_space_exceeded)
 Q(micropython)
 Q(min)
 Q(modf)
 Q(module)
-Q(modules)
 Q(next)
 Q(object)
 Q(oct)
 Q(open)
 Q(opt_level)
 Q(ord)
-Q(os)
-Q(path)
 Q(pend_throw)
 Q(phase)
 Q(pi)
@@ -290,7 +278,6 @@ Q(pop)
 Q(popitem)
 Q(pow)
 Q(print)
-Q(print_exception)
 Q(property)
 Q(radians)
 Q(randint)
@@ -334,26 +321,20 @@ Q(sum)
 Q(super)
 Q(symmetric_difference)
 Q(symmetric_difference_update)
-Q(sys)
 Q(tan)
 Q(tanh)
 Q(throw)
-Q(time)
 Q(to_bytes)
 Q(trunc)
 Q(tuple)
-Q(turtle)
 Q(type)
 Q(uniform)
 Q(union)
 Q(update)
 Q(upper)
 Q(random)
-Q(sys)
 Q(value)
 Q(values)
-Q(version)
-Q(version_info)
 Q(zip)
 
   // Ion QSTR
@@ -643,7 +624,9 @@ Q(set_printoptions)
 Q(get_printoptions)
 Q(ndinfo)
 Q(arange)
+Q(compress)
 Q(concatenate)
+Q(delete)
 Q(diag)
 Q(empty)
 Q(eye)
@@ -706,6 +689,7 @@ Q(byteswap)
 Q(flatten)
 Q(k)
 Q(tobytes)
+Q(tolist)
 Q(M)
 Q(ulab)
 Q(num)
diff --git a/python/port/mod/matplotlib/modmatplotlib_table.c b/python/port/mod/matplotlib/modmatplotlib_table.c
index 0178bd5e1..19714b838 100644
--- a/python/port/mod/matplotlib/modmatplotlib_table.c
+++ b/python/port/mod/matplotlib/modmatplotlib_table.c
@@ -4,13 +4,26 @@ extern const mp_obj_module_t modpyplot_module;
 
 STATIC MP_DEFINE_CONST_FUN_OBJ_0(modmatplotlib___init___obj, modmatplotlib___init__);
 
-STATIC const mp_rom_map_elem_t modmatplotlib_module_globals_table[] = {
+// Define the module table as non-const, because MicroPython needs to be able to modify it.
+STATIC mp_rom_map_elem_t modmatplotlib_module_globals_table[] = {
   { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_matplotlib) },
   { MP_ROM_QSTR(MP_QSTR___init__), MP_ROM_PTR(&modmatplotlib___init___obj) },
   { MP_ROM_QSTR(MP_QSTR_pyplot), MP_ROM_PTR(&modpyplot_module) }
 };
 
-STATIC MP_DEFINE_CONST_DICT(modmatplotlib_module_globals, modmatplotlib_module_globals_table);
+// The globals dict is spelled out by hand, and is neither const nor flagged as fixed, because MicroPython needs to be able to dynamically add attributes to the module.
+mp_obj_dict_t modmatplotlib_module_globals = {
+    .base = { &mp_type_dict },
+    .map = {
+        .all_keys_are_qstrs = 1,
+        .is_fixed = 0,
+        .is_ordered = 1,
+        .used = MP_ARRAY_SIZE(modmatplotlib_module_globals_table),
+        .alloc = MP_ARRAY_SIZE(modmatplotlib_module_globals_table),
+        .table = (mp_map_elem_t *)modmatplotlib_module_globals_table,
+    },
+};
+
 
 const mp_obj_module_t modmatplotlib_module = {
   .base = { &mp_type_module },
diff --git a/python/port/mod/ulab/micropython.cmake b/python/port/mod/ulab/micropython.cmake
new file mode 100644
index 000000000..66890c0db
--- /dev/null
+++ b/python/port/mod/ulab/micropython.cmake
@@ -0,0 +1,18 @@
+add_library(usermod_ulab INTERFACE)
+
+file(GLOB_RECURSE ULAB_SOURCES ${CMAKE_CURRENT_LIST_DIR}/*.c)
+
+target_sources(usermod_ulab INTERFACE
+    ${ULAB_SOURCES}
+)
+
+target_include_directories(usermod_ulab INTERFACE
+    ${CMAKE_CURRENT_LIST_DIR}
+)
+
+target_compile_definitions(usermod_ulab INTERFACE
+    MODULE_ULAB_ENABLED=1
+)
+
+target_link_libraries(usermod INTERFACE usermod_ulab)
+
diff --git a/python/port/mod/ulab/micropython.mk b/python/port/mod/ulab/micropython.mk
new file mode 100644
index 000000000..f36d1d611
--- /dev/null
+++ b/python/port/mod/ulab/micropython.mk
@@ -0,0 +1,39 @@
+
+USERMODULES_DIR := $(USERMOD_DIR)
+
+# Add all C files to SRC_USERMOD.
+SRC_USERMOD += $(USERMODULES_DIR)/scipy/linalg/linalg.c
+SRC_USERMOD += $(USERMODULES_DIR)/scipy/optimize/optimize.c
+SRC_USERMOD += $(USERMODULES_DIR)/scipy/signal/signal.c
+SRC_USERMOD += $(USERMODULES_DIR)/scipy/special/special.c
+SRC_USERMOD += $(USERMODULES_DIR)/ndarray_operators.c
+SRC_USERMOD += $(USERMODULES_DIR)/ulab_tools.c
+SRC_USERMOD += $(USERMODULES_DIR)/ndarray.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/ndarray/ndarray_iter.c
+SRC_USERMOD += $(USERMODULES_DIR)/ndarray_properties.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/approx.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/compare.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/carray/carray.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/carray/carray_tools.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/create.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/fft/fft.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/fft/fft_tools.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/filter.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/io/io.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/linalg/linalg.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/linalg/linalg_tools.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/numerical.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/poly.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/stats.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/transform.c
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/vector.c
+
+SRC_USERMOD += $(USERMODULES_DIR)/numpy/numpy.c
+SRC_USERMOD += $(USERMODULES_DIR)/scipy/scipy.c
+SRC_USERMOD += $(USERMODULES_DIR)/user/user.c
+SRC_USERMOD += $(USERMODULES_DIR)/utils/utils.c
+SRC_USERMOD += $(USERMODULES_DIR)/ulab.c
+
+CFLAGS_USERMOD += -I$(USERMODULES_DIR)
+
+override CFLAGS_EXTRA += -DMODULE_ULAB_ENABLED=1
diff --git a/python/port/mod/ulab/ndarray.c b/python/port/mod/ulab/ndarray.c
index 88fbcfbda..5a7abce12 100644
--- a/python/port/mod/ulab/ndarray.c
+++ b/python/port/mod/ulab/ndarray.c
@@ -6,7 +6,7 @@
  *
  * The MIT License (MIT)
  *
- * Copyright (c) 2019-2021 Zoltán Vörös
+ * Copyright (c) 2019-2022 Zoltán Vörös
  *               2020 Jeff Epler for Adafruit Industries
  *               2020 Taku Fukada
 */
@@ -25,6 +25,8 @@
 #include "ulab_tools.h"
 #include "ndarray.h"
 #include "ndarray_operators.h"
+#include "numpy/carray/carray.h"
+#include "numpy/carray/carray_tools.h"
 
 mp_uint_t ndarray_print_threshold = NDARRAY_PRINT_THRESHOLD;
 mp_uint_t ndarray_print_edgeitems = NDARRAY_PRINT_EDGEITEMS;
@@ -46,6 +48,19 @@ mp_uint_t ndarray_print_edgeitems = NDARRAY_PRINT_EDGEITEMS;
 //| https://docs.scipy.org/doc/numpy/index.html"""
 //|
 
+void ndarray_set_complex_value(void *p, size_t index, mp_obj_t value) {
+    // stores value at position index of p as a (real, imaginary) pair of consecutive floats
+    mp_float_t *item = (mp_float_t *)p + 2 * index;
+    mp_float_t re, im = MICROPY_FLOAT_CONST(0.0); // a non-complex value gets a zero imaginary part
+    if(mp_obj_is_type(value, &mp_type_complex)) {
+        mp_obj_get_complex(value, &re, &im);
+    } else {
+        re = mp_obj_get_float(value);
+    }
+    item[0] = re;
+    item[1] = im;
+}
+
 #ifdef CIRCUITPY
 void ndarray_set_value(char typecode, void *p, size_t index, mp_obj_t val_in) {
     switch (typecode) {
@@ -64,6 +79,11 @@ void ndarray_set_value(char typecode, void *p, size_t index, mp_obj_t val_in) {
         case NDARRAY_FLOAT:
             ((mp_float_t *)p)[index] = mp_obj_get_float(val_in);
             break;
+        #if ULAB_SUPPORTS_COMPLEX
+        case NDARRAY_COMPLEX:
+            ndarray_set_complex_value(p, index, val_in);
+            break;
+        #endif
     }
 }
 #endif
@@ -143,8 +163,7 @@ void ndarray_fill_array_iterable(mp_float_t *array, mp_obj_t iterable) {
 
 #if ULAB_HAS_FUNCTION_ITERATOR
 size_t *ndarray_new_coords(uint8_t ndim) {
-    size_t *coords = m_new(size_t, ndim);
-    memset(coords, 0, ndim*sizeof(size_t));
+    size_t *coords = m_new0(size_t, ndim);
     return coords;
 }
 
@@ -171,7 +190,7 @@ void ndarray_rewind_array(uint8_t ndim, uint8_t *array, size_t *shape, int32_t *
 static int32_t *strides_from_shape(size_t *shape, uint8_t dtype) {
     // returns a strides array that corresponds to a dense array with the prescribed shape
     int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
-    strides[ULAB_MAX_DIMS-1] = (int32_t)mp_binary_get_size('@', dtype, NULL);
+    strides[ULAB_MAX_DIMS-1] = (int32_t)ulab_binary_get_size(dtype);
     for(uint8_t i=ULAB_MAX_DIMS; i > 1; i--) {
         strides[i-2] = strides[i-1] * shape[i-1];
     }
@@ -231,7 +250,13 @@ void ndarray_dtype_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kin
         mp_print_str(print, "uint16')");
     } else if(self->dtype == NDARRAY_INT16) {
         mp_print_str(print, "int16')");
-    } else {
+    }
+    #if ULAB_SUPPORTS_COMPLEX
+    else if(self->dtype == NDARRAY_COMPLEX) {
+        mp_print_str(print, "complex')");
+    }
+    #endif
+    else {
         #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
         mp_print_str(print, "float32')");
         #else
@@ -280,7 +305,13 @@ mp_obj_t ndarray_dtype_make_new(const mp_obj_type_t *type, size_t n_args, size_t
                 _dtype = NDARRAY_INT16;
             } else if(memcmp(_dtype_, "float", 5) == 0) {
                 _dtype = NDARRAY_FLOAT;
-            } else {
+            }
+            #if ULAB_SUPPORTS_COMPLEX
+            else if(memcmp(_dtype_, "complex", 7) == 0) {
+                _dtype = NDARRAY_COMPLEX;
+            }
+            #endif
+            else {
                 mp_raise_TypeError(translate("data type not understood"));
             }
         }
@@ -308,7 +339,11 @@ mp_obj_t ndarray_dtype(mp_obj_t self_in) {
         GET_STR_DATA_LEN(self_in, _dtype, len);
         if((len != 1) || ((*_dtype != NDARRAY_BOOL) && (*_dtype != NDARRAY_UINT8)
             && (*_dtype != NDARRAY_INT8) && (*_dtype != NDARRAY_UINT16)
-            && (*_dtype != NDARRAY_INT16) && (*_dtype != NDARRAY_FLOAT))) {
+            && (*_dtype != NDARRAY_INT16) && (*_dtype != NDARRAY_FLOAT)
+            #if ULAB_SUPPORTS_COMPLEX
+                && (*_dtype != NDARRAY_COMPLEX)
+            #endif
+        )) {
             mp_raise_TypeError(translate("data type not understood"));
         }
         dtype = *_dtype;
@@ -351,6 +386,14 @@ MP_DEFINE_CONST_FUN_OBJ_0(ndarray_get_printoptions_obj, ndarray_get_printoptions
 mp_obj_t ndarray_get_item(ndarray_obj_t *ndarray, void *array) {
     // returns a proper micropython object from an array
     if(!ndarray->boolean) {
+        #if ULAB_SUPPORTS_COMPLEX
+        if(ndarray->dtype == NDARRAY_COMPLEX) {
+            mp_float_t *c = (mp_float_t *)array;
+            mp_float_t real = *c++;
+            mp_float_t imag = *c;
+            return mp_obj_new_complex(real, imag);
+        }
+        #endif
         return mp_binary_get_val_array(ndarray->dtype, array, 0);
     } else {
         if(*(uint8_t *)array) {
@@ -361,32 +404,55 @@ mp_obj_t ndarray_get_item(ndarray_obj_t *ndarray, void *array) {
     }
 }
 
-static void ndarray_print_row(const mp_print_t *print, ndarray_obj_t * ndarray, uint8_t *array, size_t stride, size_t n) {
+static void ndarray_print_element(const mp_print_t *print, ndarray_obj_t *ndarray, uint8_t *array) {
+    // prints the single item that array points to, in its repr form
+    #if ULAB_SUPPORTS_COMPLEX
+    if(ndarray->dtype == NDARRAY_COMPLEX) {
+        // the real part is read from the first half of the item, the imaginary part from the second
+        const mp_float_t real = *(mp_float_t *)array;
+        const mp_float_t imag = *(mp_float_t *)(array + ndarray->itemsize / 2);
+        mp_obj_print_helper(print, mp_obj_new_float(real), PRINT_REPR);
+        // an explicit "+" separates the two parts, unless the imaginary part
+        // compares below zero (a NaN imaginary part gets the "+", too)
+        if(imag >= MICROPY_FLOAT_CONST(0.0) || isnan(imag)) {
+            mp_print_str(print, "+");
+        }
+        mp_obj_print_helper(print, mp_obj_new_float(imag), PRINT_REPR);
+        mp_print_str(print, "j");
+        return;
+    }
+    #endif
+    // every other dtype is turned into a micropython object by ndarray_get_item,
+    // and printed through the generic helper
+    mp_obj_print_helper(print, ndarray_get_item(ndarray, array), PRINT_REPR);
+}
+
+static void ndarray_print_row(const mp_print_t *print, ndarray_obj_t *ndarray, uint8_t *array, int32_t stride, size_t n) {
     if(n == 0) {
         return;
     }
     mp_print_str(print, "[");
     if((n <= ndarray_print_threshold) || (n <= 2*ndarray_print_edgeitems)) { // if the array is short, print everything
-        mp_obj_print_helper(print, ndarray_get_item(ndarray, array), PRINT_REPR);
+        ndarray_print_element(print, ndarray, array);
         array += stride;
         for(size_t i=1; i < n; i++, array += stride) {
             mp_print_str(print, ", ");
-            mp_obj_print_helper(print, ndarray_get_item(ndarray, array), PRINT_REPR);
+            ndarray_print_element(print, ndarray, array);
         }
     } else {
-        mp_obj_print_helper(print, ndarray_get_item(ndarray, array), PRINT_REPR);
+        ndarray_print_element(print, ndarray, array);
         array += stride;
         for(size_t i=1; i < ndarray_print_edgeitems; i++, array += stride) {
             mp_print_str(print, ", ");
-            mp_obj_print_helper(print, ndarray_get_item(ndarray, array), PRINT_REPR);
+            ndarray_print_element(print, ndarray, array);
         }
         mp_printf(print, ", ..., ");
-        array += stride * (n - 2 *  ndarray_print_edgeitems);
-        mp_obj_print_helper(print, ndarray_get_item(ndarray, array), PRINT_REPR);
+        array += stride * (n - 2 * ndarray_print_edgeitems);
+        ndarray_print_element(print, ndarray, array);
         array += stride;
         for(size_t i=1; i < ndarray_print_edgeitems; i++, array += stride) {
             mp_print_str(print, ", ");
-            mp_obj_print_helper(print, ndarray_get_item(ndarray, array), PRINT_REPR);
+            ndarray_print_element(print, ndarray, array);
         }
     }
     mp_print_str(print, "]");
@@ -459,21 +525,28 @@ void ndarray_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t ki
         ndarray_print_bracket(print, 0, self->shape[ULAB_MAX_DIMS-4], "]");
         #endif
     }
+    mp_print_str(print, ", dtype=");
     if(self->boolean) {
-        mp_print_str(print, ", dtype=bool)");
+        mp_print_str(print, "bool)");
     } else if(self->dtype == NDARRAY_UINT8) {
-        mp_print_str(print, ", dtype=uint8)");
+        mp_print_str(print, "uint8)");
     } else if(self->dtype == NDARRAY_INT8) {
-        mp_print_str(print, ", dtype=int8)");
+        mp_print_str(print, "int8)");
     } else if(self->dtype == NDARRAY_UINT16) {
-        mp_print_str(print, ", dtype=uint16)");
+        mp_print_str(print, "uint16)");
     } else if(self->dtype == NDARRAY_INT16) {
-        mp_print_str(print, ", dtype=int16)");
-    } else {
+        mp_print_str(print, "int16)");
+    }
+    #if ULAB_SUPPORTS_COMPLEX
+    else if(self->dtype == NDARRAY_COMPLEX) {
+        mp_print_str(print, "complex)");
+    }
+    #endif /* ULAB_SUPPORTS_COMPLEX */
+    else {
         #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
-        mp_print_str(print, ", dtype=float32)");
+        mp_print_str(print, "float32)");
         #else
-        mp_print_str(print, ", dtype=float64)");
+        mp_print_str(print, "float64)");
         #endif
     }
 }
@@ -485,7 +558,6 @@ void ndarray_assign_elements(ndarray_obj_t *ndarray, mp_obj_t iterable, uint8_t
         uint8_t *array = (uint8_t *)ndarray->array;
         array += *idx;
         while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
-            // TODO: this might be wrong here: we have to check for the trueness of item
             if(mp_obj_is_true(item)) {
                 *array = 1;
             }
@@ -494,7 +566,19 @@ void ndarray_assign_elements(ndarray_obj_t *ndarray, mp_obj_t iterable, uint8_t
         }
     } else {
         while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
-            ndarray_set_value(dtype, ndarray->array, (*idx)++, item);
+            #if ULAB_SUPPORTS_COMPLEX
+                mp_float_t real;
+                mp_float_t imag;
+                if(dtype == NDARRAY_COMPLEX) {
+                    mp_obj_get_complex(item, &real, &imag);
+                    ndarray_set_value(NDARRAY_FLOAT, ndarray->array, (*idx)++, mp_obj_new_float(real));
+                    ndarray_set_value(NDARRAY_FLOAT, ndarray->array, (*idx)++, mp_obj_new_float(imag));
+                } else {
+                    ndarray_set_value(dtype, ndarray->array, (*idx)++, item);
+                }
+            #else
+                ndarray_set_value(dtype, ndarray->array, (*idx)++, item);
+            #endif
         }
     }
 }
@@ -518,7 +602,7 @@ ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides
     ndarray->boolean = dtype == NDARRAY_BOOL ? NDARRAY_BOOLEAN : NDARRAY_NUMERIC;
     ndarray->ndim = ndim;
     ndarray->len = ndim == 0 ? 0 : 1;
-    ndarray->itemsize = mp_binary_get_size('@', ndarray->dtype, NULL);
+    ndarray->itemsize = ulab_binary_get_size(dtype);
     int32_t *_strides;
     if(strides == NULL) {
         _strides = strides_from_shape(shape, ndarray->dtype);
@@ -533,10 +617,9 @@ ndarray_obj_t *ndarray_new_ndarray(uint8_t ndim, size_t *shape, int32_t *strides
 
     // if the length is 0, still allocate a single item, so that contractions can be handled
     size_t len = ndarray->itemsize * MAX(1, ndarray->len);
-    uint8_t *array = m_new(byte, len);
+    uint8_t *array = m_new0(byte, len);
     // this should set all elements to 0, irrespective of the of the dtype (all bits are zero)
     // we could, perhaps, leave this step out, and initialise the array only, when needed
-    memset(array, 0, len);
     ndarray->array = array;
     ndarray->origin = array;
     return ndarray;
@@ -546,7 +629,7 @@ ndarray_obj_t *ndarray_new_dense_ndarray(uint8_t ndim, size_t *shape, uint8_t dt
     // creates a dense array, i.e., one, where the strides are derived directly from the shapes
     // the function should work in the general n-dimensional case
     int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
-    strides[ULAB_MAX_DIMS-1] = dtype == NDARRAY_BOOL ? 1 : mp_binary_get_size('@', dtype, NULL);
+    strides[ULAB_MAX_DIMS-1] = (int32_t)ulab_binary_get_size(dtype);
     for(size_t i=ULAB_MAX_DIMS; i > 1; i--) {
         strides[i-2] = strides[i-1] * MAX(1, shape[i-1]);
     }
@@ -567,13 +650,18 @@ ndarray_obj_t *ndarray_new_ndarray_from_tuple(mp_obj_tuple_t *_shape, uint8_t dt
     return ndarray_new_dense_ndarray(_shape->len, shape, dtype);
 }
 
-void ndarray_copy_array(ndarray_obj_t *source, ndarray_obj_t *target) {
+void ndarray_copy_array(ndarray_obj_t *source, ndarray_obj_t *target, uint8_t shift) {
     // TODO: if the array is dense, the content could be copied in a single pass
     // copies the content of source->array into a new dense void pointer
     // it is assumed that the dtypes in source and target are the same
     // Since the target is a new array, it is supposed to be dense
     uint8_t *sarray = (uint8_t *)source->array;
     uint8_t *tarray = (uint8_t *)target->array;
+    #if ULAB_SUPPORTS_COMPLEX
+    if(source->dtype == NDARRAY_COMPLEX) {
+        sarray += shift;
+    }
+    #endif
 
     #if ULAB_MAX_DIMS > 3
     size_t i = 0;
@@ -589,7 +677,7 @@ void ndarray_copy_array(ndarray_obj_t *source, ndarray_obj_t *target) {
             #endif
                 size_t l = 0;
                 do {
-                    memcpy(tarray, sarray, source->itemsize);
+                    memcpy(tarray, sarray, target->itemsize);
                     tarray += target->itemsize;
                     sarray += source->strides[ULAB_MAX_DIMS - 1];
                     l++;
@@ -645,13 +733,92 @@ ndarray_obj_t *ndarray_copy_view(ndarray_obj_t *source) {
 
     uint8_t dtype = source->dtype;
     if(source->boolean) {
-        dtype = NDARRAY_BOOLEAN;
+        dtype = NDARRAY_BOOL;
     }
     ndarray_obj_t *ndarray = ndarray_new_ndarray(source->ndim, source->shape, strides, dtype);
-    ndarray_copy_array(source, ndarray);
+    ndarray_copy_array(source, ndarray, 0);
     return ndarray;
 }
 
+ndarray_obj_t *ndarray_copy_view_convert_type(ndarray_obj_t *source, uint8_t dtype) {
+    // Returns a new dense copy of source with its items converted to dtype. If dtype
+    // equals source->dtype, this is the same as ndarray_copy_view. Raises a TypeError,
+    // if source is complex and dtype is not, since that conversion is not supported.
+    if(dtype == source->dtype) {
+        return ndarray_copy_view(source);
+    }
+    // floats are rounded only, if the target holds integers: they can't directly be converted
+    uint8_t round_floats = (source->dtype == NDARRAY_FLOAT) && (dtype != NDARRAY_FLOAT);
+    #if ULAB_SUPPORTS_COMPLEX
+    // the dtypes differ at this point, so a complex source can only mean a non-complex
+    // target: bail out before anything is allocated, instead of on the first item
+    if(source->dtype == NDARRAY_COMPLEX) {
+        mp_raise_TypeError(translate("cannot convert complex type"));
+    }
+    // a complex target must keep the fractional part of a float source in its real part
+    round_floats = round_floats && (dtype != NDARRAY_COMPLEX);
+    #endif
+    ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, dtype);
+    uint8_t *sarray = (uint8_t *)source->array;
+    uint8_t *array = (uint8_t *)ndarray->array;
+
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                do {
+                    mp_obj_t item;
+                    if(round_floats) {
+                        mp_float_t f = ndarray_get_float_value(sarray, source->dtype);
+                        item = mp_obj_new_int((int32_t)MICROPY_FLOAT_C_FUN(round)(f));
+                    } else {
+                        item = mp_binary_get_val_array(source->dtype, sarray, 0);
+                    }
+                    #if ULAB_SUPPORTS_COMPLEX
+                    if(dtype == NDARRAY_COMPLEX) {
+                        // the source is real here: its value goes into the first float of the item
+                        ndarray_set_value(NDARRAY_FLOAT, array, 0, item);
+                    } else {
+                        ndarray_set_value(dtype, array, 0, item);
+                    }
+                    #else
+                    ndarray_set_value(dtype, array, 0, item);
+                    #endif
+                    array += ndarray->itemsize;
+                    sarray += source->strides[ULAB_MAX_DIMS - 1];
+                    l++;
+                } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+            #if ULAB_MAX_DIMS > 1
+                sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                sarray += source->strides[ULAB_MAX_DIMS - 2];
+                k++;
+            } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 2
+            sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+            sarray += source->strides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+        sarray += source->strides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+    #endif
+    // the declared return type is ndarray_obj_t *, so hand back the pointer itself, as ndarray_copy_view does
+    return ndarray;
+}
+
 #if NDARRAY_HAS_BYTESWAP
 mp_obj_t ndarray_byteswap(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
     // changes the endiannes of an array
@@ -849,7 +1016,7 @@ STATIC uint8_t ndarray_init_helper(size_t n_args, const mp_obj_t *pos_args, mp_m
     if(mp_obj_is_type(args[1].u_obj, &ulab_dtype_type)) {
         dtype_obj_t *dtype = MP_OBJ_TO_PTR(args[1].u_obj);
         _dtype = dtype->dtype;
-    } else { // this must be an integer defined as a class constant (ulab.uint8 etc.)
+    } else { // this must be an integer defined as a class constant (ulab.numpy.uint8 etc.)
         _dtype = mp_obj_get_int(args[1].u_obj);
     }
     #else
@@ -863,58 +1030,7 @@ STATIC mp_obj_t ndarray_make_new_core(const mp_obj_type_t *type, size_t n_args,
 
     if(mp_obj_is_type(args[0], &ulab_ndarray_type)) {
         ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0]);
-        if(dtype == source->dtype) {
-            return ndarray_copy_view(source);
-        }
-        ndarray_obj_t *target = ndarray_new_dense_ndarray(source->ndim, source->shape, dtype);
-        uint8_t *sarray = (uint8_t *)source->array;
-        uint8_t *tarray = (uint8_t *)target->array;
-        #if ULAB_MAX_DIMS > 3
-        size_t i = 0;
-        do {
-        #endif
-            #if ULAB_MAX_DIMS > 2
-            size_t j = 0;
-            do {
-            #endif
-                #if ULAB_MAX_DIMS > 1
-                size_t k = 0;
-                do {
-                #endif
-                    size_t l = 0;
-                    do {
-                        mp_obj_t item;
-                        if((source->dtype == NDARRAY_FLOAT) && (dtype != NDARRAY_FLOAT)) {
-                            // floats must be treated separately, because they can't directly be converted to integer types
-                            mp_float_t f = ndarray_get_float_value(sarray, source->dtype);
-                            item = mp_obj_new_int((int32_t)MICROPY_FLOAT_C_FUN(floor)(f));
-                        } else {
-                            item = mp_binary_get_val_array(source->dtype, sarray, 0);
-                        }
-                        ndarray_set_value(dtype, tarray, 0, item);
-                        tarray += target->itemsize;
-                        sarray += source->strides[ULAB_MAX_DIMS - 1];
-                        l++;
-                    } while(l < source->shape[ULAB_MAX_DIMS - 1]);
-                #if ULAB_MAX_DIMS > 1
-                    sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
-                    sarray += source->strides[ULAB_MAX_DIMS - 2];
-                    k++;
-                } while(k < source->shape[ULAB_MAX_DIMS - 2]);
-                #endif
-            #if ULAB_MAX_DIMS > 2
-                sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
-                sarray += source->strides[ULAB_MAX_DIMS - 3];
-                j++;
-            } while(j < source->shape[ULAB_MAX_DIMS - 3]);
-            #endif
-        #if ULAB_MAX_DIMS > 3
-            sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
-            sarray += source->strides[ULAB_MAX_DIMS - 4];
-            i++;
-        } while(i < source->shape[ULAB_MAX_DIMS - 4]);
-        #endif
-        return MP_OBJ_FROM_PTR(target);
+        return MP_OBJ_FROM_PTR(ndarray_copy_view_convert_type(source, dtype));
     } else {
         // assume that the input is an iterable
         return MP_OBJ_FROM_PTR(ndarray_from_iterable(args[0], dtype));
@@ -942,8 +1058,7 @@ bool ndarray_can_broadcast(ndarray_obj_t *lhs, ndarray_obj_t *rhs, uint8_t *ndim
     //
     // 1. the two shapes are either equal
     // 2. one of the shapes is 1
-    memset(lstrides, 0, sizeof(size_t)*ULAB_MAX_DIMS);
-    memset(rstrides, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+
     lstrides[ULAB_MAX_DIMS - 1] = lhs->strides[ULAB_MAX_DIMS - 1];
     rstrides[ULAB_MAX_DIMS - 1] = rhs->strides[ULAB_MAX_DIMS - 1];
     for(uint8_t i=ULAB_MAX_DIMS; i > 0; i--) {
@@ -976,7 +1091,7 @@ bool ndarray_can_broadcast_inplace(ndarray_obj_t *lhs, ndarray_obj_t *rhs, int32
     //
     // 1. the two shapes are either equal
     // 2. the shapes on the right hand side is 1
-    memset(rstrides, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+
     rstrides[ULAB_MAX_DIMS - 1] = rhs->strides[ULAB_MAX_DIMS - 1];
     for(uint8_t i=ULAB_MAX_DIMS; i > 0; i--) {
         if((lhs->shape[i-1] == rhs->shape[i-1]) || (rhs->shape[i-1] == 0) || (rhs->shape[i-1] == 1)) {
@@ -1024,10 +1139,8 @@ static mp_bound_slice_t generate_slice(mp_int_t n, mp_obj_t index) {
 }
 
 static ndarray_obj_t *ndarray_view_from_slices(ndarray_obj_t *ndarray, mp_obj_tuple_t *tuple) {
-    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
-    memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
-    int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
-    memset(strides, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+    size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+    int32_t *strides = m_new0(int32_t, ULAB_MAX_DIMS);
 
     uint8_t ndim = ndarray->ndim;
 
@@ -1069,83 +1182,71 @@ void ndarray_assign_view(ndarray_obj_t *view, ndarray_obj_t *values) {
         return;
     }
     uint8_t ndim = 0;
-    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
-    int32_t *lstrides = m_new(int32_t, ULAB_MAX_DIMS);
-    int32_t *rstrides = m_new(int32_t, ULAB_MAX_DIMS);
+    size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+    int32_t *lstrides = m_new0(int32_t, ULAB_MAX_DIMS);
+    int32_t *rstrides = m_new0(int32_t, ULAB_MAX_DIMS);
     if(!ndarray_can_broadcast(view, values, &ndim, shape, lstrides, rstrides)) {
         mp_raise_ValueError(translate("operands could not be broadcast together"));
-        m_del(size_t, shape, ULAB_MAX_DIMS);
-        m_del(int32_t, lstrides, ULAB_MAX_DIMS);
-        m_del(int32_t, rstrides, ULAB_MAX_DIMS);
+    } else {
+
+        ndarray_obj_t *ndarray = ndarray_copy_view_convert_type(values, view->dtype);
+        // re-calculate rstrides, since the copy operation might have changed the directions of the strides
+        ndarray_can_broadcast(view, ndarray, &ndim, shape, lstrides, rstrides);
+        uint8_t *rarray = (uint8_t *)ndarray->array;
+
+
+        uint8_t *larray = (uint8_t *)view->array;
+
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        memcpy(larray, rarray, view->itemsize);
+                        larray += lstrides[ULAB_MAX_DIMS - 1];
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l <  view->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    larray -= lstrides[ULAB_MAX_DIMS - 1] * view->shape[ULAB_MAX_DIMS-1];
+                    larray += lstrides[ULAB_MAX_DIMS - 2];
+                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * view->shape[ULAB_MAX_DIMS-1];
+                    rarray += rstrides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k <  view->shape[ULAB_MAX_DIMS - 2]);
+                #endif
+            #if ULAB_MAX_DIMS > 2
+                larray -= lstrides[ULAB_MAX_DIMS - 2] * view->shape[ULAB_MAX_DIMS-2];
+                larray += lstrides[ULAB_MAX_DIMS - 3];
+                rarray -= rstrides[ULAB_MAX_DIMS - 2] * view->shape[ULAB_MAX_DIMS-2];
+                rarray += rstrides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j <  view->shape[ULAB_MAX_DIMS - 3]);
+            #endif
+        #if ULAB_MAX_DIMS > 3
+            larray -= lstrides[ULAB_MAX_DIMS - 3] * view->shape[ULAB_MAX_DIMS-3];
+            larray += lstrides[ULAB_MAX_DIMS - 4];
+            rarray -= rstrides[ULAB_MAX_DIMS - 3] * view->shape[ULAB_MAX_DIMS-3];
+            rarray += rstrides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i <  view->shape[ULAB_MAX_DIMS - 4]);
+        #endif
     }
 
-    uint8_t *rarray = (uint8_t *)values->array;
-    // since in ASSIGNMENT_LOOP the array has a type, we have to divide the strides by the itemsize
-    for(uint8_t i=0; i < ULAB_MAX_DIMS; i++) {
-        lstrides[i] /= view->itemsize;
-    }
+    m_del(size_t, shape, ULAB_MAX_DIMS);
+    m_del(int32_t, lstrides, ULAB_MAX_DIMS);
+    m_del(int32_t, rstrides, ULAB_MAX_DIMS);
 
-    if(view->dtype == NDARRAY_UINT8) {
-        if(values->dtype == NDARRAY_UINT8) {
-            ASSIGNMENT_LOOP(view, uint8_t, uint8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT8) {
-            ASSIGNMENT_LOOP(view, uint8_t, int8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_UINT16) {
-            ASSIGNMENT_LOOP(view, uint8_t, uint16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT16) {
-            ASSIGNMENT_LOOP(view, uint8_t, int16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_FLOAT) {
-            ASSIGNMENT_LOOP(view, uint8_t, mp_float_t, lstrides, rarray, rstrides);
-        }
-    } else if(view->dtype == NDARRAY_INT8) {
-        if(values->dtype == NDARRAY_UINT8) {
-            ASSIGNMENT_LOOP(view, int8_t, uint8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT8) {
-            ASSIGNMENT_LOOP(view, int8_t, int8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_UINT16) {
-            ASSIGNMENT_LOOP(view, int8_t, uint16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT16) {
-            ASSIGNMENT_LOOP(view, int8_t, int16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_FLOAT) {
-            ASSIGNMENT_LOOP(view, int8_t, mp_float_t, lstrides, rarray, rstrides);
-        }
-    } else if(view->dtype == NDARRAY_UINT16) {
-        if(values->dtype == NDARRAY_UINT8) {
-            ASSIGNMENT_LOOP(view, uint16_t, uint8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT8) {
-            ASSIGNMENT_LOOP(view, uint16_t, int8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_UINT16) {
-            ASSIGNMENT_LOOP(view, uint16_t, uint16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT16) {
-            ASSIGNMENT_LOOP(view, uint16_t, int16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_FLOAT) {
-            ASSIGNMENT_LOOP(view, uint16_t, mp_float_t, lstrides, rarray, rstrides);
-        }
-    } else if(view->dtype == NDARRAY_INT16) {
-        if(values->dtype == NDARRAY_UINT8) {
-            ASSIGNMENT_LOOP(view, int16_t, uint8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT8) {
-            ASSIGNMENT_LOOP(view, int16_t, int8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_UINT16) {
-            ASSIGNMENT_LOOP(view, int16_t, uint16_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT16) {
-            ASSIGNMENT_LOOP(view, int16_t, int16_t,  lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_FLOAT) {
-            ASSIGNMENT_LOOP(view, int16_t, mp_float_t,  lstrides, rarray, rstrides);
-        }
-    } else { // the dtype must be an mp_float_t now
-        if(values->dtype == NDARRAY_UINT8) {
-            ASSIGNMENT_LOOP(view, mp_float_t, uint8_t, lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT8) {
-            ASSIGNMENT_LOOP(view, mp_float_t, int8_t,  lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_UINT16) {
-            ASSIGNMENT_LOOP(view, mp_float_t, uint16_t,  lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_INT16) {
-            ASSIGNMENT_LOOP(view, mp_float_t, int16_t,  lstrides, rarray, rstrides);
-        } else if(values->dtype == NDARRAY_FLOAT) {
-            ASSIGNMENT_LOOP(view, mp_float_t, mp_float_t,  lstrides, rarray, rstrides);
-        }
-    }
+    return;
 }
 
 static mp_obj_t ndarray_from_boolean_index(ndarray_obj_t *ndarray, ndarray_obj_t *index) {
@@ -1181,16 +1282,16 @@ static mp_obj_t ndarray_assign_from_boolean_index(ndarray_obj_t *ndarray, ndarra
     // assigns values to a Boolean-indexed array
     // first we have to find out how many trues there are
     uint8_t *iarray = (uint8_t *)index->array;
+    size_t istride = index->strides[ULAB_MAX_DIMS - 1];
     size_t count = 0;
     for(size_t i=0; i < index->len; i++) {
         count += *iarray;
-        iarray += index->strides[ULAB_MAX_DIMS - 1];
+        iarray += istride;
     }
     // re-wind the index array
     iarray = index->array;
     uint8_t *varray = (uint8_t *)values->array;
     size_t vstride;
-    size_t istride = index->strides[ULAB_MAX_DIMS - 1];
 
     if(count == values->len) {
         // there are as many values as true indices
@@ -1199,65 +1300,92 @@ static mp_obj_t ndarray_assign_from_boolean_index(ndarray_obj_t *ndarray, ndarra
         // there is a single value
         vstride = 0;
     }
+
+    #if ULAB_SUPPORTS_COMPLEX
+    if(values->dtype == NDARRAY_COMPLEX) { // complex values are only accepted by a complex target
+        if(ndarray->dtype != NDARRAY_COMPLEX) {
+            mp_raise_TypeError(translate("cannot convert complex to dtype"));
+        } else {
+            uint8_t *array = (uint8_t *)ndarray->array;
+            for(size_t i = 0; i < ndarray->len; i++) {
+                if(*iarray) { // copy one whole item wherever the index is true
+                    memcpy(array, varray, ndarray->itemsize);
+                    varray += vstride;
+                }
+                array += ndarray->strides[ULAB_MAX_DIMS - 1];
+                iarray += istride;
+            }
+            return MP_OBJ_FROM_PTR(ndarray);
+        }
+    }
+    #endif
+
+    int32_t lstrides = ndarray->strides[ULAB_MAX_DIMS - 1] / ndarray->itemsize;
+
     if(ndarray->dtype == NDARRAY_UINT8) {
         if(values->dtype == NDARRAY_UINT8) {
-            BOOLEAN_ASSIGNMENT_LOOP(uint8_t, uint8_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(uint8_t, uint8_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_INT8) {
-            BOOLEAN_ASSIGNMENT_LOOP(uint8_t, int8_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(uint8_t, int8_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_UINT16) {
-            BOOLEAN_ASSIGNMENT_LOOP(uint8_t, uint16_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(uint8_t, uint16_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_INT16) {
-            BOOLEAN_ASSIGNMENT_LOOP(uint8_t, int16_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(uint8_t, int16_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_FLOAT) {
-            BOOLEAN_ASSIGNMENT_LOOP(uint8_t, mp_float_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(uint8_t, mp_float_t, ndarray, lstrides, iarray, istride, varray, vstride);
         }
     } else if(ndarray->dtype == NDARRAY_INT8) {
         if(values->dtype == NDARRAY_UINT8) {
-            BOOLEAN_ASSIGNMENT_LOOP(int8_t, uint8_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(int8_t, uint8_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_INT8) {
-            BOOLEAN_ASSIGNMENT_LOOP(int8_t, int8_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(int8_t, int8_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_UINT16) {
-            BOOLEAN_ASSIGNMENT_LOOP(int8_t, uint16_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(int8_t, uint16_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_INT16) {
-            BOOLEAN_ASSIGNMENT_LOOP(int8_t, int16_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(int8_t, int16_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_FLOAT) {
-            BOOLEAN_ASSIGNMENT_LOOP(int8_t, mp_float_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(int8_t, mp_float_t, ndarray, lstrides, iarray, istride, varray, vstride);
         }
     } else if(ndarray->dtype == NDARRAY_UINT16) {
         if(values->dtype == NDARRAY_UINT8) {
-            BOOLEAN_ASSIGNMENT_LOOP(uint16_t, uint8_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(uint16_t, uint8_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_INT8) {
-            BOOLEAN_ASSIGNMENT_LOOP(uint16_t, int8_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(uint16_t, int8_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_UINT16) {
-            BOOLEAN_ASSIGNMENT_LOOP(uint16_t, uint16_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(uint16_t, uint16_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_INT16) {
-            BOOLEAN_ASSIGNMENT_LOOP(uint16_t, int16_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(uint16_t, int16_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_FLOAT) {
-            BOOLEAN_ASSIGNMENT_LOOP(uint16_t, mp_float_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(uint16_t, mp_float_t, ndarray, lstrides, iarray, istride, varray, vstride);
         }
     } else if(ndarray->dtype == NDARRAY_INT16) {
         if(values->dtype == NDARRAY_UINT8) {
-            BOOLEAN_ASSIGNMENT_LOOP(int16_t, uint8_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(int16_t, uint8_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_INT8) {
-            BOOLEAN_ASSIGNMENT_LOOP(int16_t, int8_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(int16_t, int8_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_UINT16) {
-            BOOLEAN_ASSIGNMENT_LOOP(int16_t, uint16_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(int16_t, uint16_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_INT16) {
-            BOOLEAN_ASSIGNMENT_LOOP(int16_t, int16_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(int16_t, int16_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_FLOAT) {
-            BOOLEAN_ASSIGNMENT_LOOP(int16_t, mp_float_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(int16_t, mp_float_t, ndarray, lstrides, iarray, istride, varray, vstride);
         }
     } else {
+        #if ULAB_SUPPORTS_COMPLEX
+        if(ndarray->dtype == NDARRAY_COMPLEX) {
+            lstrides *= 2;
+        }
+        #endif
         if(values->dtype == NDARRAY_UINT8) {
-            BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, uint8_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, uint8_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_INT8) {
-            BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, int8_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, int8_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_UINT16) {
-            BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, uint16_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, uint16_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_INT16) {
-            BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, int16_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, int16_t, ndarray, lstrides, iarray, istride, varray, vstride);
         } else if(values->dtype == NDARRAY_FLOAT) {
-            BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, mp_float_t, ndarray, iarray, istride, varray, vstride);
+            BOOLEAN_ASSIGNMENT_LOOP(mp_float_t, mp_float_t, ndarray, lstrides, iarray, istride, varray, vstride);
         }
     }
     return MP_OBJ_FROM_PTR(ndarray);
@@ -1272,7 +1400,7 @@ static mp_obj_t ndarray_get_slice(ndarray_obj_t *ndarray, mp_obj_t index, ndarra
         if(values == NULL) { // return value(s)
             return ndarray_from_boolean_index(ndarray, nindex);
         } else { // assign value(s)
-            ndarray_assign_from_boolean_index(ndarray, index, values);
+            ndarray_assign_from_boolean_index(ndarray, nindex, values);
         }
     }
     if(mp_obj_is_type(index, &mp_type_tuple) || mp_obj_is_int(index) || mp_obj_is_type(index, &mp_type_slice)) {
@@ -1291,7 +1419,7 @@ static mp_obj_t ndarray_get_slice(ndarray_obj_t *ndarray, mp_obj_t index, ndarra
         if(values == NULL) { // return value(s)
             // if the view has been reduced to nothing, return a single value
             if(view->ndim == 0) {
-                return mp_binary_get_val_array(view->dtype, view->array, 0);
+                return ndarray_get_item(view, view->array);
             } else {
                 return MP_OBJ_FROM_PTR(view);
             }
@@ -1525,6 +1653,32 @@ mp_obj_t ndarray_tobytes(mp_obj_t self_in) {
 MP_DEFINE_CONST_FUN_OBJ_1(ndarray_tobytes_obj, ndarray_tobytes);
 #endif
 
+#if NDARRAY_HAS_TOLIST
+// returns the nested list of the trailing `dim` axes of self, read from `array`; requires dim >= 1
+static mp_obj_t ndarray_recursive_list(ndarray_obj_t *self, uint8_t *array, uint8_t dim) {
+    int32_t stride = self->strides[ULAB_MAX_DIMS - dim];
+    size_t len = self->shape[ULAB_MAX_DIMS - dim];
+    mp_obj_list_t *list = MP_OBJ_TO_PTR(mp_obj_new_list(len, NULL));
+    for(size_t i = 0; i < len; i++) {
+        if(dim == 1) {
+            list->items[i] = ndarray_get_item(self, array);
+        } else {
+            list->items[i] = ndarray_recursive_list(self, array, dim-1);
+        }
+        array += stride;
+    }
+    return MP_OBJ_FROM_PTR(list);
+}
+
+mp_obj_t ndarray_tolist(mp_obj_t self_in) {
+    ndarray_obj_t *self = MP_OBJ_TO_PTR(self_in);
+    // ndim == 0 would make the helper index shape/strides at ULAB_MAX_DIMS, so short-circuit
+    if(self->ndim == 0) return mp_obj_new_list(0, NULL);
+    return ndarray_recursive_list(self, (uint8_t *)self->array, self->ndim);
+}
+MP_DEFINE_CONST_FUN_OBJ_1(ndarray_tolist_obj, ndarray_tolist);
+#endif
+
 // Binary operations
 ndarray_obj_t *ndarray_from_mp_obj(mp_obj_t obj, uint8_t other_type) {
     // creates an ndarray from a micropython int or float
@@ -1571,7 +1725,15 @@ ndarray_obj_t *ndarray_from_mp_obj(mp_obj_t obj, uint8_t other_type) {
         array[0] = mp_obj_get_float(obj);
     } else if(mp_obj_is_type(obj, &ulab_ndarray_type)){
         return obj;
-    } else {
+    }
+    #if ULAB_SUPPORTS_COMPLEX
+    else if(mp_obj_is_type(obj, &mp_type_complex)) {
+        ndarray = ndarray_new_linear_array(1, NDARRAY_COMPLEX);
+        mp_float_t *array = (mp_float_t *)ndarray->array;
+        mp_obj_get_complex(obj, &array[0], &array[1]);
+    }
+    #endif
+    else {
         // assume that the input is an iterable (raises an exception, if it is not the case)
         ndarray = ndarray_from_iterable(obj, NDARRAY_FLOAT);
     }
@@ -1606,9 +1768,9 @@ mp_obj_t ndarray_binary_op(mp_binary_op_t _op, mp_obj_t lobj, mp_obj_t robj) {
     }
 
     uint8_t ndim = 0;
-    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
-    int32_t *lstrides = m_new(int32_t, ULAB_MAX_DIMS);
-    int32_t *rstrides = m_new(int32_t, ULAB_MAX_DIMS);
+    size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+    int32_t *lstrides = m_new0(int32_t, ULAB_MAX_DIMS);
+    int32_t *rstrides = m_new0(int32_t, ULAB_MAX_DIMS);
     uint8_t broadcastable;
     if((op == MP_BINARY_OP_INPLACE_ADD) || (op == MP_BINARY_OP_INPLACE_MULTIPLY) || (op == MP_BINARY_OP_INPLACE_POWER) ||
         (op == MP_BINARY_OP_INPLACE_SUBTRACT) || (op == MP_BINARY_OP_INPLACE_TRUE_DIVIDE)) {
@@ -1625,7 +1787,7 @@ mp_obj_t ndarray_binary_op(mp_binary_op_t _op, mp_obj_t lobj, mp_obj_t robj) {
     // the empty arrays have to be treated separately
     uint8_t dtype = NDARRAY_INT16;
     ndarray_obj_t *nd;
-    if((lhs->ndim == 0) || (rhs->ndim == 0)) {
+    if((lhs->len == 0) || (rhs->len == 0)) {
         switch(op) {
             case MP_BINARY_OP_INPLACE_ADD:
             case MP_BINARY_OP_INPLACE_MULTIPLY:
@@ -1662,7 +1824,7 @@ mp_obj_t ndarray_binary_op(mp_binary_op_t _op, mp_obj_t lobj, mp_obj_t robj) {
             case MP_BINARY_OP_EQUAL:
             case MP_BINARY_OP_NOT_EQUAL:
                 nd = ndarray_new_linear_array(0, NDARRAY_UINT8);
-                nd->boolean = true;
+                nd->boolean = 1;
                 return MP_OBJ_FROM_PTR(nd);
 
             default:
@@ -1675,26 +1837,31 @@ mp_obj_t ndarray_binary_op(mp_binary_op_t _op, mp_obj_t lobj, mp_obj_t robj) {
         // first the in-place operators
         #if NDARRAY_HAS_INPLACE_ADD
         case MP_BINARY_OP_INPLACE_ADD:
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype);
             return ndarray_inplace_ams(lhs, rhs, rstrides, op);
             break;
         #endif
         #if NDARRAY_HAS_INPLACE_MULTIPLY
         case MP_BINARY_OP_INPLACE_MULTIPLY:
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype);
             return ndarray_inplace_ams(lhs, rhs, rstrides, op);
             break;
         #endif
         #if NDARRAY_HAS_INPLACE_POWER
         case MP_BINARY_OP_INPLACE_POWER:
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype);
             return ndarray_inplace_power(lhs, rhs, rstrides);
             break;
         #endif
         #if NDARRAY_HAS_INPLACE_SUBTRACT
         case MP_BINARY_OP_INPLACE_SUBTRACT:
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype);
             return ndarray_inplace_ams(lhs, rhs, rstrides, op);
             break;
         #endif
         #if NDARRAY_HAS_INPLACE_TRUE_DIVIDE
         case MP_BINARY_OP_INPLACE_TRUE_DIVIDE:
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype);
             return ndarray_inplace_divide(lhs, rhs, rstrides);
             break;
         #endif
@@ -1702,12 +1869,14 @@ mp_obj_t ndarray_binary_op(mp_binary_op_t _op, mp_obj_t lobj, mp_obj_t robj) {
 
         #if NDARRAY_HAS_BINARY_OP_LESS
         case MP_BINARY_OP_LESS:
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype);
             // here we simply swap the operands
             return ndarray_binary_more(rhs, lhs, ndim, shape, rstrides, lstrides, MP_BINARY_OP_MORE);
             break;
         #endif
         #if NDARRAY_HAS_BINARY_OP_LESS_EQUAL
         case MP_BINARY_OP_LESS_EQUAL:
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype);
             // here we simply swap the operands
             return ndarray_binary_more(rhs, lhs, ndim, shape, rstrides, lstrides, MP_BINARY_OP_MORE_EQUAL);
             break;
@@ -1734,11 +1903,13 @@ mp_obj_t ndarray_binary_op(mp_binary_op_t _op, mp_obj_t lobj, mp_obj_t robj) {
         #endif
         #if NDARRAY_HAS_BINARY_OP_MORE
         case MP_BINARY_OP_MORE:
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype);
             return ndarray_binary_more(lhs, rhs, ndim, shape, lstrides, rstrides, MP_BINARY_OP_MORE);
             break;
         #endif
         #if NDARRAY_HAS_BINARY_OP_MORE_EQUAL
         case MP_BINARY_OP_MORE_EQUAL:
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype);
             return ndarray_binary_more(lhs, rhs, ndim, shape, lstrides, rstrides, MP_BINARY_OP_MORE_EQUAL);
             break;
         #endif
@@ -1754,6 +1925,7 @@ mp_obj_t ndarray_binary_op(mp_binary_op_t _op, mp_obj_t lobj, mp_obj_t robj) {
         #endif
         #if NDARRAY_HAS_BINARY_OP_POWER
         case MP_BINARY_OP_POWER:
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(lhs->dtype);
             return ndarray_binary_power(lhs, rhs, ndim, shape, lstrides, rstrides);
             break;
         #endif
@@ -1773,30 +1945,44 @@ mp_obj_t ndarray_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
     switch (op) {
         #if NDARRAY_HAS_UNARY_OP_ABS
         case MP_UNARY_OP_ABS:
-            ndarray = ndarray_copy_view(self);
-            // if Boolean, NDARRAY_UINT8, or NDARRAY_UINT16, there is nothing to do
-            if(self->dtype == NDARRAY_INT8) {
-                int8_t *array = (int8_t *)ndarray->array;
-                for(size_t i=0; i < self->len; i++, array++) {
-                    if(*array < 0) *array = -(*array);
-                }
-            } else if(self->dtype == NDARRAY_INT16) {
-                int16_t *array = (int16_t *)ndarray->array;
-                for(size_t i=0; i < self->len; i++, array++) {
-                    if(*array < 0) *array = -(*array);
-                }
+            #if ULAB_SUPPORTS_COMPLEX
+            if(self->dtype == NDARRAY_COMPLEX) {
+                int32_t *strides = strides_from_shape(self->shape, NDARRAY_FLOAT);
+                ndarray_obj_t *target = ndarray_new_ndarray(self->ndim, self->shape, strides, NDARRAY_FLOAT);
+                ndarray = carray_abs(self, target);
             } else {
-                mp_float_t *array = (mp_float_t *)ndarray->array;
-                for(size_t i=0; i < self->len; i++, array++) {
-                    if(*array < 0) *array = -(*array);
+            #endif
+                ndarray = ndarray_copy_view(self);
+                // Boolean, NDARRAY_UINT8 and NDARRAY_UINT16 hold no negative values: the copy is returned as is
+                if(self->dtype == NDARRAY_INT8) {
+                    int8_t *array = (int8_t *)ndarray->array;
+                    for(size_t i=0; i < self->len; i++, array++) {
+                        if(*array < 0) *array = -(*array);
+                    }
+                } else if(self->dtype == NDARRAY_INT16) {
+                    int16_t *array = (int16_t *)ndarray->array;
+                    for(size_t i=0; i < self->len; i++, array++) {
+                        if(*array < 0) *array = -(*array);
+                    }
+                } else if(self->dtype == NDARRAY_FLOAT) { // unsigned dtypes must not be walked as floats
+                    mp_float_t *array = (mp_float_t *)ndarray->array;
+                    for(size_t i=0; i < self->len; i++, array++) {
+                        if(*array < 0) *array = -(*array);
+                    }
+                }
+            #if ULAB_SUPPORTS_COMPLEX
             }
+            #endif
             return MP_OBJ_FROM_PTR(ndarray);
             break;
         #endif
         #if NDARRAY_HAS_UNARY_OP_INVERT
         case MP_UNARY_OP_INVERT:
+            #if ULAB_SUPPORTS_COMPLEX
+            if(self->dtype == NDARRAY_FLOAT || self->dtype == NDARRAY_COMPLEX) {
+            #else
             if(self->dtype == NDARRAY_FLOAT) {
+            #endif
                 mp_raise_ValueError(translate("operation is not supported for given type"));
             }
             // we can invert the content byte by byte, no need to distinguish between different dtypes
@@ -1805,7 +1991,7 @@ mp_obj_t ndarray_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
             if(ndarray->boolean) {
                 for(size_t i=0; i < ndarray->len; i++, array++) *array = *array ^ 0x01;
             } else {
-                uint8_t itemsize = mp_binary_get_size('@', self->dtype, NULL);
+                uint8_t itemsize = ulab_binary_get_size(self->dtype);
                 for(size_t i=0; i < ndarray->len*itemsize; i++, array++) *array ^= 0xFF;
             }
             return MP_OBJ_FROM_PTR(ndarray);
@@ -1833,7 +2019,13 @@ mp_obj_t ndarray_unary_op(mp_unary_op_t op, mp_obj_t self_in) {
                 for(size_t i=0; i < self->len; i++, array++) *array = -(*array);
             } else {
                 mp_float_t *array = (mp_float_t *)ndarray->array;
-                for(size_t i=0; i < self->len; i++, array++) *array = -(*array);
+                size_t len = self->len;
+                #if ULAB_SUPPORTS_COMPLEX
+                if(self->dtype == NDARRAY_COMPLEX) {
+                    len *= 2;
+                }
+                #endif
+                for(size_t i=0; i < len; i++, array++) *array = -(*array);
             }
             return MP_OBJ_FROM_PTR(ndarray);
             break;
@@ -1887,8 +2079,8 @@ mp_obj_t ndarray_reshape_core(mp_obj_t oin, mp_obj_t _shape, bool inplace) {
     if(shape->len > ULAB_MAX_DIMS) {
         mp_raise_ValueError(translate("maximum number of dimensions is 4"));
     }
-    size_t *new_shape = m_new(size_t, ULAB_MAX_DIMS);
-    memset(new_shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
+    size_t *new_shape = m_new0(size_t, ULAB_MAX_DIMS);
+
     size_t new_length = 1;
     for(uint8_t i=0; i < shape->len; i++) {
         new_shape[ULAB_MAX_DIMS - i - 1] = mp_obj_get_int(shape->items[shape->len - i - 1]);
@@ -1914,7 +2106,7 @@ mp_obj_t ndarray_reshape_core(mp_obj_t oin, mp_obj_t _shape, bool inplace) {
             mp_raise_ValueError(translate("cannot assign new shape"));
         }
         ndarray = ndarray_new_ndarray_from_tuple(shape, source->dtype);
-        ndarray_copy_array(source, ndarray);
+        ndarray_copy_array(source, ndarray, 0);
     }
     return MP_OBJ_FROM_PTR(ndarray);
 }
diff --git a/python/port/mod/ulab/ndarray.h b/python/port/mod/ulab/ndarray.h
index 04abd9659..7fc4dc2c1 100644
--- a/python/port/mod/ulab/ndarray.h
+++ b/python/port/mod/ulab/ndarray.h
@@ -63,6 +63,8 @@ typedef struct _mp_obj_slice_t {
 void ndarray_set_value(char , void *, size_t , mp_obj_t );
 #endif
 
+void ndarray_set_complex_value(void *, size_t , mp_obj_t );
+
 #define NDARRAY_NUMERIC   0
 #define NDARRAY_BOOLEAN   1
 
@@ -77,6 +79,9 @@ enum NDARRAY_TYPE {
     NDARRAY_INT8 = 'b',
     NDARRAY_UINT16 = 'H',
     NDARRAY_INT16 = 'h',
+    #if ULAB_SUPPORTS_COMPLEX
+        NDARRAY_COMPLEX = 'c',
+    #endif
     NDARRAY_FLOAT = FLOAT_TYPECODE,
 };
 
@@ -131,6 +136,7 @@ void ndarray_assign_elements(ndarray_obj_t *, mp_obj_t , uint8_t , size_t *);
 size_t *ndarray_contract_shape(ndarray_obj_t *, uint8_t );
 int32_t *ndarray_contract_strides(ndarray_obj_t *, uint8_t );
 
+ndarray_obj_t *ndarray_from_iterable(mp_obj_t , uint8_t );
 ndarray_obj_t *ndarray_new_dense_ndarray(uint8_t , size_t *, uint8_t );
 ndarray_obj_t *ndarray_new_ndarray_from_tuple(mp_obj_tuple_t *, uint8_t );
 ndarray_obj_t *ndarray_new_ndarray(uint8_t , size_t *, int32_t *, uint8_t );
@@ -138,7 +144,8 @@ ndarray_obj_t *ndarray_new_linear_array(size_t , uint8_t );
 ndarray_obj_t *ndarray_new_view(ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t );
 bool ndarray_is_dense(ndarray_obj_t *);
 ndarray_obj_t *ndarray_copy_view(ndarray_obj_t *);
-void ndarray_copy_array(ndarray_obj_t *, ndarray_obj_t *);
+ndarray_obj_t *ndarray_copy_view_convert_type(ndarray_obj_t *, uint8_t );
+void ndarray_copy_array(ndarray_obj_t *, ndarray_obj_t *, uint8_t );
 
 MP_DECLARE_CONST_FUN_OBJ_KW(ndarray_array_constructor_obj);
 mp_obj_t ndarray_make_new(const mp_obj_type_t *, size_t , size_t , const mp_obj_t *);
@@ -185,6 +192,11 @@ mp_obj_t ndarray_tobytes(mp_obj_t );
 MP_DECLARE_CONST_FUN_OBJ_1(ndarray_tobytes_obj);
 #endif
 
+#if NDARRAY_HAS_TOBYTES /* NOTE(review): the tolist declarations are guarded by the tobytes switch - confirm whether a tolist-specific flag was intended */
+mp_obj_t ndarray_tolist(mp_obj_t );
+MP_DECLARE_CONST_FUN_OBJ_1(ndarray_tolist_obj);
+#endif
+
 #if NDARRAY_HAS_TRANSPOSE
 mp_obj_t ndarray_transpose(mp_obj_t );
 MP_DECLARE_CONST_FUN_OBJ_1(ndarray_transpose_obj);
@@ -201,15 +213,15 @@ mp_int_t ndarray_get_buffer(mp_obj_t , mp_buffer_info_t *, mp_uint_t );
 ndarray_obj_t *ndarray_from_mp_obj(mp_obj_t , uint8_t );
 
 
-#define BOOLEAN_ASSIGNMENT_LOOP(type_left, type_right, ndarray, iarray, istride, varray, vstride)\
+#define BOOLEAN_ASSIGNMENT_LOOP(type_left, type_right, ndarray, lstrides, iarray, istride, varray, vstride)\
     type_left *array = (type_left *)(ndarray)->array;\
     for(size_t i=0; i < (ndarray)->len; i++) {\
         if(*(iarray)) {\
             *array = (type_left)(*((type_right *)(varray)));\
+            (varray) += (vstride);\
         }\
-        array += (ndarray)->strides[ULAB_MAX_DIMS - 1] / (ndarray)->itemsize;\
+        array += (lstrides);\
         (iarray) += (istride);\
-        (varray) += (vstride);\
     } while(0)
 
 #if ULAB_HAS_FUNCTION_ITERATOR
@@ -634,105 +646,4 @@ ndarray_obj_t *ndarray_from_mp_obj(mp_obj_t , uint8_t );
 #endif /* ULAB_MAX_DIMS == 4 */
 #endif /* ULAB_HAS_FUNCTION_ITERATOR */
 
-
-#if ULAB_MAX_DIMS == 1
-#define ASSIGNMENT_LOOP(results, type_left, type_right, lstrides, rarray, rstrides)\
-    type_left *larray = (type_left *)(results)->array;\
-    size_t l = 0;\
-    do {\
-        *larray = (type_left)(*((type_right *)(rarray)));\
-        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
-        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
-        l++;\
-    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
-
-#endif /* ULAB_MAX_DIMS == 1 */
-
-#if ULAB_MAX_DIMS == 2
-#define ASSIGNMENT_LOOP(results, type_left, type_right, lstrides, rarray, rstrides)\
-    type_left *larray = (type_left *)(results)->array;\
-    size_t k = 0;\
-    do {\
-        size_t l = 0;\
-        do {\
-            *larray = (type_left)(*((type_right *)(rarray)));\
-            (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
-            (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
-            l++;\
-        } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
-        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
-        (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\
-        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
-        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
-        k++;\
-    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
-
-#endif /* ULAB_MAX_DIMS == 2 */
-
-#if ULAB_MAX_DIMS == 3
-#define ASSIGNMENT_LOOP(results, type_left, type_right, lstrides, rarray, rstrides)\
-    type_left *larray = (type_left *)(results)->array;\
-    size_t j = 0;\
-    do {\
-        size_t k = 0;\
-        do {\
-            size_t l = 0;\
-            do {\
-                *larray = (type_left)(*((type_right *)(rarray)));\
-                (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
-                (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
-                l++;\
-            } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
-            (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
-            (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\
-            (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
-            (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
-            k++;\
-        } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
-        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\
-        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
-        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\
-        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
-        j++;\
-    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
-
-#endif /* ULAB_MAX_DIMS == 3 */
-
-#if ULAB_MAX_DIMS == 4
-#define ASSIGNMENT_LOOP(results, type_left, type_right, lstrides, rarray, rstrides)\
-    type_left *larray = (type_left *)(results)->array;\
-    size_t i = 0;\
-    do {\
-        size_t j = 0;\
-        do {\
-            size_t k = 0;\
-            do {\
-                size_t l = 0;\
-                do {\
-                    *larray = (type_left)(*((type_right *)(rarray)));\
-                    (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
-                    (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
-                    l++;\
-                } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
-                (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
-                (larray) += (lstrides)[ULAB_MAX_DIMS - 2];\
-                (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
-                (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
-                k++;\
-            } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
-            (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\
-            (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
-            (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];\
-            (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
-            j++;\
-        } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
-        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\
-        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
-        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS-3];\
-        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
-        i++;\
-    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\
-
-#endif /* ULAB_MAX_DIMS == 4 */
-
 #endif
diff --git a/python/port/mod/ulab/ndarray_operators.c b/python/port/mod/ulab/ndarray_operators.c
index 465140b65..de1042cc8 100644
--- a/python/port/mod/ulab/ndarray_operators.c
+++ b/python/port/mod/ulab/ndarray_operators.c
@@ -17,6 +17,7 @@
 #include "ndarray_operators.h"
 #include "ulab.h"
 #include "ulab_tools.h"
+#include "numpy/carray/carray.h"
 
 /*
     This file contains the actual implementations of the various
@@ -24,7 +25,8 @@
 
     These are the upcasting rules of the binary operators
 
-    - if one of the operarands is a float, the result is always float
+    - if complex is supported, and if one of the operands is complex, the result is always complex
+    - if both operands are real and one of them is a float, then the result is also a float
     - operation on identical types preserves type
 
     uint8 + int8 => int16
@@ -39,6 +41,12 @@
 mp_obj_t ndarray_binary_equality(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
                                             uint8_t ndim, size_t *shape,  int32_t *lstrides, int32_t *rstrides, mp_binary_op_t op) {
 
+    #if ULAB_SUPPORTS_COMPLEX
+    if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX))  {
+        return carray_binary_equal_not_equal(lhs, rhs, ndim, shape, lstrides, rstrides, op);
+    }
+    #endif
+
     ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT8);
     results->boolean = 1;
     uint8_t *array = (uint8_t *)results->array;
@@ -161,6 +169,12 @@ mp_obj_t ndarray_binary_equality(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
 mp_obj_t ndarray_binary_add(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
                                         uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
 
+    #if ULAB_SUPPORTS_COMPLEX
+    if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX))  {
+        return carray_binary_add(lhs, rhs, ndim, shape, lstrides, rstrides);
+    }
+    #endif
+
     ndarray_obj_t *results = NULL;
     uint8_t *larray = (uint8_t *)lhs->array;
     uint8_t *rarray = (uint8_t *)rhs->array;
@@ -238,6 +252,12 @@ mp_obj_t ndarray_binary_add(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
 mp_obj_t ndarray_binary_multiply(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
                                             uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
 
+    #if ULAB_SUPPORTS_COMPLEX
+    if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX))  {
+        return carray_binary_multiply(lhs, rhs, ndim, shape, lstrides, rstrides);
+    }
+    #endif
+
     ndarray_obj_t *results = NULL;
     uint8_t *larray = (uint8_t *)lhs->array;
     uint8_t *rarray = (uint8_t *)rhs->array;
@@ -460,6 +480,12 @@ mp_obj_t ndarray_binary_more(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
 mp_obj_t ndarray_binary_subtract(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
                                             uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
 
+    #if ULAB_SUPPORTS_COMPLEX
+    if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX))  {
+        return carray_binary_subtract(lhs, rhs, ndim, shape, lstrides, rstrides);
+    }
+    #endif
+
     ndarray_obj_t *results = NULL;
     uint8_t *larray = (uint8_t *)lhs->array;
     uint8_t *rarray = (uint8_t *)rhs->array;
@@ -559,6 +585,12 @@ mp_obj_t ndarray_binary_subtract(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
 mp_obj_t ndarray_binary_true_divide(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
                                             uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
 
+    #if ULAB_SUPPORTS_COMPLEX
+    if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX))  {
+        return carray_binary_divide(lhs, rhs, ndim, shape, lstrides, rstrides);
+    }
+    #endif
+
     ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_FLOAT);
     uint8_t *larray = (uint8_t *)lhs->array;
     uint8_t *rarray = (uint8_t *)rhs->array;
diff --git a/python/port/mod/ulab/ndarray_properties.c b/python/port/mod/ulab/ndarray_properties.c
index 4a93fb823..5464b31d5 100644
--- a/python/port/mod/ulab/ndarray_properties.c
+++ b/python/port/mod/ulab/ndarray_properties.c
@@ -20,6 +20,9 @@
 #include "ulab.h"
 #include "ndarray.h"
 #include "numpy/ndarray/ndarray_iter.h"
+#if ULAB_SUPPORTS_COMPLEX
+#include "numpy/carray/carray.h"
+#endif
 
 #ifndef CIRCUITPY
 
@@ -82,6 +85,18 @@ void ndarray_properties_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) {
                 dest[0] = ndarray_transpose(self_in);
                 break;
             #endif
+            #if ULAB_SUPPORTS_COMPLEX
+            #if ULAB_NUMPY_HAS_IMAG
+            case MP_QSTR_imag:
+                dest[0] = carray_imag(self_in);
+                break;
+            #endif
+            #if ULAB_NUMPY_HAS_IMAG
+            case MP_QSTR_real:
+                dest[0] = carray_real(self_in);
+                break;
+            #endif
+            #endif /* ULAB_SUPPORTS_COMPLEX */
             default:
                 call_local_method(self_in, attr, dest);
                 break;
diff --git a/python/port/mod/ulab/numpy/approx.c b/python/port/mod/ulab/numpy/approx.c
index 6ed5d7c2d..85cdbf78d 100644
--- a/python/port/mod/ulab/numpy/approx.c
+++ b/python/port/mod/ulab/numpy/approx.c
@@ -19,6 +19,7 @@
 
 #include "../ulab.h"
 #include "../ulab_tools.h"
+#include "carray/carray_tools.h"
 #include "approx.h"
 
 //| """Numerical approximation methods"""
@@ -60,6 +61,9 @@ STATIC mp_obj_t approx_interp(size_t n_args, const mp_obj_t *pos_args, mp_map_t
     ndarray_obj_t *x = ndarray_from_mp_obj(args[0].u_obj, 0);
     ndarray_obj_t *xp = ndarray_from_mp_obj(args[1].u_obj, 0); // xp must hold an increasing sequence of independent values
     ndarray_obj_t *fp = ndarray_from_mp_obj(args[2].u_obj, 0);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(xp->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(fp->dtype)
     if((xp->ndim != 1) || (fp->ndim != 1) || (xp->len < 2) || (fp->len < 2) || (xp->len != fp->len)) {
         mp_raise_ValueError(translate("interp is defined for 1D iterables of equal length"));
     }
@@ -157,6 +161,7 @@ STATIC mp_obj_t approx_trapz(size_t n_args, const mp_obj_t *pos_args, mp_map_t *
     mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
 
     ndarray_obj_t *y = ndarray_from_mp_obj(args[0].u_obj, 0);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(y->dtype)
     ndarray_obj_t *x;
     mp_float_t mean = MICROPY_FLOAT_CONST(0.0);
     if(y->len < 2) {
@@ -174,6 +179,7 @@ STATIC mp_obj_t approx_trapz(size_t n_args, const mp_obj_t *pos_args, mp_map_t *
 
     if(args[1].u_obj != mp_const_none) {
         x = ndarray_from_mp_obj(args[1].u_obj, 0); // x must hold an increasing sequence of independent values
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype)
         if((x->ndim != 1) || (y->len != x->len)) {
             mp_raise_ValueError(translate("trapz is defined for 1D arrays of equal length"));
         }
diff --git a/python/port/mod/ulab/numpy/carray/carray.c b/python/port/mod/ulab/numpy/carray/carray.c
new file mode 100644
index 000000000..a5f8a2b12
--- /dev/null
+++ b/python/port/mod/ulab/numpy/carray/carray.c
@@ -0,0 +1,826 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2021-2022 Zoltán Vörös
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/objint.h"
+#include "py/runtime.h"
+#include "py/builtin.h"
+#include "py/misc.h"
+
+#include "../../ulab.h"
+#include "../../ndarray.h"
+#include "../../ulab_tools.h"
+#include "carray.h"
+
+#if ULAB_SUPPORTS_COMPLEX
+
+//| import ulab.numpy
+
+//| def real(val):
+//|     """
+//|     Return the real part of the argument, which must be an ndarray;
+//|     any other argument raises NotImplementedError."""
+//|     ...
+//|
+
+mp_obj_t carray_real(mp_obj_t _source) { // returns a new dense ndarray built from an ndarray argument; anything else raises
+    if(mp_obj_is_type(_source, &ulab_ndarray_type)) {
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(_source);
+        if(source->dtype != NDARRAY_COMPLEX) { // non-complex input: the result keeps the dtype of the source
+            ndarray_obj_t *target = ndarray_new_dense_ndarray(source->ndim, source->shape, source->dtype);
+            ndarray_copy_array(source, target, 0);
+            return MP_OBJ_FROM_PTR(target);
+        } else { // the input is most definitely a complex array
+            ndarray_obj_t *target = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT);
+            ndarray_copy_array(source, target, 0); // offset 0: presumably selects the real component of each element - TODO confirm against ndarray_copy_array
+            return MP_OBJ_FROM_PTR(target);
+        }
+    } else {
+        mp_raise_NotImplementedError(translate("function is implemented for ndarrays only"));
+    }
+    return mp_const_none; // presumably never reached, since every branch above returns or raises - TODO confirm
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(carray_real_obj, carray_real); // one-argument function object wrapping carray_real
+
+//| def imag(val):
+//|     """
+//|     Return the imaginary part of the argument, which must be an ndarray;
+//|     any other argument raises NotImplementedError."""
+//|     ...
+//|
+
+mp_obj_t carray_imag(mp_obj_t _source) { // returns a new dense ndarray for an ndarray argument; anything else raises
+    if(mp_obj_is_type(_source, &ulab_ndarray_type)) {
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(_source);
+        if(source->dtype != NDARRAY_COMPLEX) { // if not complex, then the imaginary part is zero
+            ndarray_obj_t *target = ndarray_new_dense_ndarray(source->ndim, source->shape, source->dtype); // nothing is copied; assumes the new array starts zero-filled - TODO confirm
+            return MP_OBJ_FROM_PTR(target);
+        } else { // the input is most definitely a complex array
+            ndarray_obj_t *target = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT);
+            ndarray_copy_array(source, target, source->itemsize / 2); // offset of half an element: presumably selects the imaginary component - TODO confirm
+            return MP_OBJ_FROM_PTR(target);
+        }
+    } else {
+        mp_raise_NotImplementedError(translate("function is implemented for ndarrays only"));
+    }
+    return mp_const_none; // presumably never reached, since every branch above returns or raises - TODO confirm
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(carray_imag_obj, carray_imag); // one-argument function object wrapping carray_imag
+
+#if ULAB_NUMPY_HAS_CONJUGATE
+
+//| def conjugate(val):
+//|     """
+//|     Return the conjugate of the complex argument, which can be
+//|     either an ndarray, or a scalar."""
+//|     ...
+//|
+mp_obj_t carray_conjugate(mp_obj_t _source) { // conjugate of an ndarray (as a new dense copy) or of a scalar
+    if(mp_obj_is_type(_source, &ulab_ndarray_type)) {
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(_source);
+        ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, source->dtype);
+        ndarray_copy_array(source, ndarray, 0);
+        if(source->dtype == NDARRAY_COMPLEX) { // non-complex arrays are returned as a plain copy
+            mp_float_t *array = (mp_float_t *)ndarray->array;
+            array++; // step onto the second float of the first element, i.e. its imaginary part
+            for(size_t i = 0; i < ndarray->len; i++) {
+                *array *= MICROPY_FLOAT_CONST(-1.0); // flip the sign of the imaginary part
+                array += 2; // two floats per complex element
+            }
+        }
+        return MP_OBJ_FROM_PTR(ndarray);
+    } else {
+        if(mp_obj_is_type(_source, &mp_type_complex)) {
+            mp_float_t real, imag;
+            mp_obj_get_complex(_source, &real, &imag);
+            imag = imag * MICROPY_FLOAT_CONST(-1.0);
+            return mp_obj_new_complex(real, imag);
+        } else if(mp_obj_is_int(_source) || mp_obj_is_float(_source)) {
+            return _source; // a real scalar is handed back unchanged
+        } else {
+            mp_raise_TypeError(translate("input must be an ndarray, or a scalar"));
+        }
+    }
+    // this should never happen
+    return mp_const_none;
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(carray_conjugate_obj, carray_conjugate); // one-argument function object wrapping carray_conjugate
+#endif
+
+#if ULAB_NUMPY_HAS_SORT_COMPLEX
+//| def sort_complex(a: ulab.numpy.ndarray) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: a
+//|       a one-dimensional ndarray
+//|
+//|     Sort a complex array using the real part first, then the imaginary part.
+//|     Always returns a sorted complex array, even if the input was real."""
+//|     ...
+//|
+
+static void carray_sort_complex_(mp_float_t *array, size_t len) {
+    // In-place heap sort of len complex numbers, ordered by real part first, then by imaginary part.
+    // array is assumed to be a floating vector containing the real and imaginary parts
+    // of a complex array at alternating positions as
+    // array[0] = real[0], array[1] = imag[0], array[2] = real[1], array[3] = imag[1], ...
+    if(len < 2) {
+        return; // nothing to sort; with len == 0 the q-- below would wrap around and index out of bounds
+    }
+    mp_float_t real, imag;
+    size_t c, q = len, p, r = len >> 1; // r: heap-building cursor, q: number of elements still in the heap
+    for (;;) {
+        if (r > 0) { // heap-building phase: sift down each parent, last parent first
+            r--;
+            real = array[2 * r];
+            imag = array[2 * r + 1];
+        } else { // extraction phase: park the current maximum right behind the shrinking heap
+            q--;
+            if(q == 0) {
+                break;
+            }
+            real = array[2 * q];
+            imag = array[2 * q + 1];
+            array[2 * q] = array[0];
+            array[2 * q + 1] = array[1];
+        }
+        p = r;
+        c = r + r + 1;
+        while (c < q) { // sift the held (real, imag) pair down from slot p
+            if(c + 1 < q) {
+                if((array[2 * (c+1)] > array[2 * c]) ||
+                    ((array[2 * (c+1)] == array[2 * c]) && (array[2 * (c+1) + 1] > array[2 * c + 1]))) {
+                    c++; // the right child is the larger of the two
+                }
+            }
+            if((array[2 * c] > real) ||
+                ((array[2 * c] == real) && (array[2 * c + 1] > imag))) {
+                array[2 * p] = array[2 * c]; // real part
+                array[2 * p + 1] = array[2 * c + 1]; // imag part
+                p = c;
+                c = p + p + 1;
+            } else {
+                break;
+            }
+        }
+        array[2 * p] = real;
+        array[2 * p + 1] = imag;
+    }
+}
+
+mp_obj_t carray_sort_complex(mp_obj_t _source) { // returns a sorted complex ndarray built from a 1D ndarray argument
+    if(!mp_obj_is_type(_source, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("input must be a 1D ndarray"));
+    }
+    ndarray_obj_t *source = MP_OBJ_TO_PTR(_source);
+    if(source->ndim != 1) {
+        mp_raise_TypeError(translate("input must be a 1D ndarray"));
+    }
+
+    ndarray_obj_t *ndarray = ndarray_copy_view_convert_type(source, NDARRAY_COMPLEX); // presumably a fresh complex copy, so the input is left untouched - TODO confirm
+    mp_float_t *array = (mp_float_t *)ndarray->array; // viewed as alternating real/imaginary floats
+    carray_sort_complex_(array, ndarray->len); // NOTE(review): a zero-length input passes len == 0 to the helper - confirm the helper tolerates that
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(carray_sort_complex_obj, carray_sort_complex); // one-argument function object wrapping carray_sort_complex
+#endif
+
+//| def abs(a: ulab.numpy.ndarray) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: a
+//|       a complex ndarray of any supported number of dimensions
+//|
+//|     Return the absolute value of complex ndarray."""
+//|     ...
+//|
+
+mp_obj_t carray_abs(ndarray_obj_t *source, ndarray_obj_t *target) {
+    // calculates the absolute value sqrt(re^2 + im^2) of each element of a complex array, writes it into target, and returns target
+    uint8_t *sarray = (uint8_t *)source->array; // byte pointer: source->strides are added to it unscaled below
+    mp_float_t *tarray = (mp_float_t *)target->array; // filled contiguously, one float per source element
+    uint8_t itemsize = mp_binary_get_size('@', NDARRAY_FLOAT, NULL); // size of one float component; used as the offset of the imaginary part
+
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                do {
+                    mp_float_t rvalue = *(mp_float_t *)sarray; // real part
+                    mp_float_t ivalue = *(mp_float_t *)(sarray + itemsize); // imaginary part, one float further on
+                    *tarray++ = MICROPY_FLOAT_C_FUN(sqrt)(rvalue * rvalue + ivalue * ivalue);
+                    sarray += source->strides[ULAB_MAX_DIMS - 1];
+                    l++;
+                } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+            #if ULAB_MAX_DIMS > 1
+                sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; // rewind the innermost axis
+                sarray += source->strides[ULAB_MAX_DIMS - 2]; // then advance one step along the next axis
+                k++;
+            } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 2
+            sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+            sarray += source->strides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+        sarray += source->strides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+    #endif
+    return MP_OBJ_FROM_PTR(target);
+}
+
+static void carray_copy_part(uint8_t *tarray, uint8_t *sarray, size_t *shape, int32_t *strides) {
+    // copies the real or imaginary part of an array
+    // into the respective part of a dense complex array
+    uint8_t sz = sizeof(mp_float_t); // size in bytes of one float component
+
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                do {
+                    memcpy(tarray, sarray, sz); // copy exactly one float component
+                    tarray += 2 * sz; // the target advances by two floats, skipping the component that is not being written
+                    sarray += strides[ULAB_MAX_DIMS - 1]; // the source advances by its own stride, added to a byte pointer
+                    l++;
+                } while(l < shape[ULAB_MAX_DIMS - 1]);
+            #if ULAB_MAX_DIMS > 1
+                sarray -= strides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS-1]; // rewind the innermost axis
+                sarray += strides[ULAB_MAX_DIMS - 2]; // then advance one step along the next axis
+                k++;
+            } while(k < shape[ULAB_MAX_DIMS - 2]);
+            #endif /* ULAB_MAX_DIMS > 1 */
+        #if ULAB_MAX_DIMS > 2
+            sarray -= strides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS-2];
+            sarray += strides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j < shape[ULAB_MAX_DIMS - 3]);
+        #endif /* ULAB_MAX_DIMS > 2 */
+    #if ULAB_MAX_DIMS > 3
+        sarray -= strides[ULAB_MAX_DIMS - 3] * shape[ULAB_MAX_DIMS-3];
+        sarray += strides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i < shape[ULAB_MAX_DIMS - 4]);
+    #endif /* ULAB_MAX_DIMS > 3 */
+}
+
+mp_obj_t carray_binary_equal_not_equal(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
+                            uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides, mp_binary_op_t op) {
+    // element-wise equality / inequality for operands of which one or both are complex; returns a dense boolean ndarray
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_UINT8);
+    results->boolean = 1;
+    uint8_t *array = (uint8_t *)results->array;
+
+    if(op == MP_BINARY_OP_NOT_EQUAL) {
+        memset(array, 1, results->len); // for != every entry starts as 1 and is toggled off wherever the operands match
+    }
+
+    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        mp_float_t *rarray = (mp_float_t *)rhs->array;
+
+        ulab_rescale_float_strides(lstrides); // presumably converts the strides into mp_float_t units, as larray/rarray are float pointers - TODO confirm
+        ulab_rescale_float_strides(rstrides);
+
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        if((larray[0] == rarray[0]) && (larray[1] == rarray[1])) { // both real and imaginary parts agree
+                            *array ^= 0x01; // toggle on a match: clears the 1 preset for !=; for == this assumes results starts out as 0
+                        }
+                        array++;
+                        larray += lstrides[ULAB_MAX_DIMS - 1];
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; // rewind the innermost axis
+                    larray += lstrides[ULAB_MAX_DIMS - 2]; // then advance one step along the next axis
+                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    rarray += rstrides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
+                #endif /* ULAB_MAX_DIMS > 1 */
+            #if ULAB_MAX_DIMS > 2
+                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                larray += lstrides[ULAB_MAX_DIMS - 3];
+                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                rarray += rstrides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
+            #endif /* ULAB_MAX_DIMS > 2 */
+        #if ULAB_MAX_DIMS > 3
+            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            larray += lstrides[ULAB_MAX_DIMS - 4];
+            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            rarray += rstrides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
+        #endif /* ULAB_MAX_DIMS > 3 */
+    } else { // only one of the operands is complex
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        uint8_t *rarray = (uint8_t *)rhs->array;
+
+        // align the complex array to the left
+        uint8_t rdtype = rhs->dtype;
+        int32_t *lstrides_ = lstrides;
+        int32_t *rstrides_ = rstrides;
+
+        if(rhs->dtype == NDARRAY_COMPLEX) { // swap the roles so that larray always points at the complex operand
+            larray = (mp_float_t *)rhs->array;
+            rarray = (uint8_t *)lhs->array;
+            lstrides_ = rstrides;
+            rstrides_ = lstrides;
+            rdtype = lhs->dtype;
+        }
+
+        ulab_rescale_float_strides(lstrides_); // only the strides of the complex side are rescaled; the other side stays a byte pointer
+
+        if(rdtype == NDARRAY_UINT8) { // dispatch on the dtype of the non-complex operand
+            BINARY_LOOP_COMPLEX_EQUAL(results, array, uint8_t, larray, lstrides_, rarray, rstrides_);
+        } else if(rdtype == NDARRAY_INT8) {
+            BINARY_LOOP_COMPLEX_EQUAL(results, array, int8_t, larray, lstrides_, rarray, rstrides_);
+        } else if(rdtype == NDARRAY_UINT16) {
+            BINARY_LOOP_COMPLEX_EQUAL(results, array, uint16_t, larray, lstrides_, rarray, rstrides_);
+        } else if(rdtype == NDARRAY_INT16) {
+            BINARY_LOOP_COMPLEX_EQUAL(results, array, int16_t, larray, lstrides_, rarray, rstrides_);
+        } else if(rdtype == NDARRAY_FLOAT) {
+            BINARY_LOOP_COMPLEX_EQUAL(results, array, mp_float_t, larray, lstrides_, rarray, rstrides_);
+        }
+    }
+    return MP_OBJ_FROM_PTR(results);
+}
+
+mp_obj_t carray_binary_add(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
+                            uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
+    // Element-wise lhs + rhs for the case where one or both operands are complex; returns a newly allocated dense complex ndarray.
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX);
+    mp_float_t *resarray = (mp_float_t *)results->array; // output cursor over (real, imaginary) float pairs
+    // Case 1: both operands are complex, so the real and the imaginary parts are added pairwise.
+    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        mp_float_t *rarray = (mp_float_t *)rhs->array;
+        // NOTE(review): presumably converts the strides to units of mp_float_t, in place in the caller's arrays -- confirm.
+        ulab_rescale_float_strides(lstrides);
+        ulab_rescale_float_strides(rstrides);
+        // One do/while level per axis, compiled in according to ULAB_MAX_DIMS; each level executes its body at least once.
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        // real part
+                        *resarray++ = larray[0] + rarray[0];
+                        // imaginary part
+                        *resarray++ = larray[1] + rarray[1];
+                        larray += lstrides[ULAB_MAX_DIMS - 1];
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; // rewind the innermost axis ...
+                    larray += lstrides[ULAB_MAX_DIMS - 2]; // ... and take one step along the next outer axis
+                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    rarray += rstrides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
+                #endif /* ULAB_MAX_DIMS > 1 */
+            #if ULAB_MAX_DIMS > 2
+                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                larray += lstrides[ULAB_MAX_DIMS - 3];
+                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                rarray += rstrides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
+            #endif /* ULAB_MAX_DIMS > 2 */
+        #if ULAB_MAX_DIMS > 3
+            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            larray += lstrides[ULAB_MAX_DIMS - 4];
+            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            rarray += rstrides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
+        #endif /* ULAB_MAX_DIMS > 3 */
+    } else { // only one of the operands is complex
+        uint8_t *larray = (uint8_t *)lhs->array;
+        uint8_t *rarray = (uint8_t *)rhs->array;
+        // Case 2: the operands are swapped where needed, so that BINARY_LOOP_COMPLEX always sees the complex array on its left.
+        // align the complex array to the left
+        uint8_t rdtype = rhs->dtype;
+        int32_t *lstrides_ = lstrides;
+        int32_t *rstrides_ = rstrides;
+        // the strides are used unscaled here, since both cursors are uint8_t pointers
+        if(rhs->dtype == NDARRAY_COMPLEX) {
+            larray = (uint8_t *)rhs->array;
+            rarray = (uint8_t *)lhs->array;
+            lstrides_ = rstrides;
+            rstrides_ = lstrides;
+            rdtype = lhs->dtype;
+        }
+        // real part: the macro stores into every second slot of resarray; rdtype selects the C type used to read the real operand
+        if(rdtype == NDARRAY_UINT8) {
+            BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides_, rarray, rstrides_, +);
+        } else if(rdtype == NDARRAY_INT8) {
+            BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides_, rarray, rstrides_, +);
+        } else if(rdtype == NDARRAY_UINT16) {
+            BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides_, rarray, rstrides_, +);
+        } else if(rdtype == NDARRAY_INT16) {
+            BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides_, rarray, rstrides_, +);
+        } else if(rdtype == NDARRAY_FLOAT) {
+            BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides_, rarray, rstrides_, +);
+        }
+        // any other rdtype value matches no branch, in which case no real part is written here
+        // simply copy the imaginary part
+        uint8_t *tarray = (uint8_t *)results->array;
+        tarray += sizeof(mp_float_t); // first imaginary slot of the result
+        // locate the complex operand again, because the cursors above were advanced by the macro
+        if(lhs->dtype == NDARRAY_COMPLEX) {
+            rarray = (uint8_t *)lhs->array;
+            rstrides = lstrides;
+        } else {
+            rarray = (uint8_t *)rhs->array;
+        }
+        rarray += sizeof(mp_float_t); // first imaginary component of the complex operand
+        carray_copy_part(tarray, rarray, results->shape, rstrides);
+    }
+    return MP_OBJ_FROM_PTR(results);
+}
+
+static void carray_binary_multiply_(ndarray_obj_t *results, mp_float_t *resarray, uint8_t *larray, uint8_t *rarray,
+                            int32_t *lstrides, int32_t *rstrides, uint8_t rdtype) {
+    // Single pass of the mixed complex * real product: multiplies the float that larray points at by the
+    // real-valued element at rarray, for every element of results. rdtype names the C type behind rarray.
+    // BINARY_LOOP_COMPLEX stores into every second slot of resarray, so one call fills either the real
+    // or the imaginary components, depending on where the caller positioned larray and resarray.
+    switch(rdtype) {
+        case NDARRAY_UINT8: { BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides, rarray, rstrides, *); } break;
+        case NDARRAY_INT8: { BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides, rarray, rstrides, *); } break;
+        case NDARRAY_UINT16: { BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides, rarray, rstrides, *); } break;
+        case NDARRAY_INT16: { BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides, rarray, rstrides, *); } break;
+        case NDARRAY_FLOAT: { BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides, *); } break;
+        default: break; // any other dtype: nothing is written
+    }
+}
+
+mp_obj_t carray_binary_multiply(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
+                            uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
+    // Element-wise lhs * rhs for the case where one or both operands are complex; returns a newly allocated dense complex ndarray.
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX);
+    mp_float_t *resarray = (mp_float_t *)results->array; // output cursor over (real, imaginary) float pairs
+    // Case 1: complex * complex, (a + bi)(c + di) = (ac - bd) + (ad + bc)i
+    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        mp_float_t *rarray = (mp_float_t *)rhs->array;
+        // NOTE(review): presumably converts the strides to units of mp_float_t, in place in the caller's arrays -- confirm.
+        ulab_rescale_float_strides(lstrides);
+        ulab_rescale_float_strides(rstrides);
+        // One do/while level per axis, compiled in according to ULAB_MAX_DIMS; each level executes its body at least once.
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        // real part
+                        *resarray++ = larray[0] * rarray[0] - larray[1] * rarray[1];
+                        // imaginary part
+                        *resarray++ = larray[0] * rarray[1] + larray[1] * rarray[0];
+                        larray += lstrides[ULAB_MAX_DIMS - 1];
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; // rewind the innermost axis ...
+                    larray += lstrides[ULAB_MAX_DIMS - 2]; // ... and take one step along the next outer axis
+                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    rarray += rstrides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
+                #endif /* ULAB_MAX_DIMS > 1 */
+            #if ULAB_MAX_DIMS > 2
+                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                larray += lstrides[ULAB_MAX_DIMS - 3];
+                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                rarray += rstrides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
+            #endif /* ULAB_MAX_DIMS > 2 */
+        #if ULAB_MAX_DIMS > 3
+            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            larray += lstrides[ULAB_MAX_DIMS - 4];
+            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            rarray += rstrides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
+        #endif /* ULAB_MAX_DIMS > 3 */
+    } else { // only one of the operands is complex
+        // Case 2: (re + i*im) * r = re*r + i*(im*r), computed as two passes of carray_binary_multiply_
+        uint8_t *larray = (uint8_t *)lhs->array;
+        uint8_t *rarray = (uint8_t *)rhs->array;
+        uint8_t *lo = larray, *ro = rarray; // base pointers, kept so that each pass can restart from the beginning
+        int32_t *left_strides = lstrides;
+        int32_t *right_strides = rstrides;
+        uint8_t rdtype = rhs->dtype;
+        // the product does not depend on the operand order, hence the operands may be swapped
+        // align the complex array to the left
+        if(rhs->dtype == NDARRAY_COMPLEX) {
+            lo = (uint8_t *)rhs->array;
+            ro = (uint8_t *)lhs->array;
+            rdtype = lhs->dtype;
+            left_strides = rstrides;
+            right_strides = lstrides;
+        }
+        // first pass: real components of the complex operand, written to the even slots of the result
+        larray = lo;
+        rarray = ro;
+        // real part
+        carray_binary_multiply_(results, resarray, larray, rarray, left_strides, right_strides, rdtype);
+        // second pass: shift the left cursor and the output cursor by one float, so that the imaginary components are processed
+        larray = lo + sizeof(mp_float_t);
+        rarray = ro;
+        resarray = (mp_float_t *)results->array;
+        resarray++;
+        // imaginary part
+        carray_binary_multiply_(results, resarray, larray, rarray, left_strides, right_strides, rdtype);
+    }
+    return MP_OBJ_FROM_PTR(results);
+}
+
+mp_obj_t carray_binary_subtract(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
+                            uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
+    // Element-wise lhs - rhs for the case where one or both operands are complex; returns a newly allocated dense complex ndarray.
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX);
+    mp_float_t *resarray = (mp_float_t *)results->array; // output cursor over (real, imaginary) float pairs
+    // Case 1: both operands are complex, so the real and the imaginary parts are subtracted pairwise.
+    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        mp_float_t *rarray = (mp_float_t *)rhs->array;
+        // NOTE(review): presumably converts the strides to units of mp_float_t, in place in the caller's arrays -- confirm.
+        ulab_rescale_float_strides(lstrides);
+        ulab_rescale_float_strides(rstrides);
+        // One do/while level per axis, compiled in according to ULAB_MAX_DIMS; each level executes its body at least once.
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        // real part
+                        *resarray++ = larray[0] - rarray[0];
+                        // imaginary part
+                        *resarray++ = larray[1] - rarray[1];
+                        larray += lstrides[ULAB_MAX_DIMS - 1];
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; // rewind the innermost axis ...
+                    larray += lstrides[ULAB_MAX_DIMS - 2]; // ... and take one step along the next outer axis
+                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    rarray += rstrides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
+                #endif /* ULAB_MAX_DIMS > 1 */
+            #if ULAB_MAX_DIMS > 2
+                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                larray += lstrides[ULAB_MAX_DIMS - 3];
+                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                rarray += rstrides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
+            #endif /* ULAB_MAX_DIMS > 2 */
+        #if ULAB_MAX_DIMS > 3
+            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            larray += lstrides[ULAB_MAX_DIMS - 4];
+            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            rarray += rstrides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
+        #endif /* ULAB_MAX_DIMS > 3 */
+    } else { // the operand order matters for subtraction, so the two mixed cases are handled separately
+        uint8_t *larray = (uint8_t *)lhs->array;
+        if(lhs->dtype == NDARRAY_COMPLEX) { // complex - real
+            uint8_t *rarray = (uint8_t *)rhs->array;
+            if(rhs->dtype == NDARRAY_UINT8) {
+                BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides, rarray, rstrides, -);
+            } else if(rhs->dtype == NDARRAY_INT8) {
+                BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides, rarray, rstrides, -);
+            } else if(rhs->dtype == NDARRAY_UINT16) {
+                BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides, rarray, rstrides, -);
+            } else if(rhs->dtype == NDARRAY_INT16) {
+                BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides, rarray, rstrides, -);
+            } else if(rhs->dtype == NDARRAY_FLOAT) {
+                BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides, -);
+            }
+            // copy the imaginary part
+            uint8_t *tarray = (uint8_t *)results->array;
+            tarray += sizeof(mp_float_t); // first imaginary slot of the result
+            // reset the left cursor, which the macro above has advanced
+            larray = (uint8_t *)lhs->array;
+            larray += sizeof(mp_float_t); // first imaginary component of lhs
+            // the imaginary part of lhs is taken over as it is, because rhs has no imaginary part
+            carray_copy_part(tarray, larray, results->shape, lstrides);
+        } else if(rhs->dtype == NDARRAY_COMPLEX) { // real - complex
+            mp_float_t *rarray = (mp_float_t *)rhs->array;
+            ulab_rescale_float_strides(rstrides); // rarray is a float pointer here, while larray remains a byte pointer
+            // the macro writes (left - re(right)) and (-im(right)) as consecutive floats
+            if(lhs->dtype == NDARRAY_UINT8) {
+                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, uint8_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_INT8) {
+                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, int8_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_UINT16) {
+                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, uint16_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_INT16) {
+                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, int16_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_FLOAT) {
+                BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides);
+            }
+        }
+    }
+    // results is returned as a MicroPython object
+    return MP_OBJ_FROM_PTR(results);
+}
+
+static void carray_binary_left_divide_(ndarray_obj_t *results, mp_float_t *resarray, uint8_t *larray, uint8_t *rarray,
+                            int32_t *lstrides, int32_t *rstrides, uint8_t rdtype) {
+    // Single pass of the mixed complex / real quotient: divides the float that larray points at by the
+    // real-valued element at rarray, for every element of results. rdtype names the C type behind rarray.
+    // BINARY_LOOP_COMPLEX stores into every second slot of resarray, so one call fills either the real
+    // or the imaginary components, depending on where the caller positioned larray and resarray.
+    switch(rdtype) {
+        case NDARRAY_UINT8: { BINARY_LOOP_COMPLEX(results, resarray, uint8_t, larray, lstrides, rarray, rstrides, /); } break;
+        case NDARRAY_INT8: { BINARY_LOOP_COMPLEX(results, resarray, int8_t, larray, lstrides, rarray, rstrides, /); } break;
+        case NDARRAY_UINT16: { BINARY_LOOP_COMPLEX(results, resarray, uint16_t, larray, lstrides, rarray, rstrides, /); } break;
+        case NDARRAY_INT16: { BINARY_LOOP_COMPLEX(results, resarray, int16_t, larray, lstrides, rarray, rstrides, /); } break;
+        case NDARRAY_FLOAT: { BINARY_LOOP_COMPLEX(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides, /); } break;
+        default: break; // any other dtype: nothing is written
+    }
+}
+
+mp_obj_t carray_binary_divide(ndarray_obj_t *lhs, ndarray_obj_t *rhs,
+                            uint8_t ndim, size_t *shape, int32_t *lstrides, int32_t *rstrides) {
+    // Element-wise lhs / rhs for the case where one or both operands are complex; returns a newly allocated dense complex ndarray.
+    ndarray_obj_t *results = ndarray_new_dense_ndarray(ndim, shape, NDARRAY_COMPLEX);
+    mp_float_t *resarray = (mp_float_t *)results->array; // output cursor over (real, imaginary) float pairs
+    // Case 1: complex / complex
+    if((lhs->dtype == NDARRAY_COMPLEX) && (rhs->dtype == NDARRAY_COMPLEX)) {
+        mp_float_t *larray = (mp_float_t *)lhs->array;
+        mp_float_t *rarray = (mp_float_t *)rhs->array;
+        // NOTE(review): presumably converts the strides to units of mp_float_t, in place in the caller's arrays -- confirm.
+        ulab_rescale_float_strides(lstrides);
+        ulab_rescale_float_strides(rstrides);
+        // One do/while level per axis, compiled in according to ULAB_MAX_DIMS; each level executes its body at least once.
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        // (a + bi) / (c + di) =
+                        // (ac + bd) / (c^2 + d^2) + i (bc - ad) / (c^2 + d^2)
+                        // denominator
+                        mp_float_t denom = rarray[0] * rarray[0] + rarray[1] * rarray[1];
+                        // no test for denom == 0 is made before the divisions below
+                        // real part
+                        *resarray++ = (larray[0] * rarray[0] + larray[1] * rarray[1]) / denom;
+                        // imaginary part
+                        *resarray++ = (larray[1] * rarray[0] - larray[0] * rarray[1]) / denom;
+                        larray += lstrides[ULAB_MAX_DIMS - 1];
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < results->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    larray -= lstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1]; // rewind the innermost axis ...
+                    larray += lstrides[ULAB_MAX_DIMS - 2]; // ... and take one step along the next outer axis
+                    rarray -= rstrides[ULAB_MAX_DIMS - 1] * results->shape[ULAB_MAX_DIMS-1];
+                    rarray += rstrides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < results->shape[ULAB_MAX_DIMS - 2]);
+                #endif /* ULAB_MAX_DIMS > 1 */
+            #if ULAB_MAX_DIMS > 2
+                larray -= lstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                larray += lstrides[ULAB_MAX_DIMS - 3];
+                rarray -= rstrides[ULAB_MAX_DIMS - 2] * results->shape[ULAB_MAX_DIMS-2];
+                rarray += rstrides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < results->shape[ULAB_MAX_DIMS - 3]);
+            #endif /* ULAB_MAX_DIMS > 2 */
+        #if ULAB_MAX_DIMS > 3
+            larray -= lstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            larray += lstrides[ULAB_MAX_DIMS - 4];
+            rarray -= rstrides[ULAB_MAX_DIMS - 3] * results->shape[ULAB_MAX_DIMS-3];
+            rarray += rstrides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < results->shape[ULAB_MAX_DIMS - 4]);
+        #endif /* ULAB_MAX_DIMS > 3 */
+    } else { // the operand order matters for division, so the two mixed cases are handled separately
+        uint8_t *larray = (uint8_t *)lhs->array;
+        uint8_t *rarray = (uint8_t *)rhs->array;
+        if(lhs->dtype == NDARRAY_COMPLEX) { // complex / real: each component is divided by the real operand
+            // real part
+            carray_binary_left_divide_(results, resarray, larray, rarray, lstrides, rstrides, rhs->dtype);
+            // imaginary part
+            resarray = (mp_float_t *)results->array;
+            resarray++; // first imaginary slot of the result
+            larray = (uint8_t *)lhs->array;
+            larray += sizeof(mp_float_t); // first imaginary component of lhs
+            rarray = (uint8_t *)rhs->array;
+            carray_binary_left_divide_(results, resarray, larray, rarray, lstrides, rstrides, rhs->dtype);
+        } else { // real / complex: the macro multiplies by the conjugate of the divisor, and divides by its squared magnitude
+            if(lhs->dtype == NDARRAY_UINT8) {
+                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, uint8_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_INT8) {
+                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, int8_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_UINT16) {
+                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, uint16_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_INT16) {
+                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, int16_t, larray, lstrides, rarray, rstrides);
+            } else if(lhs->dtype == NDARRAY_FLOAT) {
+                BINARY_LOOP_COMPLEX_RIGHT_DIVIDE(results, resarray, mp_float_t, larray, lstrides, rarray, rstrides);
+            }
+        }
+    }
+    // results is returned as a MicroPython object
+    return MP_OBJ_FROM_PTR(results);
+}
+
+#endif
diff --git a/python/port/mod/ulab/numpy/carray/carray.h b/python/port/mod/ulab/numpy/carray/carray.h
new file mode 100644
index 000000000..8ca5de2dd
--- /dev/null
+++ b/python/port/mod/ulab/numpy/carray/carray.h
@@ -0,0 +1,237 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2021-2022 Zoltán Vörös
+*/
+
+#ifndef _CARRAY_
+#define _CARRAY_
+
+MP_DECLARE_CONST_FUN_OBJ_1(carray_real_obj); // function objects, defined elsewhere (presumably in carray.c)
+MP_DECLARE_CONST_FUN_OBJ_1(carray_imag_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(carray_conjugate_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(carray_sort_complex_obj);
+
+// C-level entry points that take and return MicroPython objects
+mp_obj_t carray_imag(mp_obj_t );
+mp_obj_t carray_real(mp_obj_t );
+// Binary operators on ndarrays; for add/multiply/subtract/divide the arguments are lhs, rhs, ndim, shape, lstrides, rstrides (see carray.c)
+mp_obj_t carray_abs(ndarray_obj_t *, ndarray_obj_t *);
+mp_obj_t carray_binary_add(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
+mp_obj_t carray_binary_multiply(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
+mp_obj_t carray_binary_subtract(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
+mp_obj_t carray_binary_divide(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *);
+mp_obj_t carray_binary_equal_not_equal(ndarray_obj_t *, ndarray_obj_t *, uint8_t , size_t *, int32_t *, int32_t *, mp_binary_op_t );
+
+#define BINARY_LOOP_COMPLEX1(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR) /* innermost axis: (float at larray) OPERATOR (type_right value at rarray); declares a local and advances its pointer arguments, so it has to be expanded inside a block of its own */\
+    size_t l = 0;\
+    do {\
+        *(resarray) = *((mp_float_t *)(larray)) OPERATOR *((type_right *)(rarray));\
+        (resarray) += 2; /* only every second output slot is written */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
+
+#define BINARY_LOOP_COMPLEX2(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR) /* wraps the 1D loop in a loop over the second-to-last axis */\
+    size_t k = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX1((results), (resarray), type_right, (larray), (lstrides), (rarray), (rstrides), OPERATOR);\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1]; /* rewind the inner axis ... */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 2]; /* ... and step along this one */\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
+
+#define BINARY_LOOP_COMPLEX3(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR) /* wraps the 2D loop in a loop over the third-to-last axis */\
+    size_t j = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX2((results), (resarray), type_right, (larray), (lstrides), (rarray), (rstrides), OPERATOR);\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
+
+#define BINARY_LOOP_COMPLEX4(results, resarray, type_right, larray, lstrides, rarray, rstrides, OPERATOR) /* wraps the 3D loop in a loop over the fourth-to-last axis */\
+    size_t i = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX3((results), (resarray), type_right, (larray), (lstrides), (rarray), (rstrides), OPERATOR);\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
+        i++;\
+    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\
+
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT1(results, resarray, type_left, larray, lstrides, rarray, rstrides) /* innermost axis of real - complex: larray is read as type_left, rarray is indexed as a (real, imaginary) pair */\
+    size_t l = 0;\
+    do {\
+        *(resarray)++ = *((type_left *)(larray)) - (rarray)[0]; /* real part */\
+        *(resarray)++ = -(rarray)[1]; /* imaginary part: the left operand contributes nothing */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
+
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT2(results, resarray, type_left, larray, lstrides, rarray, rstrides) /* wraps the 1D loop in a loop over the second-to-last axis */\
+    size_t k = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT1((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1]; /* rewind the inner axis ... */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 2]; /* ... and step along this one */\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS-1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
+
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT3(results, resarray, type_left, larray, lstrides, rarray, rstrides) /* wraps the 2D loop in a loop over the third-to-last axis */\
+    size_t j = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT2((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
+
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT4(results, resarray, type_left, larray, lstrides, rarray, rstrides) /* wraps the 3D loop in a loop over the fourth-to-last axis */\
+    size_t i = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT3((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
+        i++;\
+    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\
+
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE1(results, resarray, type_left, larray, lstrides, rarray, rstrides) /* innermost axis of real / complex: x / (c0 + i*c1) = x*c0/denom - i*x*c1/denom, where denom = c0^2 + c1^2 */\
+    size_t l = 0;\
+    do {\
+        mp_float_t *c = (mp_float_t *)(rarray); /* c[0], c[1]: real and imaginary part of the divisor */\
+        mp_float_t denom = c[0] * c[0] + c[1] * c[1]; /* not tested against zero */\
+        mp_float_t a = *((type_left *)(larray)) / denom; /* the real dividend, already divided by denom */\
+        *(resarray)++ = a * c[0]; /* real part */\
+        *(resarray)++ = -a * c[1]; /* imaginary part */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
+
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE2(results, resarray, type_left, larray, lstrides, rarray, rstrides) /* wraps the 1D loop in a loop over the second-to-last axis */\
+    size_t k = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX_RIGHT_DIVIDE1((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1]; /* rewind the inner axis ... */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 2]; /* ... and step along this one */\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
+
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE3(results, resarray, type_left, larray, lstrides, rarray, rstrides) /* wraps the 2D loop in a loop over the third-to-last axis */\
+    size_t j = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX_RIGHT_DIVIDE2((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
+
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE4(results, resarray, type_left, larray, lstrides, rarray, rstrides) /* wraps the 3D loop in a loop over the fourth-to-last axis */\
+    size_t i = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX_RIGHT_DIVIDE3((results), (resarray), type_left, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
+        i++;\
+    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\
+
+
+#define BINARY_LOOP_COMPLEX_EQUAL1(results, array, type_right, larray, lstrides, rarray, rstrides) /* innermost axis: tests a two-component left element against a right element of type type_right */\
+    size_t l = 0;\
+    do {\
+        if((*(larray) == *((type_right *)(rarray))) && ((larray)[1] == MICROPY_FLOAT_CONST(0.0))) { /* first components equal, and the second component of the left element is zero */\
+            *(array) ^= 0x01; /* toggle the lowest bit of the output element on a match */\
+        }\
+        (array)++;\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 1];\
+        l++;\
+    } while(l < (results)->shape[ULAB_MAX_DIMS - 1]);\
+
+#define BINARY_LOOP_COMPLEX_EQUAL2(results, array, type_right, larray, lstrides, rarray, rstrides) /* wraps the 1D loop in a loop over the second-to-last axis */\
+    size_t k = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX_EQUAL1((results), (array), type_right, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1]; /* rewind the inner axis ... */\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 2]; /* ... and step along this one */\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 1] * (results)->shape[ULAB_MAX_DIMS - 1];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 2];\
+        k++;\
+    } while(k < (results)->shape[ULAB_MAX_DIMS - 2]);\
+
+#define BINARY_LOOP_COMPLEX_EQUAL3(results, array, type_right, larray, lstrides, rarray, rstrides) /* wraps the 2D loop in a loop over the third-to-last axis */\
+    size_t j = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX_EQUAL2((results), (array), type_right, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 3];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 2] * (results)->shape[ULAB_MAX_DIMS - 2];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 3];\
+        j++;\
+    } while(j < (results)->shape[ULAB_MAX_DIMS - 3]);\
+
+#define BINARY_LOOP_COMPLEX_EQUAL4(results, array, type_right, larray, lstrides, rarray, rstrides) /* wraps the 3D loop in a loop over the fourth-to-last axis */\
+    size_t i = 0;\
+    do {\
+        BINARY_LOOP_COMPLEX_EQUAL3((results), (array), type_right, (larray), (lstrides), (rarray), (rstrides));\
+        (larray) -= (lstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (larray) += (lstrides)[ULAB_MAX_DIMS - 4];\
+        (rarray) -= (rstrides)[ULAB_MAX_DIMS - 3] * (results)->shape[ULAB_MAX_DIMS - 3];\
+        (rarray) += (rstrides)[ULAB_MAX_DIMS - 4];\
+        i++;\
+    } while(i < (results)->shape[ULAB_MAX_DIMS - 4]);\
+
+#if ULAB_MAX_DIMS == 1 /* bind the generic loop names to the variant that nests exactly ULAB_MAX_DIMS loops */
+#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX1
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT1
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE1
+#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL1
+#endif /* ULAB_MAX_DIMS == 1 */
+
+#if ULAB_MAX_DIMS == 2
+#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX2
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT2
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE2
+#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL2
+#endif /* ULAB_MAX_DIMS == 2 */
+
+#if ULAB_MAX_DIMS == 3
+#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX3
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT3
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE3
+#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL3
+#endif /* ULAB_MAX_DIMS == 3 */
+// for any ULAB_MAX_DIMS value other than 1, 2, 3 or 4, the generic names are left undefined
+#if ULAB_MAX_DIMS == 4
+#define BINARY_LOOP_COMPLEX BINARY_LOOP_COMPLEX4
+#define BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT BINARY_LOOP_COMPLEX_REVERSED_SUBTRACT4
+#define BINARY_LOOP_COMPLEX_RIGHT_DIVIDE BINARY_LOOP_COMPLEX_RIGHT_DIVIDE4
+#define BINARY_LOOP_COMPLEX_EQUAL BINARY_LOOP_COMPLEX_EQUAL4
+#endif /* ULAB_MAX_DIMS == 4 */
+
+#endif
diff --git a/python/port/mod/ulab/numpy/carray/carray_tools.c b/python/port/mod/ulab/numpy/carray/carray_tools.c
new file mode 100644
index 000000000..7b623d349
--- /dev/null
+++ b/python/port/mod/ulab/numpy/carray/carray_tools.c
@@ -0,0 +1,28 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2022 Zoltán Vörös
+*/
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+#include "py/misc.h"
+
+#include "../../ulab.h"
+#include "../../ndarray.h"
+
+#if ULAB_SUPPORTS_COMPLEX
+
+void raise_complex_NotImplementedError(void) { // common error path for operations that do not support the complex dtype
+    mp_raise_NotImplementedError(translate("not implemented for complex dtype")); // fixed message, passed through translate()
+}
+
+#endif
diff --git a/python/port/mod/ulab/numpy/carray/carray_tools.h b/python/port/mod/ulab/numpy/carray/carray_tools.h
new file mode 100644
index 000000000..3ac79b5f4
--- /dev/null
+++ b/python/port/mod/ulab/numpy/carray/carray_tools.h
@@ -0,0 +1,25 @@
+
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2022 Zoltán Vörös
+*/
+
+#ifndef _CARRAY_TOOLS_
+#define _CARRAY_TOOLS_
+// Helpers for rejecting complex-dtype input; the function below is only defined when ULAB_SUPPORTS_COMPLEX is set.
+void raise_complex_NotImplementedError(void);
+// Both macros expand to a complete statement (trailing ';' included), so call sites omit the semicolon.
+#if ULAB_SUPPORTS_COMPLEX
+    #define NOT_IMPLEMENTED_FOR_COMPLEX() raise_complex_NotImplementedError();
+    #define COMPLEX_DTYPE_NOT_IMPLEMENTED(dtype) do { if((dtype) == NDARRAY_COMPLEX) raise_complex_NotImplementedError(); } while(0); // do/while: a following 'else' cannot bind to the inner 'if'
+#else
+    #define NOT_IMPLEMENTED_FOR_COMPLEX() // do nothing
+    #define COMPLEX_DTYPE_NOT_IMPLEMENTED(dtype) // do nothing
+#endif
+
+#endif
diff --git a/python/port/mod/ulab/numpy/compare.c b/python/port/mod/ulab/numpy/compare.c
index b9154569c..5a820725c 100644
--- a/python/port/mod/ulab/numpy/compare.c
+++ b/python/port/mod/ulab/numpy/compare.c
@@ -20,11 +20,17 @@
 #include "../ulab.h"
 #include "../ndarray_operators.h"
 #include "../ulab_tools.h"
+#include "carray/carray_tools.h"
 #include "compare.h"
 
 static mp_obj_t compare_function(mp_obj_t x1, mp_obj_t x2, uint8_t op) {
     ndarray_obj_t *lhs = ndarray_from_mp_obj(x1, 0);
     ndarray_obj_t *rhs = ndarray_from_mp_obj(x2, 0);
+    #if ULAB_SUPPORTS_COMPLEX
+    if((lhs->dtype == NDARRAY_COMPLEX) || (rhs->dtype == NDARRAY_COMPLEX)) {
+        NOT_IMPLEMENTED_FOR_COMPLEX()
+    }
+    #endif
     uint8_t ndim = 0;
     size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
     int32_t *lstrides = m_new(int32_t, ULAB_MAX_DIMS);
@@ -197,6 +203,7 @@ static mp_obj_t compare_isinf_isfinite(mp_obj_t _x, uint8_t mask) {
         }
     } else if(mp_obj_is_type(_x, &ulab_ndarray_type)) {
         ndarray_obj_t *x = MP_OBJ_TO_PTR(_x);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype)
         ndarray_obj_t *results = ndarray_new_dense_ndarray(x->ndim, x->shape, NDARRAY_BOOL);
         // At this point, results is all False
         uint8_t *rarray = (uint8_t *)results->array;
@@ -313,6 +320,10 @@ mp_obj_t compare_where(mp_obj_t _condition, mp_obj_t _x, mp_obj_t _y) {
     ndarray_obj_t *x = ndarray_from_mp_obj(_x, 0);
     ndarray_obj_t *y = ndarray_from_mp_obj(_y, 0);
 
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(c->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(x->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(y->dtype)
+
     int32_t *cstrides = m_new(int32_t, ULAB_MAX_DIMS);
     int32_t *xstrides = m_new(int32_t, ULAB_MAX_DIMS);
     int32_t *ystrides = m_new(int32_t, ULAB_MAX_DIMS);
diff --git a/python/port/mod/ulab/numpy/create.c b/python/port/mod/ulab/numpy/create.c
new file mode 100644
index 000000000..883fc3261
--- /dev/null
+++ b/python/port/mod/ulab/numpy/create.c
@@ -0,0 +1,843 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020 Jeff Epler for Adafruit Industries
+ *               2019-2021 Zoltán Vörös
+ *               2020 Taku Fukada
+*/
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "py/obj.h"
+#include "py/runtime.h"
+
+#include "create.h"
+#include "../ulab.h"
+#include "../ulab_tools.h"
+
+#if ULAB_NUMPY_HAS_ONES | ULAB_NUMPY_HAS_ZEROS | ULAB_NUMPY_HAS_FULL | ULAB_NUMPY_HAS_EMPTY
+// Shared worker behind create_zeros(), create_ones() and create_full(): allocates an ndarray with the
+// requested shape and dtype and, unless value is None, stores value in every element.
+// oshape is an int (1-D length) or a tuple/list of at most ULAB_MAX_DIMS lengths; otherwise TypeError.
+static mp_obj_t create_zeros_ones_full(mp_obj_t oshape, uint8_t dtype, mp_obj_t value) {
+    if(!mp_obj_is_int(oshape) && !mp_obj_is_type(oshape, &mp_type_tuple) && !mp_obj_is_type(oshape, &mp_type_list)) {
+        mp_raise_TypeError(translate("input argument must be an integer, a tuple, or a list"));
+    }
+    ndarray_obj_t *ndarray = NULL;
+    if(mp_obj_is_int(oshape)) {
+        size_t n = mp_obj_get_int(oshape);
+        ndarray = ndarray_new_linear_array(n, dtype);
+    } else if(mp_obj_is_type(oshape, &mp_type_tuple) || mp_obj_is_type(oshape, &mp_type_list)) {
+        // keep the full-width length for the range check: narrowing to uint8_t first would let a
+        // 256-entry sequence wrap to 0, pass the check, and write past the end of shape[] below
+        size_t len = (size_t)mp_obj_get_int(mp_obj_len_maybe(oshape));
+        if(len > ULAB_MAX_DIMS) {
+            mp_raise_TypeError(translate("too many dimensions"));
+        }
+        size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+        size_t i = 0;
+        mp_obj_iter_buf_t iter_buf;
+        mp_obj_t item, iterable = mp_getiter(oshape, &iter_buf);
+        while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION){
+            shape[ULAB_MAX_DIMS - len + i] = (size_t)mp_obj_get_int(item);
+            i++;
+        }
+        ndarray = ndarray_new_dense_ndarray((uint8_t)len, shape, dtype);
+    }
+    if(value != mp_const_none) {
+        if(dtype == NDARRAY_BOOL) {
+            dtype = NDARRAY_UINT8;
+            value = mp_obj_new_int(mp_obj_is_true(value) ? 1 : 0);
+        }
+        for(size_t i=0; i < ndarray->len; i++) {
+            #if ULAB_SUPPORTS_COMPLEX
+            if(dtype == NDARRAY_COMPLEX) {
+                ndarray_set_complex_value(ndarray->array, i, value);
+            } else {
+                ndarray_set_value(dtype, ndarray->array, i, value);
+            }
+            #else
+            ndarray_set_value(dtype, ndarray->array, i, value);
+            #endif
+        }
+    }
+    // if zeros calls the function, we don't have to do anything
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+#endif
+
+#if ULAB_NUMPY_HAS_ARANGE | ULAB_NUMPY_HAS_LINSPACE
+static ndarray_obj_t *create_linspace_arange(mp_float_t start, mp_float_t step, mp_float_t stop, size_t len, uint8_t dtype) {
+    mp_float_t value = start;
+    // Allocates a 1-D array of len elements of type dtype; booleans are filled here, every other dtype by ARANGE_LOOP.
+    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, dtype);
+    if(ndarray->boolean == NDARRAY_BOOLEAN) {
+        uint8_t *array = (uint8_t *)ndarray->array;
+        for(size_t i=0; i < len; i++, value += step) {
+            *array++ = value == MICROPY_FLOAT_CONST(0.0) ? 0 : 1; // 0 only where the running value is exactly 0.0
+        }
+    } else if(dtype == NDARRAY_UINT8) { // NOTE(review): ARANGE_LOOP is defined outside this file and presumably reads the local `value` -- confirm before renaming it
+        ARANGE_LOOP(uint8_t, ndarray, len, step, stop);
+    } else if(dtype == NDARRAY_INT8) {
+        ARANGE_LOOP(int8_t, ndarray, len, step, stop);
+    } else if(dtype == NDARRAY_UINT16) {
+        ARANGE_LOOP(uint16_t, ndarray, len, step, stop);
+    } else if(dtype == NDARRAY_INT16) {
+        ARANGE_LOOP(int16_t, ndarray, len, step, stop);
+    } else { // any remaining dtype is filled as mp_float_t
+        ARANGE_LOOP(mp_float_t, ndarray, len, step, stop);
+    }
+    return ndarray;
+}
+#endif
+
+#if ULAB_NUMPY_HAS_ARANGE
+//| @overload
+//| def arange(stop: _float, step: _float = 1, *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray: ...
+//| @overload
+//| def arange(start: _float, stop: _float, step: _float = 1, *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: start
+//|       First value in the array, optional, defaults to 0
+//|     .. param: stop
+//|       Final value in the array
+//|     .. param: step
+//|       Difference between consecutive elements, optional, defaults to 1.0
+//|     .. param: dtype
+//|       Type of values in the array
+//|
+//|     Return a new 1-D array with elements ranging from ``start`` to ``stop``, with step size ``step``."""
+//|     ...
+//|
+// Implements arange(): decodes one to three positional numbers plus an optional dtype and builds the 1-D range.
+mp_obj_t create_arange(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    uint8_t dtype = NDARRAY_FLOAT; // switched to int16 below when every positional argument is an int
+    mp_float_t start, stop, step;
+    if(n_args == 1) { // arange(stop)
+        start = MICROPY_FLOAT_CONST(0.0);
+        stop = mp_obj_get_float(args[0].u_obj);
+        step = MICROPY_FLOAT_CONST(1.0);
+        if(mp_obj_is_int(args[0].u_obj)) dtype = NDARRAY_INT16;
+    } else if(n_args == 2) { // arange(start, stop)
+        start = mp_obj_get_float(args[0].u_obj);
+        stop = mp_obj_get_float(args[1].u_obj);
+        step = MICROPY_FLOAT_CONST(1.0);
+        if(mp_obj_is_int(args[0].u_obj) && mp_obj_is_int(args[1].u_obj)) dtype = NDARRAY_INT16;
+    } else if(n_args == 3) { // arange(start, stop, step)
+        start = mp_obj_get_float(args[0].u_obj);
+        stop = mp_obj_get_float(args[1].u_obj);
+        step = mp_obj_get_float(args[2].u_obj);
+        if(mp_obj_is_int(args[0].u_obj) && mp_obj_is_int(args[1].u_obj) && mp_obj_is_int(args[2].u_obj)) dtype = NDARRAY_INT16;
+    } else {
+        mp_raise_TypeError(translate("wrong number of arguments"));
+    }
+    if((MICROPY_FLOAT_C_FUN(fabs)(stop) > 32768) || (MICROPY_FLOAT_C_FUN(fabs)(start) > 32768) || (MICROPY_FLOAT_C_FUN(fabs)(step) > 32768)) {
+        dtype = NDARRAY_FLOAT; // magnitudes above 32768 are kept as floats
+    }
+    if(args[3].u_obj != mp_const_none) dtype = (uint8_t)mp_obj_get_int(args[3].u_obj); // an explicit dtype always wins
+    if(step == MICROPY_FLOAT_CONST(0.0)) { // a zero step would divide by zero below
+        mp_raise_ValueError(translate("step must not be zero"));
+    }
+    // number of steps between start and stop; zero, negative or NaN means there is nothing to generate
+    mp_float_t span = (stop - start) / step;
+    if(!(span > 0)) {
+        return MP_OBJ_FROM_PTR(ndarray_new_linear_array(0, dtype));
+    }
+    size_t len = (size_t)(MICROPY_FLOAT_C_FUN(ceil)(span)); // span > 0, hence len >= 1 and (len - 1) cannot wrap
+    stop = start + (len - 1) * step;
+    return MP_OBJ_FROM_PTR(create_linspace_arange(start, step, stop, len, dtype));
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_arange_obj, 1, create_arange);
+#endif
+
+
+#if ULAB_NUMPY_HAS_ASARRAY
+mp_obj_t create_asarray(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_obj = mp_const_none } },
+    };
+    // asarray(obj, *, dtype=None): scalars and ndarrays of a matching dtype are returned as they are, other inputs are converted.
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    // _dtype == 0 is used below as the marker for "no dtype was passed"
+    uint8_t _dtype;
+    #if ULAB_HAS_DTYPE_OBJECT
+    if(mp_obj_is_type(args[1].u_obj, &ulab_dtype_type)) {
+        dtype_obj_t *dtype = MP_OBJ_TO_PTR(args[1].u_obj);
+        _dtype = dtype->dtype;
+    } else { // this must be an integer defined as a class constant (ulab.numpy.uint8 etc.)
+        if(args[1].u_obj == mp_const_none) {
+            _dtype = 0;
+        } else {
+            _dtype = mp_obj_get_int(args[1].u_obj);
+        }
+    }
+    #else
+    if(args[1].u_obj == mp_const_none) {
+        _dtype = 0;
+    } else {
+        _dtype = mp_obj_get_int(args[1].u_obj);
+    }
+    #endif
+    // dispatch on the kind of input: scalar, ndarray, array-like, anything else
+    if(ulab_tools_mp_obj_is_scalar(args[0].u_obj)) {
+        return args[0].u_obj; // scalars are handed back untouched, dtype is not applied
+    } else if(mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) {
+        ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
+        if((_dtype == ndarray->dtype) || (_dtype == 0)) {
+            return args[0].u_obj; // same object, no copy
+        } else {
+            return MP_OBJ_FROM_PTR(ndarray_copy_view_convert_type(ndarray, _dtype));
+        }
+    } else if(ndarray_object_is_array_like(args[0].u_obj)) {
+        if(_dtype == 0) {
+            _dtype = NDARRAY_FLOAT; // default dtype for converted array-likes
+        }
+        return MP_OBJ_FROM_PTR(ndarray_from_iterable(args[0].u_obj, _dtype));
+    } else {
+        mp_raise_TypeError(translate("wrong input type"));
+    }
+    return mp_const_none; // this should never happen
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_asarray_obj, 1, create_asarray);
+#endif
+
+#if ULAB_NUMPY_HAS_CONCATENATE
+//| def concatenate(arrays: Tuple[ulab.numpy.ndarray], *, axis: int = 0) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: arrays
+//|       tuple of ndarrays
+//|     .. param: axis
+//|       axis along which the arrays will be joined
+//|
+//|     Join a sequence of arrays along an existing axis."""
+//|     ...
+//|
+// Implements concatenate(): copies a tuple of ndarrays with equal dtype/ndim into one new dense ndarray along `axis`.
+mp_obj_t create_concatenate(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = 0 } },
+    };
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    if(!mp_obj_is_type(args[0].u_obj, &mp_type_tuple)) {
+        mp_raise_TypeError(translate("first argument must be a tuple of ndarrays"));
+    }
+    int8_t axis = (int8_t)args[1].u_int;
+    size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+    mp_obj_tuple_t *ndarrays = MP_OBJ_TO_PTR(args[0].u_obj);
+    // every item is dereferenced as an ndarray below, so reject an empty tuple and non-ndarray items first
+    if(ndarrays->len == 0) {
+        mp_raise_ValueError(translate("need at least one array to concatenate"));
+    }
+    for(size_t i=0; i < ndarrays->len; i++) {
+        if(!mp_obj_is_type(ndarrays->items[i], &ulab_ndarray_type)) {
+            mp_raise_TypeError(translate("first argument must be a tuple of ndarrays"));
+        }
+    }
+    ndarray_obj_t *_ndarray = MP_OBJ_TO_PTR(ndarrays->items[0]);
+    uint8_t dtype = _ndarray->dtype;
+    uint8_t ndim = _ndarray->ndim;
+    if(axis < 0) axis += ndim;
+    if((axis < 0) || (axis >= ndim)) {
+        mp_raise_ValueError(translate("wrong axis specified"));
+    }
+    // shift axis
+    axis = ULAB_MAX_DIMS - ndim + axis;
+    for(uint8_t j=0; j < ULAB_MAX_DIMS; j++) {
+        shape[j] = _ndarray->shape[j];
+    }
+
+    for(size_t i=1; i < ndarrays->len; i++) { // size_t counter: a uint8_t one never terminates past 255 items
+        _ndarray = MP_OBJ_TO_PTR(ndarrays->items[i]);
+        // check, whether the arrays are compatible
+        if((dtype != _ndarray->dtype) || (ndim != _ndarray->ndim)) {
+            mp_raise_ValueError(translate("input arrays are not compatible"));
+        }
+        for(uint8_t j=0; j < ULAB_MAX_DIMS; j++) {
+            if(j == axis) {
+                shape[j] += _ndarray->shape[j];
+            } else {
+                if(shape[j] != _ndarray->shape[j]) {
+                    mp_raise_ValueError(translate("input arrays are not compatible"));
+                }
+            }
+        }
+    }
+
+    ndarray_obj_t *target = ndarray_new_dense_ndarray(ndim, shape, dtype);
+    uint8_t *tpos = (uint8_t *)target->array;
+    for(size_t p=0; p < ndarrays->len; p++) {
+        // reset the pointer along the axis
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(ndarrays->items[p]);
+        uint8_t *sarray = (uint8_t *)source->array;
+        uint8_t *tarray = tpos;
+        #if ULAB_MAX_DIMS > 3
+        size_t i = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 2
+            size_t j = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 1
+                size_t k = 0;
+                do {
+                #endif
+                    size_t l = 0;
+                    do {
+                        memcpy(tarray, sarray, source->itemsize);
+                        tarray += target->strides[ULAB_MAX_DIMS - 1];
+                        sarray += source->strides[ULAB_MAX_DIMS - 1];
+                        l++;
+                    } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+                #if ULAB_MAX_DIMS > 1
+                    tarray -= target->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                    tarray += target->strides[ULAB_MAX_DIMS - 2];
+                    sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                    sarray += source->strides[ULAB_MAX_DIMS - 2];
+                    k++;
+                } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+                #endif
+            #if ULAB_MAX_DIMS > 2
+                tarray -= target->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                tarray += target->strides[ULAB_MAX_DIMS - 3];
+                sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                sarray += source->strides[ULAB_MAX_DIMS - 3];
+                j++;
+            } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+            #endif
+        #if ULAB_MAX_DIMS > 3
+            tarray -= target->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+            tarray += target->strides[ULAB_MAX_DIMS - 4];
+            sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+            sarray += source->strides[ULAB_MAX_DIMS - 4];
+            i++;
+        } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+        #endif
+        if(p < ndarrays->len - 1) {
+            tpos += target->strides[axis] * source->shape[axis];
+        }
+    }
+    return MP_OBJ_FROM_PTR(target);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_concatenate_obj, 1, create_concatenate);
+#endif
+
+#if ULAB_MAX_DIMS > 1
+#if ULAB_NUMPY_HAS_DIAG
+//| def diag(a: ulab.numpy.ndarray, *, k: int = 0) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: a
+//|       an ndarray
+//|     .. param: k
+//|       Offset of the diagonal from the main diagonal. Can be positive or negative.
+//|
+//|     Return specified diagonals."""
+//|     ...
+//|
+// Implements diag(): extracts the k-th diagonal of a 2-D input, or builds a square 2-D array holding a 1-D input on its k-th diagonal.
+mp_obj_t create_diag(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_k, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = 0 } },
+    };
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    ndarray_obj_t *source = ndarray_from_iterable(args[0].u_obj, NDARRAY_FLOAT);
+    ndarray_obj_t *target = NULL;
+    int32_t k = args[1].u_int;
+    size_t k_abs = k >= 0 ? (size_t)k : (size_t)(-k);
+    if(source->ndim == 2) { // return the diagonal
+        size_t rows = source->shape[ULAB_MAX_DIMS - 2], cols = source->shape[ULAB_MAX_DIMS - 1];
+        size_t len;
+        if(k >= 0) { // starts at (0, k): limited by the remaining columns and by the number of rows
+            len = (k_abs <= cols) ? cols - k_abs : 0;
+            if(len > rows) len = rows;
+        } else { // starts at (|k|, 0): limited by the remaining rows and by the number of columns
+            len = (k_abs <= rows) ? rows - k_abs : 0;
+            if(len > cols) len = cols;
+        }
+        target = ndarray_new_linear_array(len, source->dtype);
+
+        if(len == 0) {
+            return MP_OBJ_FROM_PTR(target);
+        }
+
+        uint8_t *sarray = (uint8_t *)source->array;
+        uint8_t *tarray = (uint8_t *)target->array;
+        if(k >= 0) {
+            sarray += source->strides[ULAB_MAX_DIMS - 1] * k;
+        } else {
+            sarray += source->strides[ULAB_MAX_DIMS - 2] * k_abs;
+        }
+        for(size_t i=0; i < len; i++) {
+            memcpy(tarray, sarray, source->itemsize);
+            sarray += (source->strides[ULAB_MAX_DIMS - 1] + source->strides[ULAB_MAX_DIMS - 2]); // one row down, one column right
+            tarray += target->itemsize;
+        }
+    } else if(source->ndim == 1) { // return a rank-2 tensor with the prescribed diagonal
+        size_t len = source->len + k_abs;
+        target = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, len, len), source->dtype);
+        uint8_t *sarray = (uint8_t *)source->array;
+        uint8_t *tarray = (uint8_t *)target->array;
+
+        if(k < 0) {
+            tarray += len * k_abs * target->itemsize;
+        } else {
+            tarray += k_abs * target->itemsize;
+        }
+        for(size_t i = 0; i < source->len; i++) {
+            memcpy(tarray, sarray, source->itemsize);
+            sarray += source->strides[ULAB_MAX_DIMS - 1];
+            tarray += (len + 1) * target->itemsize;
+        }
+    }
+    #if ULAB_MAX_DIMS > 2
+    else {
+        mp_raise_ValueError(translate("input must be 1- or 2-d"));
+    }
+    #endif
+
+    return MP_OBJ_FROM_PTR(target);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_diag_obj, 1, create_diag);
+#endif /* ULAB_NUMPY_HAS_DIAG */
+
+#if ULAB_NUMPY_HAS_EMPTY
+// This function is bound in numpy.c to numpy.zeros(), and is simply an alias for that
+
+//| def empty(shape: Union[int, Tuple[int, ...]], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|    """
+//|    .. param: shape
+//|       Shape of the array, either an integer (for a 1-D array) or a tuple of 2 integers (for a 2-D array)
+//|    .. param: dtype
+//|       Type of values in the array
+//|
+//|    Return a new array of the given shape with all elements set to 0. An alias for numpy.zeros."""
+//|    ...
+//|
+#endif
+
+#if ULAB_NUMPY_HAS_EYE
+//| def eye(size: int, *, M: Optional[int] = None, k: int = 0, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|     """Return a new square array of size, with the diagonal elements set to 1
+//|        and the other elements set to 0. If k is given, the diagonal is shifted by the specified amount."""
+//|     ...
+//|
+// Implements eye(): creates an n-by-m array (m defaults to n) and writes 1 along the diagonal shifted by k.
+mp_obj_t create_eye(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_INT, { .u_int = 0 } },
+        { MP_QSTR_M, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_k, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = 0 } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    // n: number of rows, m: number of columns, k: absolute value of the diagonal offset
+    size_t n = args[0].u_int, m;
+    size_t k = args[2].u_int > 0 ? (size_t)args[2].u_int : (size_t)(-args[2].u_int);
+    uint8_t dtype = args[3].u_int;
+    if(args[1].u_rom_obj == mp_const_none) {
+        m = n;
+    } else {
+        m = mp_obj_get_int(args[1].u_rom_obj);
+    }
+    ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, n, m), dtype);
+    if(dtype == NDARRAY_BOOL) {
+        dtype = NDARRAY_UINT8;
+    }
+    mp_obj_t one = mp_obj_new_int(1);
+    size_t i = 0;
+    if((args[2].u_int >= 0)) {
+        while((k < m) && (i < n)) { // element (row i, column k); the row bound matters when m > n + k
+            ndarray_set_value(dtype, ndarray->array, i*m+k, one);
+            k++;
+            i++;
+        }
+    } else {
+        while((k < n) && (i < m)) { // element (row k, column i); the column bound matters when n > m + |k|
+            ndarray_set_value(dtype, ndarray->array, k*m+i, one);
+            k++;
+            i++;
+        }
+    }
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_eye_obj, 1, create_eye);
+#endif /* ULAB_NUMPY_HAS_EYE */
+#endif /* ULAB_MAX_DIMS > 1 */
+
+#if ULAB_NUMPY_HAS_FULL
+//| def full(shape: Union[int, Tuple[int, ...]], fill_value: Union[_float, _bool], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|    """
+//|    .. param: shape
+//|       Shape of the array, either an integer (for a 1-D array) or a tuple of integers (for tensors of higher rank)
+//|    .. param: fill_value
+//|       scalar, the value with which the array is filled
+//|    .. param: dtype
+//|       Type of values in the array
+//|
+//|    Return a new array of the given shape with all elements set to ``fill_value``."""
+//|    ...
+//|
+// Implements full(shape, fill_value, *, dtype): forwards to create_zeros_ones_full() with fill_value as the stored value.
+mp_obj_t create_full(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    // args[0]: shape, args[1]: fill value, args[2]: dtype (NDARRAY_FLOAT unless given)
+    uint8_t dtype = args[2].u_int;
+
+    return create_zeros_ones_full(args[0].u_obj, dtype, args[1].u_obj);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_full_obj, 0, create_full);
+#endif
+
+
+#if ULAB_NUMPY_HAS_LINSPACE
+//| def linspace(
+//|     start: _float,
+//|     stop: _float,
+//|     *,
+//|     dtype: _DType = ulab.numpy.float,
+//|     num: int = 50,
+//|     endpoint: _bool = True,
+//|     retstep: _bool = False
+//| ) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: start
+//|       First value in the array
+//|     .. param: stop
+//|       Final value in the array
+//|     .. param int: num
+//|       Count of values in the array.
+//|     .. param: dtype
+//|       Type of values in the array
+//|     .. param bool: endpoint
+//|       Whether the ``stop`` value is included.  Note that even when
+//|       endpoint=True, the exact ``stop`` value may not be included due to the
+//|       inaccuracy of floating point arithmetic.
+//|      .. param bool: retstep,
+//|       If True, return (`samples`, `step`), where `step` is the spacing between samples.
+//|
+//|     Return a new 1-D array with ``num`` elements ranging from ``start`` to ``stop`` linearly."""
+//|     ...
+//|
+// Implements linspace(start, stop, num=50, *, endpoint=True, retstep=False, dtype=float); num must be at least 2.
+mp_obj_t create_linspace(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_num, MP_ARG_INT, { .u_int = 50 } },
+        { MP_QSTR_endpoint, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_true } },
+        { MP_QSTR_retstep, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_false } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    if(args[2].u_int < 2) {
+        mp_raise_ValueError(translate("number of points must be at least 2"));
+    }
+    size_t len = (size_t)args[2].u_int;
+    mp_float_t start, step, stop;
+
+    ndarray_obj_t *ndarray = NULL;
+
+    #if ULAB_SUPPORTS_COMPLEX
+    mp_float_t step_real, step_imag;
+    bool complex_out = false;
+    // a complex start or stop yields a complex array; the dtype argument is not consulted on this path
+    if(mp_obj_is_type(args[0].u_obj, &mp_type_complex) || mp_obj_is_type(args[1].u_obj, &mp_type_complex)) {
+        complex_out = true;
+        ndarray = ndarray_new_linear_array(len, NDARRAY_COMPLEX);
+        mp_float_t *array = (mp_float_t *)ndarray->array;
+        mp_float_t start_real, start_imag;
+        mp_float_t stop_real, stop_imag;
+
+        mp_obj_get_complex(args[0].u_obj, &start_real, &start_imag);
+        mp_obj_get_complex(args[1].u_obj, &stop_real, &stop_imag);
+        if(mp_obj_is_true(args[3].u_obj)) { // endpoint: any truthy value counts, not only the True singleton
+            step_real = (stop_real - start_real) / (len - 1);
+            step_imag = (stop_imag - start_imag) / (len - 1);
+        } else {
+            step_real = (stop_real - start_real) / len;
+            step_imag = (stop_imag - start_imag) / len;
+        }
+        // elements are stored as consecutive (real, imaginary) pairs
+        for(size_t i = 0; i < len; i++) {
+            *array++ = start_real;
+            *array++ = start_imag;
+            start_real += step_real;
+            start_imag += step_imag;
+        }
+    } else {
+    #endif
+        start = mp_obj_get_float(args[0].u_obj);
+        stop = mp_obj_get_float(args[1].u_obj);
+
+        uint8_t typecode = args[5].u_int;
+
+        if(mp_obj_is_true(args[3].u_obj)) { // endpoint: any truthy value counts, not only the True singleton
+            step = (stop - start) / (len - 1);
+        } else {
+            step = (stop - start) / len;
+            stop = start + step * (len - 1); // last generated sample once the endpoint is excluded
+        }
+
+        ndarray = create_linspace_arange(start, step, stop, len, typecode);
+    #if ULAB_SUPPORTS_COMPLEX
+    }
+    #endif
+    // retstep: when truthy, return the tuple (samples, step) instead of the bare array
+    if(!mp_obj_is_true(args[4].u_obj)) {
+        return MP_OBJ_FROM_PTR(ndarray);
+    } else {
+        mp_obj_t tuple[2];
+        tuple[0] = MP_OBJ_FROM_PTR(ndarray);
+        #if ULAB_SUPPORTS_COMPLEX
+        if(complex_out) {
+            tuple[1] = mp_obj_new_complex(step_real, step_imag);
+        } else {
+            tuple[1] = mp_obj_new_float(step);
+        }
+        #else /* ULAB_SUPPORTS_COMPLEX */
+        tuple[1] = mp_obj_new_float(step);
+        #endif
+
+        return mp_obj_new_tuple(2, tuple);
+    }
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_linspace_obj, 2, create_linspace);
+#endif
+
+#if ULAB_NUMPY_HAS_LOGSPACE
+//| def logspace(
+//|     start: _float,
+//|     stop: _float,
+//|     *,
+//|     dtype: _DType = ulab.numpy.float,
+//|     num: int = 50,
+//|     endpoint: _bool = True,
+//|     base: _float = 10.0
+//| ) -> ulab.numpy.ndarray:
+//|     """
+//|     .. param: start
+//|       First value in the array
+//|     .. param: stop
+//|       Final value in the array
+//|     .. param int: num
+//|       Count of values in the array. Defaults to 50.
+//|     .. param: base
+//|       The base of the log space. The step size between the elements in
+//|       ``ln(samples) / ln(base)`` (or ``log_base(samples)``) is uniform. Defaults to 10.0.
+//|     .. param: dtype
+//|       Type of values in the array
+//|     .. param bool: endpoint
+//|       Whether the ``stop`` value is included.  Note that even when
+//|       endpoint=True, the exact ``stop`` value may not be included due to the
+//|       inaccuracy of floating point arithmetic. Defaults to True.
+//|
+//|     Return a new 1-D array with ``num`` evenly spaced elements on a log scale.
+//|     The sequence starts at ``base ** start``, and ends with ``base ** stop``."""
+//|     ...
+//|
+
+const mp_obj_float_t create_float_const_ten = {{&mp_type_float}, MICROPY_FLOAT_CONST(10.0)};
+// Implements logspace(start, stop, num=50, *, base=10.0, endpoint=True, dtype=float); num must be at least 2.
+mp_obj_t create_logspace(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_num, MP_ARG_INT, { .u_int = 50 } },
+        { MP_QSTR_base, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_PTR(&create_float_const_ten) } },
+        { MP_QSTR_endpoint, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_true } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    if(args[2].u_int < 2) {
+        mp_raise_ValueError(translate("number of points must be at least 2"));
+    }
+    size_t len = (size_t)args[2].u_int;
+    mp_float_t start, step, quotient;
+    start = mp_obj_get_float(args[0].u_obj);
+    uint8_t dtype = args[5].u_int;
+    mp_float_t base = mp_obj_get_float(args[3].u_obj);
+    if(mp_obj_is_true(args[4].u_obj)) step = (mp_obj_get_float(args[1].u_obj) - start)/(len - 1); // endpoint: any truthy value counts
+    else step = (mp_obj_get_float(args[1].u_obj) - start) / len;
+    quotient = MICROPY_FLOAT_C_FUN(pow)(base, step); // constant ratio between consecutive samples
+    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, dtype);
+    // the first sample is base ** start; each following one is the previous sample times quotient
+    mp_float_t value = MICROPY_FLOAT_C_FUN(pow)(base, start);
+    if(ndarray->dtype == NDARRAY_UINT8) {
+        uint8_t *array = (uint8_t *)ndarray->array;
+        if(ndarray->boolean) {
+            memset(array, 1, len); // boolean arrays: every element is set to 1
+        } else {
+            for(size_t i=0; i < len; i++, value *= quotient) *array++ = (uint8_t)value;
+        }
+    } else if(ndarray->dtype == NDARRAY_INT8) {
+        int8_t *array = (int8_t *)ndarray->array;
+        for(size_t i=0; i < len; i++, value *= quotient) *array++ = (int8_t)value;
+    } else if(ndarray->dtype == NDARRAY_UINT16) {
+        uint16_t *array = (uint16_t *)ndarray->array;
+        for(size_t i=0; i < len; i++, value *= quotient) *array++ = (uint16_t)value;
+    } else if(ndarray->dtype == NDARRAY_INT16) {
+        int16_t *array = (int16_t *)ndarray->array;
+        for(size_t i=0; i < len; i++, value *= quotient) *array++ = (int16_t)value;
+    } else {
+        mp_float_t *array = (mp_float_t *)ndarray->array;
+        for(size_t i=0; i < len; i++, value *= quotient) *array++ = value;
+    }
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_logspace_obj, 2, create_logspace);
+#endif
+
+#if ULAB_NUMPY_HAS_ONES
+//| def ones(shape: Union[int, Tuple[int, ...]], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|    """
+//|    .. param: shape
+//|       Shape of the array, either an integer (for a 1-D array) or a tuple of 2 integers (for a 2-D array)
+//|    .. param: dtype
+//|       Type of values in the array
+//|
+//|    Return a new array of the given shape with all elements set to 1."""
+//|    ...
+//|
+// Implements ones(shape, *, dtype): forwards to create_zeros_ones_full() with the integer 1 as the fill value.
+mp_obj_t create_ones(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    // args[0]: shape, args[1]: dtype (NDARRAY_FLOAT unless given)
+    uint8_t dtype = args[1].u_int;
+    mp_obj_t one = mp_obj_new_int(1);
+    return create_zeros_ones_full(args[0].u_obj, dtype, one);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_ones_obj, 0, create_ones);
+#endif
+
+#if ULAB_NUMPY_HAS_ZEROS
+//| def zeros(shape: Union[int, Tuple[int, ...]], *, dtype: _DType = ulab.numpy.float) -> ulab.numpy.ndarray:
+//|    """
+//|    .. param: shape
+//|       Shape of the array, either an integer (for a 1-D array) or a tuple of 2 integers (for a 2-D array)
+//|    .. param: dtype
+//|       Type of values in the array
+//|
+//|    Return a new array of the given shape with all elements set to 0."""
+//|    ...
+//|
+
+mp_obj_t create_zeros(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    // zeros(shape, *, dtype=float): one required positional argument plus a keyword-only dtype
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_obj = MP_OBJ_NULL } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+    };
+    mp_arg_val_t parsed[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, parsed);
+    // mp_const_none is what this function passes as the value argument of the shared helper
+    uint8_t dtype = parsed[1].u_int;
+    return create_zeros_ones_full(parsed[0].u_obj, dtype, mp_const_none);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_zeros_obj, 0, create_zeros);
+#endif
+
+#if ULAB_NUMPY_HAS_FROMBUFFER
+mp_obj_t create_frombuffer(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    // frombuffer(buffer, *, dtype=float, count=-1, offset=0): 1D array on top of an existing buffer
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(NDARRAY_FLOAT) } },
+        { MP_QSTR_count, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(-1) } },
+        { MP_QSTR_offset, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(0) } },
+    };
+    mp_arg_val_t parsed[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, parsed);
+
+    uint8_t dtype = mp_obj_get_int(parsed[1].u_obj);
+    size_t offset = mp_obj_get_int(parsed[3].u_obj);
+    mp_buffer_info_t bufinfo;
+    if(!mp_get_buffer(parsed[0].u_obj, &bufinfo, MP_BUFFER_READ)) {
+        // when mp_get_buffer() reports failure, None is returned instead of raising
+        return mp_const_none;
+    }
+    size_t itemsize = ulab_binary_get_size(dtype);
+    if(offset > bufinfo.len) {
+        mp_raise_ValueError(translate("offset must be non-negative and no greater than buffer length"));
+    }
+    // the bytes that follow the offset must hold a whole number of items
+    size_t available = bufinfo.len - offset;
+    size_t items = available / itemsize;
+    if((items * itemsize) != available) {
+        mp_raise_ValueError(translate("buffer size must be a multiple of element size"));
+    }
+    // a positive count shortens the array; zero or a negative count keeps every item
+    mp_int_t requested = mp_obj_get_int(parsed[2].u_obj);
+    if(requested > 0) {
+        if(items < (size_t)requested) {
+            mp_raise_ValueError(translate("buffer is smaller than requested size"));
+        }
+        items = (size_t)requested;
+    }
+    // the header is filled in by hand: array points into the caller's buffer, nothing is copied
+    ndarray_obj_t *view = m_new_obj(ndarray_obj_t);
+    view->base.type = &ulab_ndarray_type;
+    view->dtype = dtype == NDARRAY_BOOL ? NDARRAY_UINT8 : dtype;
+    view->boolean = dtype == NDARRAY_BOOL ? NDARRAY_BOOLEAN : NDARRAY_NUMERIC;
+    view->ndim = 1;
+    view->len = items;
+    view->itemsize = itemsize;
+    view->shape[ULAB_MAX_DIMS - 1] = items;
+    view->strides[ULAB_MAX_DIMS - 1] = itemsize;
+    view->array = (uint8_t *)bufinfo.buf + offset;
+    return MP_OBJ_FROM_PTR(view);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(create_frombuffer_obj, 1, create_frombuffer);
+#endif
diff --git a/python/port/mod/ulab/numpy/create.h b/python/port/mod/ulab/numpy/create.h
new file mode 100644
index 000000000..6e54b10e7
--- /dev/null
+++ b/python/port/mod/ulab/numpy/create.h
@@ -0,0 +1,84 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2020 Jeff Epler for Adafruit Industries
+ *               2019-2021 Zoltán Vörös
+*/
+
+#ifndef _CREATE_
+#define _CREATE_
+
+#include "../ulab.h"
+#include "../ndarray.h"
+
+#if ULAB_NUMPY_HAS_ARANGE
+mp_obj_t create_arange(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_arange_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_ASARRAY
+mp_obj_t create_asarray(size_t , const mp_obj_t *, mp_map_t *); // prototype paired with create_asarray_obj below
+MP_DECLARE_CONST_FUN_OBJ_KW(create_asarray_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_CONCATENATE
+mp_obj_t create_concatenate(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_concatenate_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_DIAG
+mp_obj_t create_diag(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_diag_obj);
+#endif
+
+#if ULAB_MAX_DIMS > 1
+#if ULAB_NUMPY_HAS_EYE
+mp_obj_t create_eye(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_eye_obj);
+#endif
+#endif
+
+#if ULAB_NUMPY_HAS_FULL
+mp_obj_t create_full(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_full_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_LINSPACE
+mp_obj_t create_linspace(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_linspace_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_LOGSPACE
+mp_obj_t create_logspace(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_logspace_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_ONES
+mp_obj_t create_ones(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_ones_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_ZEROS
+mp_obj_t create_zeros(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_zeros_obj);
+#endif
+
+#if ULAB_NUMPY_HAS_FROMBUFFER
+mp_obj_t create_frombuffer(size_t , const mp_obj_t *, mp_map_t *);
+MP_DECLARE_CONST_FUN_OBJ_KW(create_frombuffer_obj);
+#endif
+
+#define ARANGE_LOOP(type_, ndarray, len, step, stop) /* fills (ndarray)->array with len items cast to type_ */ \
+({\
+    type_ *array = (type_ *)(ndarray)->array;\
+    for (size_t i = 0; i < (len) - 1; i++, (value) += (step)) { /* NOTE: `value` is not a parameter, it is taken from the expanding scope; presumably len >= 1, confirm at call sites */\
+        *array++ = (type_)(value);\
+    }\
+    *array = (type_)(stop); /* the last item is written from stop, not from the running value */\
+})
+
+#endif
diff --git a/python/port/mod/ulab/numpy/fft/fft.c b/python/port/mod/ulab/numpy/fft/fft.c
index 5c6af832d..31a8712eb 100644
--- a/python/port/mod/ulab/numpy/fft/fft.c
+++ b/python/port/mod/ulab/numpy/fft/fft.c
@@ -20,11 +20,13 @@
 #include "py/obj.h"
 #include "py/objarray.h"
 
+#include "../carray/carray_tools.h"
 #include "fft.h"
 
 //| """Frequency-domain functions"""
 //|
 //| import ulab.numpy
+//| import ulab.utils
 
 
 //| def fft(r: ulab.numpy.ndarray, c: Optional[ulab.numpy.ndarray] = None) -> Tuple[ulab.numpy.ndarray, ulab.numpy.ndarray]:
@@ -35,10 +37,17 @@
 //|
 //|     Perform a Fast Fourier Transform from the time domain into the frequency domain
 //|
-//|     See also ~ulab.extras.spectrum, which computes the magnitude of the fft,
+//|     See also `ulab.utils.spectrogram`, which computes the magnitude of the fft,
 //|     rather than separately returning its real and imaginary parts."""
 //|     ...
 //|
+#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+static mp_obj_t fft_fft(mp_obj_t arg) { // single-argument variant: forwards arg with the FFT_FFT selector
+    return fft_fft_ifft_spectrogram(arg, FFT_FFT);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(fft_fft_obj, fft_fft);
+#else
 static mp_obj_t fft_fft(size_t n_args, const mp_obj_t *args) {
     if(n_args == 2) {
         return fft_fft_ifft_spectrogram(n_args, args[0], args[1], FFT_FFT);
@@ -48,6 +57,7 @@ static mp_obj_t fft_fft(size_t n_args, const mp_obj_t *args) {
 }
 
 MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(fft_fft_obj, 1, 2, fft_fft);
+#endif
 
 //| def ifft(r: ulab.numpy.ndarray, c: Optional[ulab.numpy.ndarray] = None) -> Tuple[ulab.numpy.ndarray, ulab.numpy.ndarray]:
 //|     """
@@ -55,11 +65,19 @@ MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(fft_fft_obj, 1, 2, fft_fft);
 //|     :param ulab.numpy.ndarray c: An optional 1-dimension array of values whose size is a power of 2, giving the complex part of the value
 //|     :return tuple (r, c): The real and complex parts of the inverse FFT
 //|
-//|     Perform an Inverse Fast Fourier Transform from the frequency domain into the time domain"""
+//|     Perform an Inverse Fast Fourier Transform from the frequency domain into the time domain"""
 //|     ...
 //|
 
+#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+static mp_obj_t fft_ifft(mp_obj_t arg) { // single-argument variant: forwards arg with the FFT_IFFT selector
+    return fft_fft_ifft_spectrogram(arg, FFT_IFFT);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(fft_ifft_obj, fft_ifft);
+#else
 static mp_obj_t fft_ifft(size_t n_args, const mp_obj_t *args) {
+    NOT_IMPLEMENTED_FOR_COMPLEX()
     if(n_args == 2) {
         return fft_fft_ifft_spectrogram(n_args, args[0], args[1], FFT_IFFT);
     } else {
@@ -68,6 +86,7 @@ static mp_obj_t fft_ifft(size_t n_args, const mp_obj_t *args) {
 }
 
 MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(fft_ifft_obj, 1, 2, fft_ifft);
+#endif
 
 STATIC const mp_rom_map_elem_t ulab_fft_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_fft) },
diff --git a/python/port/mod/ulab/numpy/fft/fft.h b/python/port/mod/ulab/numpy/fft/fft.h
index 66acafe11..7a1669841 100644
--- a/python/port/mod/ulab/numpy/fft/fft.h
+++ b/python/port/mod/ulab/numpy/fft/fft.h
@@ -19,6 +19,12 @@
 
 extern mp_obj_module_t ulab_fft_module;
 
+#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+MP_DECLARE_CONST_FUN_OBJ_1(fft_fft_obj); // one-argument objects: fft.c defines both with MP_DEFINE_CONST_FUN_OBJ_1
+MP_DECLARE_CONST_FUN_OBJ_1(fft_ifft_obj);
+#else
 MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(fft_fft_obj);
 MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(fft_ifft_obj);
 #endif
+
+#endif
diff --git a/python/port/mod/ulab/numpy/fft/fft_tools.c b/python/port/mod/ulab/numpy/fft/fft_tools.c
index 6dd2ca47c..8a55927e3 100644
--- a/python/port/mod/ulab/numpy/fft/fft_tools.c
+++ b/python/port/mod/ulab/numpy/fft/fft_tools.c
@@ -9,10 +9,12 @@
 */
 
 #include <math.h>
+#include <string.h>
 #include "py/runtime.h"
 
 #include "../../ndarray.h"
 #include "../../ulab_tools.h"
+#include "../carray/carray_tools.h"
 #include "fft_tools.h"
 
 #ifndef MP_PI
@@ -22,7 +24,8 @@
 #define MP_E MICROPY_FLOAT_CONST(2.71828182845904523536)
 #endif
 
-/*
+/* Kernel implementation for the case, when ulab has no complex support
+
  * The following function takes two arrays, namely, the real and imaginary
  * parts of a complex array, and calculates the Fourier transform in place.
  *
@@ -31,6 +34,128 @@
  * and can be used independent of ulab.
  */
 
+#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+/* Kernel implementation for the complex case. Data are contained in data as
+
+    data[0], data[1], data[2], data[3], .... , data[2n - 2], data[2n-1]
+    real[0], imag[0], real[1], imag[1], .... , real[n-1],    imag[n-1]
+
+    In general
+    real[i] = data[2i]
+    imag[i] = data[2i+1]
+
+*/
+void fft_kernel_complex(mp_float_t *data, size_t n, int isign) { // transforms the n interleaved complex values of data in place
+    size_t j, m, mmax, istep;
+    mp_float_t tempr, tempi;
+    mp_float_t wtemp, wr, wpr, wpi, wi, theta;
+
+    j = 0;
+    for(size_t i = 0; i < n; i++) { // first pass: bit-reversal permutation of the complex elements
+        if (j > i) { // swap each pair only once
+            SWAP(mp_float_t, data[2*i], data[2*j]);
+            SWAP(mp_float_t, data[2*i+1], data[2*j+1]);
+        }
+        m = n >> 1;
+        while (j >= m && m > 0) { // increment j as a bit-reversed counter: clear leading set bits ...
+            j -= m;
+            m >>= 1;
+        }
+        j += m; // ... and set the first clear one
+    }
+
+    mmax = 1;
+    while (n > mmax) { // second pass: butterflies over sub-transforms of doubling size (mmax = 1, 2, 4, ...)
+        istep = mmax << 1;
+        theta = MICROPY_FLOAT_CONST(-2.0)*isign*MP_PI/istep; // isign enters only through the sign of the angle
+        wtemp = MICROPY_FLOAT_C_FUN(sin)(MICROPY_FLOAT_CONST(0.5) * theta);
+        wpr = MICROPY_FLOAT_CONST(-2.0) * wtemp * wtemp; // equals cos(theta) - 1
+        wpi = MICROPY_FLOAT_C_FUN(sin)(theta);
+        wr = MICROPY_FLOAT_CONST(1.0); // (wr, wi) starts at 1 + 0i
+        wi = MICROPY_FLOAT_CONST(0.0);
+        for(m = 0; m < mmax; m++) {
+            for(size_t i = m; i < n; i += istep) {
+                j = i + mmax;
+                tempr = wr * data[2*j] - wi * data[2*j+1]; // (tempr, tempi) = (wr, wi) * data[j], complex product
+                tempi = wr * data[2*j+1] + wi * data[2*j];
+                data[2*j] = data[2*i] - tempr;
+                data[2*j+1] = data[2*i+1] - tempi;
+                data[2*i] += tempr;
+                data[2*i+1] += tempi;
+            }
+            wtemp = wr; // keep the old wr: it is still needed for the update of wi
+            wr = wr*wpr - wi*wpi + wr; // rotate (wr, wi) by theta
+            wi = wi*wpr + wtemp*wpi + wi;
+        }
+        mmax = istep;
+    }
+}
+
+/*
+ * Helper interface to the python side, shared by fft, ifft and spectrogram. data_in must
+ * be a one-dimensional ndarray whose length is a power of 2; it is copied into a new complex
+ * array, which is transformed in place and returned (FFT_SPECTROGRAM returns the magnitudes).
+ */
+mp_obj_t fft_fft_ifft_spectrogram(mp_obj_t data_in, uint8_t type) {
+    if(!mp_obj_is_type(data_in, &ulab_ndarray_type)) {
+        mp_raise_NotImplementedError(translate("FFT is defined for ndarrays only"));
+    }
+    ndarray_obj_t *in = MP_OBJ_TO_PTR(data_in);
+    #if ULAB_MAX_DIMS > 1
+    if(in->ndim != 1) {
+        mp_raise_TypeError(translate("FFT is implemented for linear arrays only"));
+    }
+    #endif
+    size_t len = in->len;
+    // Check if input is of length of power of 2
+    if((len & (len-1)) != 0) {
+        mp_raise_ValueError(translate("input array length must be power of 2"));
+    }
+
+    ndarray_obj_t *out = ndarray_new_linear_array(len, NDARRAY_COMPLEX);
+    mp_float_t *data = (mp_float_t *)out->array;
+    uint8_t *array = (uint8_t *)in->array;
+
+    if(in->dtype == NDARRAY_COMPLEX) {
+        uint8_t sz = 2 * sizeof(mp_float_t);
+        for(size_t i = 0; i < len; i++) {
+            memcpy(data, array, sz); // one (real, imag) pair per iteration
+            data += 2; // step to the next output slot, exactly as the real-input branch does
+            array += in->strides[ULAB_MAX_DIMS - 1];
+        }
+    } else {
+        mp_float_t (*func)(void *) = ndarray_get_float_function(in->dtype);
+        for(size_t i = 0; i < len; i++) {
+            // real part; the imaginary part is 0, no need to assign
+            *data = func(array);
+            data += 2;
+            array += in->strides[ULAB_MAX_DIMS - 1];
+        }
+    }
+    data -= 2 * len; // both branches advanced data by 2 * len floats: rewind to the start
+
+    if((type == FFT_FFT) || (type == FFT_SPECTROGRAM)) {
+        fft_kernel_complex(data, len, 1);
+        if(type == FFT_SPECTROGRAM) {
+            ndarray_obj_t *spectrum = ndarray_new_linear_array(len, NDARRAY_FLOAT);
+            mp_float_t *sarray = (mp_float_t *)spectrum->array;
+            for(size_t i = 0; i < len; i++) {
+                // indexed access: data has to stay at the start of the block for m_del below
+                *sarray++ = MICROPY_FLOAT_C_FUN(sqrt)(data[2*i] * data[2*i] + data[2*i+1] * data[2*i+1]);
+            }
+            m_del(mp_float_t, data, 2 * len);
+            return MP_OBJ_FROM_PTR(spectrum);
+        }
+    } else { // inverse transform
+        fft_kernel_complex(data, len, -1);
+        // TODO: numpy accepts the norm keyword argument
+        for(size_t i = 0; i < 2 * len; i++) { // 2 * len floats: real and imaginary parts are both scaled
+            *data++ /= len;
+        }
+    }
+    return MP_OBJ_FROM_PTR(out);
+}
+#else /* ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE */
 void fft_kernel(mp_float_t *real, mp_float_t *imag, size_t n, int isign) {
     size_t j, m, mmax, istep;
     mp_float_t tempr, tempi;
@@ -77,12 +202,6 @@ void fft_kernel(mp_float_t *real, mp_float_t *imag, size_t n, int isign) {
     }
 }
 
-/*
- * The following function is a helper interface to the python side.
- * It has been factored out from fft.c, so that the same argument parsing
- * routine can be called from scipy.signal.spectrogram.
- */
-
 mp_obj_t fft_fft_ifft_spectrogram(size_t n_args, mp_obj_t arg_re, mp_obj_t arg_im, uint8_t type) {
     if(!mp_obj_is_type(arg_re, &ulab_ndarray_type)) {
         mp_raise_NotImplementedError(translate("FFT is defined for ndarrays only"));
@@ -95,6 +214,7 @@ mp_obj_t fft_fft_ifft_spectrogram(size_t n_args, mp_obj_t arg_re, mp_obj_t arg_i
     ndarray_obj_t *re = MP_OBJ_TO_PTR(arg_re);
     #if ULAB_MAX_DIMS > 1
     if(re->ndim != 1) {
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(re->dtype)
         mp_raise_TypeError(translate("FFT is implemented for linear arrays only"));
     }
     #endif
@@ -122,6 +242,7 @@ mp_obj_t fft_fft_ifft_spectrogram(size_t n_args, mp_obj_t arg_re, mp_obj_t arg_i
         ndarray_obj_t *im = MP_OBJ_TO_PTR(arg_im);
         #if ULAB_MAX_DIMS > 1
         if(im->ndim != 1) {
+            COMPLEX_DTYPE_NOT_IMPLEMENTED(im->dtype)
             mp_raise_TypeError(translate("FFT is implemented for linear arrays only"));
         }
         #endif
@@ -163,3 +284,4 @@ mp_obj_t fft_fft_ifft_spectrogram(size_t n_args, mp_obj_t arg_re, mp_obj_t arg_i
         return mp_obj_new_tuple(2, tuple);
     }
 }
+#endif  /* ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE */
diff --git a/python/port/mod/ulab/numpy/fft/fft_tools.h b/python/port/mod/ulab/numpy/fft/fft_tools.h
index d3b856d07..9444232f6 100644
--- a/python/port/mod/ulab/numpy/fft/fft_tools.h
+++ b/python/port/mod/ulab/numpy/fft/fft_tools.h
@@ -17,7 +17,12 @@ enum FFT_TYPE {
     FFT_SPECTROGRAM,
 };
 
+#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+void fft_kernel_complex(mp_float_t *, size_t , int ); // the name under which fft_tools.c defines the kernel in this configuration
+mp_obj_t fft_fft_ifft_spectrogram(mp_obj_t , uint8_t );
+#else
 void fft_kernel(mp_float_t *, mp_float_t *, size_t , int );
 mp_obj_t fft_fft_ifft_spectrogram(size_t , mp_obj_t , mp_obj_t , uint8_t );
+#endif /* ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE */
 
 #endif /* _FFT_TOOLS_ */
diff --git a/python/port/mod/ulab/numpy/filter.c b/python/port/mod/ulab/numpy/filter.c
index bf2d16cd4..057cd6dc4 100644
--- a/python/port/mod/ulab/numpy/filter.c
+++ b/python/port/mod/ulab/numpy/filter.c
@@ -21,6 +21,7 @@
 
 #include "../ulab.h"
 #include "../scipy/signal/signal.h"
+#include "carray/carray_tools.h"
 #include "filter.h"
 
 #if ULAB_NUMPY_HAS_CONVOLVE
@@ -53,30 +54,77 @@ mp_obj_t filter_convolve(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_a
     }
 
     int len = len_a + len_c - 1; // convolve mode "full"
-    ndarray_obj_t *out = ndarray_new_linear_array(len, NDARRAY_FLOAT);
-    mp_float_t *outptr = (mp_float_t *)out->array;
+    int32_t off = len_c - 1;
+    uint8_t dtype = NDARRAY_FLOAT;
+
+    #if ULAB_SUPPORTS_COMPLEX
+    if((a->dtype == NDARRAY_COMPLEX) || (c->dtype == NDARRAY_COMPLEX)) {
+        dtype = NDARRAY_COMPLEX;
+    }
+    #endif
+    ndarray_obj_t *ndarray = ndarray_new_linear_array(len, dtype);
+    mp_float_t *array = (mp_float_t *)ndarray->array;
+
     uint8_t *aarray = (uint8_t *)a->array;
     uint8_t *carray = (uint8_t *)c->array;
 
-    int32_t off = len_c - 1;
     int32_t as = a->strides[ULAB_MAX_DIMS - 1] / a->itemsize;
     int32_t cs = c->strides[ULAB_MAX_DIMS - 1] / c->itemsize;
 
-    for(int32_t k=-off; k < len-off; k++) {
-        mp_float_t accum = (mp_float_t)0.0;
+
+    #if ULAB_SUPPORTS_COMPLEX
+    if(dtype == NDARRAY_COMPLEX) {
+        mp_float_t a_real, a_imag;
+        mp_float_t c_real, c_imag = MICROPY_FLOAT_CONST(0.0);
+        for(int32_t k = -off; k < len-off; k++) {
+            mp_float_t accum_real = MICROPY_FLOAT_CONST(0.0);
+            mp_float_t accum_imag = MICROPY_FLOAT_CONST(0.0);
+
+            int32_t top_n = MIN(len_c, len_a - k);
+            int32_t bot_n = MAX(-k, 0);
+
+            for(int32_t n = bot_n; n < top_n; n++) {
+                int32_t idx_c = (len_c - n - 1) * cs;
+                int32_t idx_a = (n + k) * as;
+                if(a->dtype != NDARRAY_COMPLEX) {
+                    a_real = ndarray_get_float_index(aarray, a->dtype, idx_a);
+                    a_imag = MICROPY_FLOAT_CONST(0.0);
+                } else {
+                    a_real = ndarray_get_float_index(aarray, NDARRAY_FLOAT, 2 * idx_a);
+                    a_imag = ndarray_get_float_index(aarray, NDARRAY_FLOAT, 2 * idx_a + 1);
+                }
+
+                if(c->dtype != NDARRAY_COMPLEX) {
+                    c_real = ndarray_get_float_index(carray, c->dtype, idx_c);
+                    c_imag = MICROPY_FLOAT_CONST(0.0);
+                } else {
+                    c_real = ndarray_get_float_index(carray, NDARRAY_FLOAT, 2 * idx_c);
+                    c_imag = ndarray_get_float_index(carray, NDARRAY_FLOAT, 2 * idx_c + 1);
+                }
+                accum_real += a_real * c_real - a_imag * c_imag;
+                accum_imag += a_real * c_imag + a_imag * c_real;
+            }
+            *array++ = accum_real;
+            *array++ = accum_imag;
+        }
+        return MP_OBJ_FROM_PTR(ndarray);
+    }
+    #endif
+
+    for(int32_t k = -off; k < len-off; k++) {
+        mp_float_t accum = MICROPY_FLOAT_CONST(0.0);
         int32_t top_n = MIN(len_c, len_a - k);
         int32_t bot_n = MAX(-k, 0);
-        for(int32_t n=bot_n; n < top_n; n++) {
+        for(int32_t n = bot_n; n < top_n; n++) {
             int32_t idx_c = (len_c - n - 1) * cs;
             int32_t idx_a = (n + k) * as;
             mp_float_t ai = ndarray_get_float_index(aarray, a->dtype, idx_a);
             mp_float_t ci = ndarray_get_float_index(carray, c->dtype, idx_c);
             accum += ai * ci;
         }
-        *outptr++ = accum;
+        *array++ = accum;
     }
-
-    return out;
+    return MP_OBJ_FROM_PTR(ndarray);
 }
 
 MP_DEFINE_CONST_FUN_OBJ_KW(filter_convolve_obj, 2, filter_convolve);
diff --git a/python/port/mod/ulab/numpy/io/io.c b/python/port/mod/ulab/numpy/io/io.c
new file mode 100644
index 000000000..0d0294571
--- /dev/null
+++ b/python/port/mod/ulab/numpy/io/io.c
@@ -0,0 +1,817 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2022 Zoltán Vörös
+*/
+
+#include <math.h>
+#include <string.h>
+
+#include "py/builtin.h"
+#include "py/formatfloat.h"
+#include "py/obj.h"
+#include "py/parsenum.h"
+#include "py/runtime.h"
+#include "py/stream.h"
+#include "extmod/vfs.h"
+
+#include "../../ndarray.h"
+#include "../../ulab_tools.h"
+#include "io.h"
+
+#define ULAB_IO_BUFFER_SIZE         128
+#define ULAB_IO_CLIPBOARD_SIZE      32
+#define ULAB_IO_MAX_ROWS            65535
+
+#define ULAB_IO_NULL_ENDIAN         0
+#define ULAB_IO_LITTLE_ENDIAN       1
+#define ULAB_IO_BIG_ENDIAN          2
+
+#if ULAB_NUMPY_HAS_LOAD
+static void io_read_(mp_obj_t stream, const mp_stream_p_t *stream_p, char *buffer, char *string, uint16_t len, int *error) {
+    // Reads len bytes from stream into buffer. When string is not NULL, the
+    // bytes that were read must also be equal to the first len bytes of string.
+    // If fewer bytes arrive, or the comparison fails, the stream is closed, and
+    // a RuntimeError is raised.
+    size_t received = stream_p->read(stream, buffer, len, error);
+
+    bool ok = (received == len);
+    if(ok && (string != NULL)) {
+        ok = (memcmp(buffer, string, len) == 0);
+    }
+    if(ok) {
+        return;
+    }
+    stream_p->ioctl(stream, MP_STREAM_CLOSE, 0, error);
+    mp_raise_msg(&mp_type_RuntimeError, translate("corrupted file"));
+}
+
+static mp_obj_t io_load(mp_obj_t file) {
+    // Loads an ndarray from the .npy file named by the string `file`. A header that does not
+    // match the expected layout, or a short read, closes the stream and raises
+    // RuntimeError("corrupted file"); an unsupported descr raises TypeError("wrong dtype").
+    if(!mp_obj_is_str(file)) {
+        mp_raise_TypeError(translate("wrong input type"));
+    }
+
+    int error;
+    char *buffer = m_new(char, ULAB_IO_BUFFER_SIZE);
+
+    // test for endianness: look at the first byte in memory of a known 16-bit value
+    uint16_t x = 1;
+    int8_t native_endianness = (*(uint8_t *)&x == 1) ? ULAB_IO_LITTLE_ENDIAN : ULAB_IO_BIG_ENDIAN;
+
+    mp_obj_t open_args[2] = {
+        file,
+        MP_OBJ_NEW_QSTR(MP_QSTR_rb)
+    };
+
+    mp_obj_t stream = mp_builtin_open_obj.fun.kw(2, open_args, (mp_map_t *)&mp_const_empty_map);
+    const mp_stream_p_t *stream_p = mp_get_stream(stream);
+
+    // read header
+    // magic string
+    io_read_(stream, stream_p, buffer, "\x93NUMPY", 6, &error);
+    // simply discard the version number
+    io_read_(stream, stream_p, buffer, NULL, 2, &error);
+    // header length, represented as a little endian uint16 (0x76, 0x00)
+    io_read_(stream, stream_p, buffer, NULL, 2, &error);
+
+    // assemble from unsigned bytes: a plain char may be signed, and would then sign-extend
+    uint16_t header_length = (uint16_t)(((uint8_t)buffer[1] << 8) | (uint8_t)buffer[0]);
+    // header bytes consumed by the fixed-size reads that follow: 11 + 1 + 2 + 37
+    uint16_t header_consumed = 51;
+
+    // beginning of the dictionary describing the array
+    io_read_(stream, stream_p, buffer, "{'descr': '", 11, &error);
+    uint8_t dtype;
+
+    io_read_(stream, stream_p, buffer, NULL, 1, &error);
+    uint8_t endianness = ULAB_IO_NULL_ENDIAN;
+    if(*buffer == '<') {
+        endianness = ULAB_IO_LITTLE_ENDIAN;
+    } else if(*buffer == '>') {
+        endianness = ULAB_IO_BIG_ENDIAN;
+    }
+
+    io_read_(stream, stream_p, buffer, NULL, 2, &error);
+    if(memcmp(buffer, "u1", 2) == 0) {
+        dtype = NDARRAY_UINT8;
+    } else if(memcmp(buffer, "i1", 2) == 0) {
+        dtype = NDARRAY_INT8;
+    } else if(memcmp(buffer, "u2", 2) == 0) {
+        dtype = NDARRAY_UINT16;
+    } else if(memcmp(buffer, "i2", 2) == 0) {
+        dtype = NDARRAY_INT16;
+    }
+    #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
+    else if(memcmp(buffer, "f4", 2) == 0) {
+        dtype = NDARRAY_FLOAT;
+    }
+    #else
+    else if(memcmp(buffer, "f8", 2) == 0) {
+        dtype = NDARRAY_FLOAT;
+    }
+    #endif
+    #if ULAB_SUPPORTS_COMPLEX
+    #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
+    else if(memcmp(buffer, "c8", 2) == 0) {
+        dtype = NDARRAY_COMPLEX;
+    }
+    #else
+    else if(memcmp(buffer, "c1", 2) == 0) {
+        // "c16" has a third character, which has not been read yet: fetch and verify it
+        io_read_(stream, stream_p, buffer, "6", 1, &error);
+        header_consumed++;
+        dtype = NDARRAY_COMPLEX;
+    }
+    #endif
+    #endif /* ULAB_SUPPORTS_COMPLEX */
+    else {
+        stream_p->ioctl(stream, MP_STREAM_CLOSE, 0, &error);
+        mp_raise_TypeError(translate("wrong dtype"));
+    }
+
+    io_read_(stream, stream_p, buffer, "', 'fortran_order': False, 'shape': (", 37, &error);
+    // a header that claims to be shorter than what has already been consumed cannot be valid
+    if(header_length < header_consumed) {
+        stream_p->ioctl(stream, MP_STREAM_CLOSE, 0, &error);
+        mp_raise_msg(&mp_type_RuntimeError, translate("corrupted file"));
+    }
+
+    size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+
+    uint16_t bytes_to_read = MIN(ULAB_IO_BUFFER_SIZE, header_length - header_consumed);
+    // bytes_to_read is 128 at most. This should be enough to contain a
+    // maximum of 4 size_t numbers plus the delimiters
+    io_read_(stream, stream_p, buffer, NULL, bytes_to_read, &error);
+    uint8_t ndim = 0;
+
+    // find the number of dimensions by counting the commas; the scan stays inside the bytes just read
+    bool closed = false;
+    for(uint16_t i = 0; (i < bytes_to_read) && !closed; i++) {
+        if(buffer[i] == ',') {
+            ndim++;
+            closed = (i + 1 < bytes_to_read) && (buffer[i + 1] == ')');
+        } else if((buffer[i] == ')') && (ndim > 0)) {
+            ndim++;
+            closed = true;
+        }
+    }
+    if(!closed || (ndim > ULAB_MAX_DIMS)) { // unterminated tuple, or more entries than shape can hold
+        stream_p->ioctl(stream, MP_STREAM_CLOSE, 0, &error);
+        mp_raise_msg(&mp_type_RuntimeError, translate("corrupted file"));
+    }
+
+    char *needle = buffer;
+    for(uint8_t i = 0; i < ndim; i++) {
+        size_t number = 0;
+        // trivial number parsing here
+        while(1) {
+            if((*needle == ' ') || (*needle == '\t')) {
+                needle++;
+            }
+            if((*needle > 47) && (*needle < 58)) {
+                number = number * 10 + (*needle - 48);
+            } else if((*needle == ',') || (*needle == ')')) {
+                break;
+            }
+            else {
+                stream_p->ioctl(stream, MP_STREAM_CLOSE, 0, &error);
+                mp_raise_msg(&mp_type_RuntimeError, translate("corrupted file"));
+            }
+            needle++;
+        }
+        needle++;
+        shape[ULAB_MAX_DIMS - ndim + i] = number;
+    }
+
+    // strip the rest of the header, at most one buffer's worth per read, so buffer cannot overflow
+    uint16_t remaining = header_length - header_consumed - bytes_to_read;
+    while(remaining > 0) {
+        uint16_t chunk = MIN(ULAB_IO_BUFFER_SIZE, remaining);
+        io_read_(stream, stream_p, buffer, NULL, chunk, &error);
+        remaining -= chunk;
+    }
+
+    ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(ndim, shape, dtype);
+    char *array = (char *)ndarray->array;
+
+    size_t read = stream_p->read(stream, array, ndarray->len * ndarray->itemsize, &error);
+    if(read != ndarray->len * ndarray->itemsize) {
+        stream_p->ioctl(stream, MP_STREAM_CLOSE, 0, &error);
+        mp_raise_msg(&mp_type_RuntimeError, translate("corrupted file"));
+    }
+
+    stream_p->ioctl(stream, MP_STREAM_CLOSE, 0, &error);
+    m_del(char, buffer, ULAB_IO_BUFFER_SIZE);
+
+    // swap the bytes, if necessary
+    if((native_endianness != endianness) && (dtype != NDARRAY_UINT8) && (dtype != NDARRAY_INT8)) {
+        uint8_t sz = ndarray->itemsize;
+        size_t count = ndarray->len;
+        #if ULAB_SUPPORTS_COMPLEX
+        if(dtype == NDARRAY_COMPLEX) {
+            // work with the floating point real and imaginary parts
+            sz /= 2;
+            count *= 2;
+        }
+        #endif
+        for(size_t i = 0; i < count; i++, array += sz) { // reverse the bytes of each value in place
+            for(uint8_t lo = 0, hi = sz - 1; lo < hi; lo++, hi--) {
+                char tmp = array[lo];
+                array[lo] = array[hi];
+                array[hi] = tmp;
+            }
+        }
+    }
+
+    m_del(size_t, shape, ULAB_MAX_DIMS);
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(io_load_obj, io_load);
+#endif /* ULAB_NUMPY_HAS_LOAD */
+
+#if ULAB_NUMPY_HAS_LOADTXT
+static void io_assign_value(const char *clipboard, uint8_t len, ndarray_obj_t *ndarray, size_t *idx, uint8_t dtype) {
+    // parses len characters of clipboard, and stores the number at position *idx, which is then advanced
+    mp_obj_t parsed = mp_parse_num_decimal(clipboard, len, false, false, NULL);
+    if(dtype != NDARRAY_FLOAT) { // every dtype other than float receives the value rounded to an integer
+        parsed = mp_obj_new_int((int32_t)MICROPY_FLOAT_C_FUN(round)(mp_obj_get_float(parsed)));
+    }
+    ndarray_set_value(dtype, ndarray->array, (*idx)++, parsed);
+}
+
+static mp_obj_t io_loadtxt(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_delimiter, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_comments, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_max_rows, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = -1 } },
+        { MP_QSTR_usecols, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = NDARRAY_FLOAT } },
+        { MP_QSTR_skiprows, MP_ARG_KW_ONLY | MP_ARG_INT, { .u_int = 0 } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    mp_obj_t open_args[2] = {
+        args[0].u_obj,
+        MP_OBJ_NEW_QSTR(MP_QSTR_r)
+    };
+
+    mp_obj_t stream = mp_builtin_open_obj.fun.kw(2, open_args, (mp_map_t *)&mp_const_empty_map);
+    const mp_stream_p_t *stream_p = mp_get_stream(stream);
+
+    char *buffer = m_new(char, ULAB_IO_BUFFER_SIZE);
+    int error;
+
+    char delimiter = ' ';
+    if(args[1].u_obj != mp_const_none) {
+        size_t _len;
+        char *_delimiter = m_new(char, 8);
+        _delimiter = (char *)mp_obj_str_get_data(args[1].u_obj, &_len);
+        delimiter = _delimiter[0];
+    }
+
+    char comment_char = '#';
+    if(args[2].u_obj != mp_const_none) {
+        size_t _len;
+        char *_comment_char = m_new(char, 8);
+        _comment_char = (char *)mp_obj_str_get_data(args[2].u_obj, &_len);
+        comment_char = _comment_char[0];
+    }
+
+    uint16_t skiprows = args[6].u_int;
+    uint16_t max_rows = ULAB_IO_MAX_ROWS;
+    if((args[3].u_int > 0) && (args[3].u_int < ULAB_IO_MAX_ROWS)) {
+        max_rows = args[3].u_int + skiprows;
+    }
+
+    uint16_t *cols = NULL;
+    uint8_t used_columns = 0;
+    if(args[4].u_obj != mp_const_none) {
+        if(mp_obj_is_int(args[4].u_obj)) {
+            used_columns = 1;
+            cols = m_new(uint16_t, used_columns);
+            cols[0] = (uint16_t)mp_obj_get_int(args[4].u_obj);
+        } else {
+            #if ULAB_MAX_DIMS == 1
+            mp_raise_ValueError(translate("usecols keyword must be specified"));
+            #else
+            // assume that the argument is an iterable
+            used_columns = (uint16_t)mp_obj_get_int(mp_obj_len(args[4].u_obj));
+            cols = m_new(uint16_t, used_columns);
+            mp_obj_iter_buf_t iter_buf;
+            mp_obj_t item, iterable = mp_getiter(args[4].u_obj, &iter_buf);
+            while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
+                *cols++ = (uint16_t)mp_obj_get_int(item);
+            }
+            cols -= used_columns;
+            #endif
+        }
+    }
+
+    uint8_t dtype = args[5].u_int;
+
+    // count the columns and rows
+    // we actually count only the rows and the items, and assume that
+    // the number of columns can be gotten by means of a simple division,
+    // i.e., that each row has the same number of columns
+    char *offset;
+    uint16_t rows = 0, items = 0, all_rows = 0;
+    uint8_t read;
+    uint8_t len = 0;
+
+    do {
+        read = (uint8_t)stream_p->read(stream, buffer, ULAB_IO_BUFFER_SIZE - 1, &error);
+        buffer[read] = '\0';
+        offset = buffer;
+        while(*offset != '\0') {
+            if(*offset == comment_char) {
+                // clear the line till the end, or the buffer's end
+                while((*offset != '\0')) {
+                    offset++;
+                    if(*offset == '\n') {
+                        offset++;
+                        all_rows++;
+                        break;
+                    }
+                }
+            }
+
+            // catch whitespaces here: if these are not on a comment line, then they delimit a number
+            if(*offset == '\n') {
+                all_rows++;
+                if(all_rows > skiprows) {
+                    rows++;
+                    items++;
+                    len = 0;
+                }
+                if(all_rows == max_rows) {
+                    break;
+                }
+            }
+
+            if((*offset == ' ') || (*offset == '\t') || (*offset == '\v') ||
+                (*offset == '\f') || (*offset == '\r') || (*offset == delimiter)) {
+                offset++;
+                while((*offset == ' ') || (*offset == '\t') || (*offset == '\v') || (*offset == '\f') || (*offset == '\r')) {
+                    offset++;
+                }
+                if(len > 0) {
+                    if(all_rows >= skiprows) {
+                        items++;
+                    }
+                    len = 0;
+                }
+            } else {
+                offset++;
+                len++;
+            }
+        }
+    } while((read > 0) && (all_rows < max_rows));
+
+    if(rows == 0) {
+        mp_raise_ValueError(translate("empty file"));
+    }
+    uint16_t columns = items / rows;
+
+    if(columns < used_columns) {
+        mp_raise_ValueError(translate("usecols is too high"));
+    }
+
+    size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+
+    #if ULAB_MAX_DIMS == 1
+    shape[0] = rows;
+    ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(1, shape, dtype);
+    #else
+    if(args[4].u_obj == mp_const_none) {
+        shape[ULAB_MAX_DIMS - 1] = columns;
+    } else {
+        shape[ULAB_MAX_DIMS - 1] = used_columns;
+    }
+    shape[ULAB_MAX_DIMS - 2] = rows;
+    ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(2, shape, dtype);
+    #endif
+
+    struct mp_stream_seek_t seek_s;
+    seek_s.offset = 0;
+    seek_s.whence = MP_SEEK_SET;
+    stream_p->ioctl(stream, MP_STREAM_SEEK, (mp_uint_t)(uintptr_t)&seek_s, &error);
+
+    char *clipboard = m_new(char, ULAB_IO_CLIPBOARD_SIZE);
+    char *clipboard_origin = clipboard;
+
+    rows = 0;
+    columns = 0;
+    len = 0;
+
+    size_t idx = 0;
+    do {
+        read = stream_p->read(stream, buffer, ULAB_IO_BUFFER_SIZE - 1, &error);
+        buffer[read] = '\0';
+        offset = buffer;
+
+        while(*offset != '\0') {
+            if(*offset == comment_char) {
+                // clear the line till the end, or the buffer's end
+                while((*offset != '\0')) {
+                    offset++;
+                    if(*offset == '\n') {
+                        rows++;
+                        offset++;
+                        break;
+                    }
+                }
+            }
+
+            if(rows == max_rows) {
+                break;
+            }
+
+            if((*offset == ' ') || (*offset == '\t') || (*offset == '\v') ||
+                (*offset == '\f') || (*offset == '\r') || (*offset == '\n') || (*offset == delimiter)) {
+                offset++;
+                while((*offset == ' ') || (*offset == '\t') || (*offset == '\v') ||
+                    (*offset == '\f') || (*offset == '\r') || (*offset == '\n')) {
+                    offset++;
+                }
+                if(len > 0) {
+                    clipboard = clipboard_origin;
+                    if(rows >= skiprows) {
+                        #if ULAB_MAX_DIMS == 1
+                        if(columns == cols[0]) {
+                            io_assign_value(clipboard, len, ndarray, &idx, dtype);
+                        }
+                        #else
+                        if(args[4].u_obj == mp_const_none) {
+                            io_assign_value(clipboard, len, ndarray, &idx, dtype);
+                        } else {
+                            for(uint8_t c = 0; c < used_columns; c++) {
+                                if(columns == cols[c]) {
+                                    io_assign_value(clipboard, len, ndarray, &idx, dtype);
+                                    break;
+                                }
+                            }
+                        }
+                        #endif
+                    }
+                    columns++;
+                    len = 0;
+
+                    if(offset[-1] == '\n') {
+                        columns = 0;
+                        rows++;
+                    }
+                }
+            } else {
+                *clipboard++ = *offset++;
+                len++;
+            }
+        }
+    } while((read > 0) && (rows < max_rows));
+
+    stream_p->ioctl(stream, MP_STREAM_CLOSE, 0, &error);
+
+    m_del(size_t, shape, ULAB_MAX_DIMS);
+    m_del(char, buffer, ULAB_IO_BUFFER_SIZE);
+    m_del(char, clipboard, ULAB_IO_CLIPBOARD_SIZE);
+    m_del(uint16_t, cols, used_columns);
+
+    return MP_OBJ_FROM_PTR(ndarray);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(io_loadtxt_obj, 1, io_loadtxt);
+#endif /* ULAB_NUMPY_HAS_LOADTXT */
+
+
+#if ULAB_NUMPY_HAS_SAVE
+static uint8_t io_sprintf(char *buffer, const char *comma, size_t x) {
+    // Minimal sprintf replacement for size_t values, for systems where
+    // sprintf is not available: writes the decimal digits of x at buffer,
+    // appends the string comma, and returns the number of characters
+    // written. No terminating '\0' is stored.
+    uint8_t digits = 1;
+
+    // count the decimal digits; dividing (instead of multiplying a power of
+    // ten) cannot overflow, and x == 0 still yields the single digit "0"
+    for(size_t rest = x / 10; rest > 0; rest /= 10) {
+        digits++;
+    }
+
+    // fill the digits in from the least significant one backwards
+    char *buf = buffer + digits;
+    do {
+        *--buf = (char)('0' + (x % 10));
+        x /= 10;
+    } while(x > 0);
+
+    // append the trailing separator right after the digits
+    uint8_t offset = digits;
+    buf = buffer + digits;
+    while(*comma != '\0') {
+        *buf++ = *comma++;
+        offset++;
+    }
+    return offset;
+}
+
+static mp_obj_t io_save(mp_obj_t file, mp_obj_t ndarray_) {
+    if(!mp_obj_is_str(file) || !mp_obj_is_type(ndarray_, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("wrong input type"));
+    }
+    // Saves ndarray_ into the file named by file: a ULAB_IO_BUFFER_SIZE-byte header first, then the raw items. Returns None.
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(ndarray_);
+    int error;
+    char *buffer = m_new(char, ULAB_IO_BUFFER_SIZE);
+    uint8_t offset = 0;
+
+    // test for endianness: on a little-endian host the lowest-addressed byte of x holds the 1
+    uint16_t x = 1;
+    int8_t native_endianness = *((uint8_t *)&x) == 1 ? '<' : '>';
+
+    mp_obj_t open_args[2] = {
+        file,
+        MP_OBJ_NEW_QSTR(MP_QSTR_wb)
+    };
+
+    mp_obj_t stream = mp_builtin_open_obj.fun.kw(2, open_args, (mp_map_t *)&mp_const_empty_map);
+    const mp_stream_p_t *stream_p = mp_get_stream(stream);
+
+    // write header;
+    // magic string + header length, which is always 128 - 10 = 118, represented as a little endian uint16 (0x76, 0x00)
+    // + beginning of the dictionary describing the array
+    memcpy(buffer, "\x93NUMPY\x01\x00\x76\x00{'descr': '", 21);
+    offset += 21;
+
+    buffer[offset] = native_endianness;
+    if((ndarray->dtype == NDARRAY_UINT8) || (ndarray->dtype == NDARRAY_INT8)) {
+        // for single-byte data, the endianness doesn't matter
+        buffer[offset] = '|';
+    }
+    offset++;
+    switch(ndarray->dtype) {
+        case NDARRAY_UINT8:
+            memcpy(buffer+offset, "u1", 2);
+            break;
+        case NDARRAY_INT8:
+            memcpy(buffer+offset, "i1", 2);
+            break;
+        case NDARRAY_UINT16:
+            memcpy(buffer+offset, "u2", 2);
+            break;
+        case NDARRAY_INT16:
+            memcpy(buffer+offset, "i2", 2);
+            break;
+        case NDARRAY_FLOAT:
+            #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
+            memcpy(buffer+offset, "f4", 2);
+            #else
+            memcpy(buffer+offset, "f8", 2);
+            #endif
+            break;
+        #if ULAB_SUPPORTS_COMPLEX
+        case NDARRAY_COMPLEX:
+            #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
+            memcpy(buffer+offset, "c8", 2);
+            #else
+            memcpy(buffer+offset, "c16", 3);
+            offset++; // "c16" is one character longer than the two-character codes
+            #endif
+            break;
+        #endif
+    }
+
+    offset += 2; // skip the two-character type code written in the switch above
+    memcpy(buffer+offset, "', 'fortran_order': False, 'shape': (", 37);
+    offset += 37;
+
+    if(ndarray->ndim == 1) {
+        offset += io_sprintf(buffer+offset, ",\0", ndarray->shape[ULAB_MAX_DIMS - 1]);
+    } else {
+        for(uint8_t i = ndarray->ndim; i > 1; i--) {
+            offset += io_sprintf(buffer+offset, ", \0", ndarray->shape[ULAB_MAX_DIMS - i]);
+        }
+        offset += io_sprintf(buffer+offset, "\0", ndarray->shape[ULAB_MAX_DIMS - 1]);
+    }
+    memcpy(buffer+offset, "), }", 4);
+    offset += 4;
+    // pad with space till the very end; the last byte of the header is a newline
+    memset(buffer+offset, 32, ULAB_IO_BUFFER_SIZE - offset - 1);
+    buffer[ULAB_IO_BUFFER_SIZE - 1] = '\n';
+    stream_p->write(stream, buffer, ULAB_IO_BUFFER_SIZE, &error);
+
+    // write the array data: items are copied into buffer one by one, and buffer is written out whenever it is full
+    uint8_t sz = ndarray->itemsize;
+    offset = 0;
+
+    uint8_t *array = (uint8_t *)ndarray->array;
+
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                do {
+                    memcpy(buffer+offset, array, sz);
+                    offset += sz;
+                    if(offset == ULAB_IO_BUFFER_SIZE) {
+                        stream_p->write(stream, buffer, offset, &error);
+                        offset = 0;
+                    }
+                    array += ndarray->strides[ULAB_MAX_DIMS - 1];
+                    l++;
+                } while(l <  ndarray->shape[ULAB_MAX_DIMS - 1]);
+            #if ULAB_MAX_DIMS > 1
+                array -= ndarray->strides[ULAB_MAX_DIMS - 1] * ndarray->shape[ULAB_MAX_DIMS-1];
+                array += ndarray->strides[ULAB_MAX_DIMS - 2];
+                k++;
+            } while(k <  ndarray->shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 2
+            array -= ndarray->strides[ULAB_MAX_DIMS - 2] * ndarray->shape[ULAB_MAX_DIMS-2];
+            array += ndarray->strides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j <  ndarray->shape[ULAB_MAX_DIMS - 3]);
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        array -= ndarray->strides[ULAB_MAX_DIMS - 3] * ndarray->shape[ULAB_MAX_DIMS-3];
+        array += ndarray->strides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i <  ndarray->shape[ULAB_MAX_DIMS - 4]);
+    #endif
+    // flush the items still waiting in the partially filled buffer, then close the file
+    stream_p->write(stream, buffer, offset, &error);
+    stream_p->ioctl(stream, MP_STREAM_CLOSE, 0, &error);
+
+    m_del(char, buffer, ULAB_IO_BUFFER_SIZE);
+    return mp_const_none;
+}
+
+MP_DEFINE_CONST_FUN_OBJ_2(io_save_obj, io_save);
+#endif /* ULAB_NUMPY_HAS_SAVE */
+
+#if ULAB_NUMPY_HAS_SAVETXT
+static int8_t io_format_float(ndarray_obj_t *ndarray, mp_float_t (*func)(void *), uint8_t *array, char *buffer, char *delimiter) {
+    // own implementation of float formatting for platforms that don't have sprintf:
+    int8_t offset = 0;
+    // the item at array is read through func, formatted into buffer and followed by delimiter; the end offset is returned
+    #if MICROPY_FLOAT_IMPL == MICROPY_FLOAT_IMPL_FLOAT
+        #if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_C
+        const int precision = 6;
+        #else
+        const int precision = 7;
+        #endif
+    #else
+        const int precision = 16;
+    #endif
+    // NOTE(review): the offset computed in the complex branch below is overwritten by the unconditional assignment after it
+    #if ULAB_SUPPORTS_COMPLEX
+    if(ndarray->dtype == NDARRAY_COMPLEX) {
+        mp_float_t real = func(array);
+        mp_float_t imag = func(array + ndarray->itemsize / 2); // second half of the item
+        offset = mp_format_float(real, buffer, ULAB_IO_BUFFER_SIZE, 'f', precision, 'j');
+        if(imag >= MICROPY_FLOAT_CONST(0.0)) {
+            buffer[offset++] = '+';
+        } else {
+            buffer[offset++] = '-';
+        }
+        offset += mp_format_float(-imag, &buffer[offset], ULAB_IO_BUFFER_SIZE, 'f', precision, 'j'); // NOTE(review): -imag is formatted after an explicit sign was stored — confirm
+    }
+    #endif
+    offset = (uint8_t)mp_format_float(func(array), buffer, ULAB_IO_BUFFER_SIZE, 'f', precision, '\0');
+
+    #if ULAB_SUPPORTS_COMPLEX
+    if(ndarray->dtype != NDARRAY_COMPLEX) {
+        // complexes end with a 'j', floats with a '\0', so we have to wind back by one character
+        offset--; // NOTE(review): only compiled in when ULAB_SUPPORTS_COMPLEX is set — confirm against mp_format_float's return value
+    }
+    #endif
+    // copy the delimiter right after the number; NOTE(review): no check against ULAB_IO_BUFFER_SIZE, and offset is an int8_t
+    while(*delimiter != '\0') {
+        buffer[offset++] = *delimiter++;
+    }
+
+    return offset;
+}
+
+static mp_obj_t io_savetxt(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_delimiter, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_header, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_footer, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_comments, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+    // Writes the ndarray (2nd argument) as text into the file named by the 1st argument; returns None.
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+    // args[0] has to be a string (it is handed to open() below), args[1] an ndarray; otherwise a TypeError is raised
+    if(!mp_obj_is_str(args[0].u_obj) || !mp_obj_is_type(args[1].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("wrong input type"));
+    }
+
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[1].u_obj);
+    // arrays with more than two dimensions are rejected with a ValueError
+    #if ULAB_MAX_DIMS > 2
+    if(ndarray->ndim > 2) {
+        mp_raise_ValueError(translate("array has too many dimensions"));
+    }
+    #endif
+
+    mp_obj_t open_args[2] = {
+        args[0].u_obj,
+        MP_OBJ_NEW_QSTR(MP_QSTR_w)
+    };
+
+    mp_obj_t stream = mp_builtin_open_obj.fun.kw(2, open_args, (mp_map_t *)&mp_const_empty_map);
+    const mp_stream_p_t *stream_p = mp_get_stream(stream);
+
+    char *buffer = m_new(char, ULAB_IO_BUFFER_SIZE);
+    int error;
+    // optional header line: the comments prefix ("# " if comments is not a string), the header text, and a newline
+    if(mp_obj_is_str(args[3].u_obj)) {
+        size_t _len;
+        if(mp_obj_is_str(args[5].u_obj)) {
+            const char *comments = mp_obj_str_get_data(args[5].u_obj, &_len);
+            stream_p->write(stream, comments, _len, &error);
+        } else {
+            stream_p->write(stream, "# ", 2, &error);
+        }
+        const char *header = mp_obj_str_get_data(args[3].u_obj, &_len);
+        stream_p->write(stream, header, _len, &error);
+        stream_p->write(stream, "\n", 1, &error);
+    }
+
+    uint8_t *array = (uint8_t *)ndarray->array;
+    mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
+    char *delimiter = m_new(char, 8);
+    // items are separated by "\n" in 1-D arrays, otherwise by the delimiter keyword (a single space if it is None)
+    if(ndarray->ndim == 1) {
+        delimiter[0] = '\n';
+        delimiter[1] = '\0';
+    } else if(args[2].u_obj == mp_const_none) {
+        delimiter[0] = ' ';
+        delimiter[1] = '\0';
+    } else {
+        size_t delimiter_len;
+        delimiter = (char *)mp_obj_str_get_data(args[2].u_obj, &delimiter_len); // NOTE(review): the 8 chars allocated above are no longer referenced after this
+    }
+    // write the items one by one; the last item of each row is followed by "\n" instead of the delimiter
+    #if ULAB_MAX_DIMS > 1
+    size_t k = 0;
+    do {
+    #endif
+        size_t l = 0;
+        do {
+            int8_t chars = io_format_float(ndarray, func, array, buffer, l == ndarray->shape[ULAB_MAX_DIMS - 1] - 1 ? "\n" : delimiter);
+            if(chars > 0) {
+                stream_p->write(stream, buffer, chars, &error);
+            }
+            array += ndarray->strides[ULAB_MAX_DIMS - 1];
+            l++;
+        } while(l < ndarray->shape[ULAB_MAX_DIMS - 1]);
+    #if ULAB_MAX_DIMS > 1
+        array -= ndarray->strides[ULAB_MAX_DIMS - 1] * ndarray->shape[ULAB_MAX_DIMS-1];
+        array += ndarray->strides[ULAB_MAX_DIMS - 2];
+        k++;
+    } while(k < ndarray->shape[ULAB_MAX_DIMS - 2]);
+    #endif
+    // optional footer line, built the same way as the header line
+    if(mp_obj_is_str(args[4].u_obj)) {
+        size_t _len;
+        if(mp_obj_is_str(args[5].u_obj)) {
+            const char *comments = mp_obj_str_get_data(args[5].u_obj, &_len);
+            stream_p->write(stream, comments, _len, &error);
+        } else {
+            stream_p->write(stream, "# ", 2, &error);
+        }
+        const char *footer = mp_obj_str_get_data(args[4].u_obj, &_len);
+        stream_p->write(stream, footer, _len, &error);
+        stream_p->write(stream, "\n", 1, &error);
+    }
+
+    stream_p->ioctl(stream, MP_STREAM_CLOSE, 0, &error);
+    // NOTE(review): buffer is not handed back with m_del before returning, and the values left in error are never inspected — confirm
+    return mp_const_none;
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(io_savetxt_obj, 2, io_savetxt);
+#endif /* ULAB_NUMPY_HAS_SAVETXT */
\ No newline at end of file
diff --git a/python/port/mod/ulab/numpy/io/io.h b/python/port/mod/ulab/numpy/io/io.h
new file mode 100644
index 000000000..d0141e78d
--- /dev/null
+++ b/python/port/mod/ulab/numpy/io/io.h
@@ -0,0 +1,19 @@
+/*
+ * This file is part of the micropython-ulab project,
+ *
+ * https://github.com/v923z/micropython-ulab
+ *
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2022 Zoltán Vörös
+*/
+
+#ifndef _ULAB_IO_
+#define _ULAB_IO_
+
+MP_DECLARE_CONST_FUN_OBJ_1(io_load_obj); // NOTE(review): the matching definition is not in view here — presumably guarded by a feature switch like the others; confirm
+MP_DECLARE_CONST_FUN_OBJ_KW(io_loadtxt_obj); // function object for io_loadtxt; its definition sits inside the ULAB_NUMPY_HAS_LOADTXT guard
+MP_DECLARE_CONST_FUN_OBJ_2(io_save_obj); // function object for io_save; its definition sits inside the ULAB_NUMPY_HAS_SAVE guard
+MP_DECLARE_CONST_FUN_OBJ_KW(io_savetxt_obj); // function object for io_savetxt; its definition sits inside the ULAB_NUMPY_HAS_SAVETXT guard
+
+#endif
\ No newline at end of file
diff --git a/python/port/mod/ulab/numpy/linalg/linalg.c b/python/port/mod/ulab/numpy/linalg/linalg.c
index 596280fea..478503cf6 100644
--- a/python/port/mod/ulab/numpy/linalg/linalg.c
+++ b/python/port/mod/ulab/numpy/linalg/linalg.c
@@ -22,6 +22,7 @@
 
 #include "../../ulab.h"
 #include "../../ulab_tools.h"
+#include "../carray/carray_tools.h"
 #include "linalg.h"
 
 #if ULAB_NUMPY_HAS_LINALG_MODULE
@@ -44,6 +45,7 @@
 
 static mp_obj_t linalg_cholesky(mp_obj_t oin) {
     ndarray_obj_t *ndarray = tools_object_is_square(oin);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
     ndarray_obj_t *L = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, ndarray->shape[ULAB_MAX_DIMS - 1], ndarray->shape[ULAB_MAX_DIMS - 1]), NDARRAY_FLOAT);
     mp_float_t *Larray = (mp_float_t *)L->array;
 
@@ -110,6 +112,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(linalg_cholesky_obj, linalg_cholesky);
 
 static mp_obj_t linalg_det(mp_obj_t oin) {
     ndarray_obj_t *ndarray = tools_object_is_square(oin);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
     uint8_t *array = (uint8_t *)ndarray->array;
     size_t N = ndarray->shape[ULAB_MAX_DIMS - 1];
     mp_float_t *tmp = m_new(mp_float_t, N * N);
@@ -182,6 +185,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(linalg_det_obj, linalg_det);
 
 static mp_obj_t linalg_eig(mp_obj_t oin) {
     ndarray_obj_t *in = tools_object_is_square(oin);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(in->dtype)
     uint8_t *iarray = (uint8_t *)in->array;
     size_t S = in->shape[ULAB_MAX_DIMS - 1];
     mp_float_t *array = m_new(mp_float_t, S*S);
@@ -243,6 +247,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(linalg_eig_obj, linalg_eig);
 //|
 static mp_obj_t linalg_inv(mp_obj_t o_in) {
     ndarray_obj_t *ndarray = tools_object_is_square(o_in);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
     uint8_t *array = (uint8_t *)ndarray->array;
     size_t N = ndarray->shape[ULAB_MAX_DIMS - 1];
     ndarray_obj_t *inverted = ndarray_new_dense_ndarray(2, ndarray_shape_vector(0, 0, N, N), NDARRAY_FLOAT);
@@ -305,6 +310,7 @@ static mp_obj_t linalg_norm(size_t n_args, const mp_obj_t *pos_args, mp_map_t *k
         return mp_obj_new_float(MICROPY_FLOAT_C_FUN(sqrt)(dot * (count - 1)));
     } else if(mp_obj_is_type(x, &ulab_ndarray_type)) {
         ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(x);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
         uint8_t *array = (uint8_t *)ndarray->array;
         // always get a float, so that we don't have to resolve the dtype later
         mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
@@ -429,22 +435,22 @@ static mp_obj_t linalg_qr(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_
             // [[c  s],
             //  [s -c]]
             if(MICROPY_FLOAT_C_FUN(fabs)(rarray[i * n + j]) < LINALG_EPSILON) { // r[i, j]
-                c = (rarray[(i - 1) * n + j] >= 0.0) ? 1.0 : -1.0; // r[i-1, j]
+                c = (rarray[(i - 1) * n + j] >= MICROPY_FLOAT_CONST(0.0)) ? MICROPY_FLOAT_CONST(1.0) : MICROPY_FLOAT_CONST(-1.0); // r[i-1, j]
                 s = 0.0;
             } else if(MICROPY_FLOAT_C_FUN(fabs)(rarray[(i - 1) * n + j]) < LINALG_EPSILON) { // r[i-1, j]
                 c = 0.0;
-                s = (rarray[i * n + j] >= 0.0) ? -1.0 : 1.0; // r[i, j]
+                s = (rarray[i * n + j] >= MICROPY_FLOAT_CONST(0.0)) ? MICROPY_FLOAT_CONST(-1.0) : MICROPY_FLOAT_CONST(1.0); // r[i, j]
             } else {
                 mp_float_t t, u;
                 if(MICROPY_FLOAT_C_FUN(fabs)(rarray[(i - 1) * n + j]) > MICROPY_FLOAT_C_FUN(fabs)(rarray[i * n + j])) { // r[i-1, j], r[i, j]
                     t = rarray[i * n + j] / rarray[(i - 1) * n + j]; // r[i, j]/r[i-1, j]
                     u = MICROPY_FLOAT_C_FUN(sqrt)(1 + t * t);
-                    c = -1.0 / u;
+                    c = MICROPY_FLOAT_CONST(-1.0) / u;
                     s = c * t;
                 } else {
                     t = rarray[(i - 1) * n + j] / rarray[i * n + j]; // r[i-1, j]/r[i, j]
                     u = MICROPY_FLOAT_C_FUN(sqrt)(1 + t * t);
-                    s = -1.0 / u;
+                    s = MICROPY_FLOAT_CONST(-1.0) / u;
                     c = s * t;
                 }
             }
diff --git a/python/port/mod/ulab/numpy/linalg/linalg_tools.c b/python/port/mod/ulab/numpy/linalg/linalg_tools.c
index 5e03a50ab..7ae97d211 100644
--- a/python/port/mod/ulab/numpy/linalg/linalg_tools.c
+++ b/python/port/mod/ulab/numpy/linalg/linalg_tools.c
@@ -14,8 +14,8 @@
 
 #include "linalg_tools.h"
 
-/* 
- * The following function inverts a matrix, whose entries are given in the input array 
+/*
+ * The following function inverts a matrix, whose entries are given in the input array
  * The function has no dependencies beyond micropython itself (for the definition of mp_float_t),
  * and can be used independent of ulab.
  */
@@ -26,10 +26,9 @@ bool linalg_invert_matrix(mp_float_t *data, size_t N) {
 
     // initially, this is the unit matrix: the contents of this matrix is what
     // will be returned after all the transformations
-    mp_float_t *unit = m_new(mp_float_t, N*N);
+    mp_float_t *unit = m_new0(mp_float_t, N*N);
     mp_float_t elem = 1.0;
-    // initialise the unit matrix
-    memset(unit, 0, sizeof(mp_float_t)*N*N);
+
     for(size_t m=0; m < N; m++) {
         memcpy(&unit[m * (N+1)], &elem, sizeof(mp_float_t));
     }
@@ -78,9 +77,9 @@ bool linalg_invert_matrix(mp_float_t *data, size_t N) {
     return true;
 }
 
-/* 
- * The following function calculates the eigenvalues and eigenvectors of a symmetric 
- * real matrix, whose entries are given in the input array. 
+/*
+ * The following function calculates the eigenvalues and eigenvectors of a symmetric
+ * real matrix, whose entries are given in the input array.
  * The function has no dependencies beyond micropython itself (for the definition of mp_float_t),
  * and can be used independent of ulab.
  */
@@ -166,6 +165,6 @@ size_t linalg_jacobi_rotations(mp_float_t *array, mp_float_t *eigvectors, size_t
             eigvectors[m * S + N] = s * vm + c * vn;
         }
     } while(iterations > 0);
-    
+
     return iterations;
 }
diff --git a/python/port/mod/ulab/numpy/numerical.c b/python/port/mod/ulab/numpy/numerical.c
index 39a5f9791..b33d0262b 100644
--- a/python/port/mod/ulab/numpy/numerical.c
+++ b/python/port/mod/ulab/numpy/numerical.c
@@ -22,6 +22,7 @@
 
 #include "../ulab.h"
 #include "../ulab_tools.h"
+#include "./carray/carray_tools.h"
 #include "numerical.h"
 
 enum NUMERICAL_FUNCTION_TYPE {
@@ -130,33 +131,71 @@ static mp_obj_t numerical_all_any(mp_obj_t oin, mp_obj_t axis, uint8_t optype) {
                     size_t l = 0;
                     if(axis == mp_const_none) {
                         do {
-                            mp_float_t value = func(array);
-                            if((value != MICROPY_FLOAT_CONST(0.0)) & !anytype) {
-                                // optype = NUMERICAL_ANY
-                                return mp_const_true;
-                            } else if((value == MICROPY_FLOAT_CONST(0.0)) & anytype) {
-                                // optype == NUMERICAL_ALL
-                                return mp_const_false;
+                            #if ULAB_SUPPORTS_COMPLEX
+                            if(ndarray->dtype == NDARRAY_COMPLEX) {
+                                mp_float_t real = *((mp_float_t *)array);
+                                mp_float_t imag = *((mp_float_t *)(array + sizeof(mp_float_t)));
+                                if(((real != MICROPY_FLOAT_CONST(0.0)) | (imag != MICROPY_FLOAT_CONST(0.0))) & !anytype) {
+                                    // optype = NUMERICAL_ANY
+                                    return mp_const_true;
+                                } else if(((real == MICROPY_FLOAT_CONST(0.0)) & (imag == MICROPY_FLOAT_CONST(0.0))) & anytype) {
+                                    // optype == NUMERICAL_ALL
+                                    return mp_const_false;
+                                }
+                            } else {
+                            #endif
+                                mp_float_t value = func(array);
+                                if((value != MICROPY_FLOAT_CONST(0.0)) & !anytype) {
+                                    // optype = NUMERICAL_ANY
+                                    return mp_const_true;
+                                } else if((value == MICROPY_FLOAT_CONST(0.0)) & anytype) {
+                                    // optype == NUMERICAL_ALL
+                                    return mp_const_false;
+                                }
+                            #if ULAB_SUPPORTS_COMPLEX
                             }
+                            #endif
                             array += _shape_strides.strides[0];
                             l++;
                         } while(l < _shape_strides.shape[0]);
                     } else { // a scalar axis keyword was supplied
                         do {
-                            mp_float_t value = func(array);
-                            if((value != MICROPY_FLOAT_CONST(0.0)) & !anytype) {
-                                // optype == NUMERICAL_ANY
-                                *rarray = 1;
-                                // since we are breaking out of the loop, move the pointer forward
-                                array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l);
-                                break;
-                            } else if((value == MICROPY_FLOAT_CONST(0.0)) & anytype) {
-                                // optype == NUMERICAL_ALL
-                                *rarray = 0;
-                                // since we are breaking out of the loop, move the pointer forward
-                                array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l);
-                                break;
+                            #if ULAB_SUPPORTS_COMPLEX
+                            if(ndarray->dtype == NDARRAY_COMPLEX) {
+                                mp_float_t real = *((mp_float_t *)array);
+                                mp_float_t imag = *((mp_float_t *)(array + sizeof(mp_float_t)));
+                                if(((real != MICROPY_FLOAT_CONST(0.0)) | (imag != MICROPY_FLOAT_CONST(0.0))) & !anytype) {
+                                    // optype = NUMERICAL_ANY
+                                    *rarray = 1;
+                                    // since we are breaking out of the loop, move the pointer forward
+                                    array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l);
+                                    break;
+                                } else if(((real == MICROPY_FLOAT_CONST(0.0)) & (imag == MICROPY_FLOAT_CONST(0.0))) & anytype) {
+                                    // optype == NUMERICAL_ALL
+                                    *rarray = 0;
+                                    // since we are breaking out of the loop, move the pointer forward
+                                    array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l);
+                                    break;
+                                }
+                            } else {
+                            #endif
+                                mp_float_t value = func(array);
+                                if((value != MICROPY_FLOAT_CONST(0.0)) & !anytype) {
+                                    // optype == NUMERICAL_ANY
+                                    *rarray = 1;
+                                    // since we are breaking out of the loop, move the pointer forward
+                                    array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l);
+                                    break;
+                                } else if((value == MICROPY_FLOAT_CONST(0.0)) & anytype) {
+                                    // optype == NUMERICAL_ALL
+                                    *rarray = 0;
+                                    // since we are breaking out of the loop, move the pointer forward
+                                    array += _shape_strides.strides[0] * (_shape_strides.shape[0] - l);
+                                    break;
+                                }
+                            #if ULAB_SUPPORTS_COMPLEX
                             }
+                            #endif
                             array += _shape_strides.strides[0];
                             l++;
                         } while(l < _shape_strides.shape[0]);
@@ -234,6 +273,7 @@ static mp_obj_t numerical_sum_mean_std_iterable(mp_obj_t oin, uint8_t optype, si
 }
 
 static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t axis, uint8_t optype, size_t ddof) {
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
     uint8_t *array = (uint8_t *)ndarray->array;
     shape_strides _shape_strides = tools_reduce_axes(ndarray, axis);
 
@@ -244,7 +284,7 @@ static mp_obj_t numerical_sum_mean_std_ndarray(ndarray_obj_t *ndarray, mp_obj_t
             return mp_obj_new_float(MICROPY_FLOAT_CONST(0.0));
         }
         mp_float_t (*func)(void *) = ndarray_get_float_function(ndarray->dtype);
-        mp_float_t M =MICROPY_FLOAT_CONST(0.0);
+        mp_float_t M = MICROPY_FLOAT_CONST(0.0);
         mp_float_t m = MICROPY_FLOAT_CONST(0.0);
         mp_float_t S = MICROPY_FLOAT_CONST(0.0);
         mp_float_t s = MICROPY_FLOAT_CONST(0.0);
@@ -472,17 +512,12 @@ static mp_obj_t numerical_argmin_argmax_ndarray(ndarray_obj_t *ndarray, mp_obj_t
             }
         }
     } else {
-        int8_t ax = mp_obj_get_int(axis);
-        if(ax < 0) ax += ndarray->ndim;
-        if((ax < 0) || (ax > ndarray->ndim - 1)) {
-            mp_raise_ValueError(translate("axis is out of bounds"));
-        }
+        int8_t ax = tools_get_axis(axis, ndarray->ndim);
 
         uint8_t *array = (uint8_t *)ndarray->array;
-        size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
-        memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
-        int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
-        memset(strides, 0, sizeof(uint32_t)*ULAB_MAX_DIMS);
+        size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+        int32_t *strides = m_new0(int32_t, ULAB_MAX_DIMS);
+
         numerical_reduce_axes(ndarray, ax, shape, strides);
         uint8_t index = ULAB_MAX_DIMS - ndarray->ndim + ax;
 
@@ -507,6 +542,9 @@ static mp_obj_t numerical_argmin_argmax_ndarray(ndarray_obj_t *ndarray, mp_obj_t
         } else {
             RUN_ARGMIN(ndarray, mp_float_t, array, results, rarray, shape, strides, index, optype);
         }
+
+        m_del(int32_t, strides, ULAB_MAX_DIMS);
+
         if(results->len == 1) {
             return mp_binary_get_val_array(results->dtype, results->array, 0);
         }
@@ -555,9 +593,11 @@ static mp_obj_t numerical_function(size_t n_args, const mp_obj_t *pos_args, mp_m
             case NUMERICAL_MAX:
             case NUMERICAL_ARGMIN:
             case NUMERICAL_ARGMAX:
+                COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
                 return numerical_argmin_argmax_ndarray(ndarray, axis, optype);
             case NUMERICAL_SUM:
             case NUMERICAL_MEAN:
+                COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
                 return numerical_sum_mean_std_ndarray(ndarray, axis, optype, 0);
             default:
                 mp_raise_NotImplementedError(translate("operation is not implemented on ndarrays"));
@@ -580,6 +620,7 @@ static mp_obj_t numerical_sort_helper(mp_obj_t oin, mp_obj_t axis, uint8_t inpla
     } else {
         ndarray = ndarray_copy_view(MP_OBJ_TO_PTR(oin));
     }
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
 
     int8_t ax = 0;
     if(axis == mp_const_none) {
@@ -594,30 +635,30 @@ static mp_obj_t numerical_sort_helper(mp_obj_t oin, mp_obj_t axis, uint8_t inpla
         ndarray->ndim = 1;
         #endif
     } else {
-        ax = mp_obj_get_int(axis);
-        if(ax < 0) ax += ndarray->ndim;
-        if((ax < 0) || (ax > ndarray->ndim - 1)) {
-            mp_raise_ValueError(translate("index out of range"));
-        }
+        ax = tools_get_axis(axis, ndarray->ndim);
     }
 
-    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
-    memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
-    int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
-    memset(strides, 0, sizeof(uint32_t)*ULAB_MAX_DIMS);
+    size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+    int32_t *strides = m_new0(int32_t, ULAB_MAX_DIMS);
+
     numerical_reduce_axes(ndarray, ax, shape, strides);
     ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
     // we work with the typed array, so re-scale the stride
     int32_t increment = ndarray->strides[ax] / ndarray->itemsize;
 
     uint8_t *array = (uint8_t *)ndarray->array;
-    if((ndarray->dtype == NDARRAY_UINT8) || (ndarray->dtype == NDARRAY_INT8)) {
-        HEAPSORT(ndarray, uint8_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
-    } else if((ndarray->dtype == NDARRAY_INT16) || (ndarray->dtype == NDARRAY_INT16)) {
-        HEAPSORT(ndarray, uint16_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
-    } else {
-        HEAPSORT(ndarray, mp_float_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
+    if(ndarray->shape[ax]) { // HEAPSORT is only invoked when the sort axis has a non-zero length
+        if((ndarray->dtype == NDARRAY_UINT8) || (ndarray->dtype == NDARRAY_INT8)) {
+            HEAPSORT(ndarray, uint8_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
+        } else if((ndarray->dtype == NDARRAY_UINT16) || (ndarray->dtype == NDARRAY_INT16)) { // was INT16 tested twice, sending UINT16 to the mp_float_t branch; now matches numerical_argsort
+            HEAPSORT(ndarray, uint16_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
+        } else {
+            HEAPSORT(ndarray, mp_float_t, array, shape, strides, ax, increment, ndarray->shape[ax]);
+        }
     }
+
+    m_del(int32_t, strides, ULAB_MAX_DIMS);
+
     if(inplace == 1) {
         return mp_const_none;
     } else {
@@ -682,6 +723,7 @@ mp_obj_t numerical_argsort(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw
     }
 
     ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
     if(args[1].u_obj == mp_const_none) {
         // bail out, though dense arrays could still be sorted
         mp_raise_NotImplementedError(translate("argsort is not implemented for flattened arrays"));
@@ -693,22 +735,17 @@ mp_obj_t numerical_argsort(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw
             mp_raise_ValueError(translate("axis too long"));
         }
     }
-    int8_t ax = mp_obj_get_int(args[1].u_obj);
-    if(ax < 0) ax += ndarray->ndim;
-    if((ax < 0) || (ax > ndarray->ndim - 1)) {
-        mp_raise_ValueError(translate("index out of range"));
-    }
-    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
-    memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
-    int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
-    memset(strides, 0, sizeof(uint32_t)*ULAB_MAX_DIMS);
+    int8_t ax = tools_get_axis(args[1].u_obj, ndarray->ndim);
+
+    size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+    int32_t *strides = m_new0(int32_t, ULAB_MAX_DIMS);
     numerical_reduce_axes(ndarray, ax, shape, strides);
 
     // We could return an NDARRAY_UINT8 array, if all lengths are shorter than 256
     ndarray_obj_t *indices = ndarray_new_ndarray(ndarray->ndim, ndarray->shape, NULL, NDARRAY_UINT16);
-    int32_t *istrides = m_new(int32_t, ULAB_MAX_DIMS);
-    memset(istrides, 0, sizeof(uint32_t)*ULAB_MAX_DIMS);
+    int32_t *istrides = m_new0(int32_t, ULAB_MAX_DIMS);
     numerical_reduce_axes(indices, ax, shape, istrides);
+
     for(uint8_t i=0; i < ULAB_MAX_DIMS; i++) {
         istrides[i] /= sizeof(uint16_t);
     }
@@ -760,13 +797,20 @@ mp_obj_t numerical_argsort(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw
     // reset the array
     iarray = indices->array;
 
-    if((ndarray->dtype == NDARRAY_UINT8) || (ndarray->dtype == NDARRAY_INT8)) {
-        HEAP_ARGSORT(ndarray, uint8_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
-    } else if((ndarray->dtype == NDARRAY_UINT16) || (ndarray->dtype == NDARRAY_INT16)) {
-        HEAP_ARGSORT(ndarray, uint16_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
-    } else {
-        HEAP_ARGSORT(ndarray, mp_float_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
+    if(ndarray->shape[ax]) {
+        if((ndarray->dtype == NDARRAY_UINT8) || (ndarray->dtype == NDARRAY_INT8)) {
+            HEAP_ARGSORT(ndarray, uint8_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
+        } else if((ndarray->dtype == NDARRAY_UINT16) || (ndarray->dtype == NDARRAY_INT16)) {
+            HEAP_ARGSORT(ndarray, uint16_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
+        } else {
+            HEAP_ARGSORT(ndarray, mp_float_t, array, shape, strides, ax, increment, ndarray->shape[ax], iarray, istrides, iincrement);
+        }
     }
+
+    m_del(size_t, shape, ULAB_MAX_DIMS);
+    m_del(int32_t, strides, ULAB_MAX_DIMS);
+    m_del(int32_t, istrides, ULAB_MAX_DIMS);
+
     return MP_OBJ_FROM_PTR(indices);
 }
 
@@ -785,6 +829,8 @@ static mp_obj_t numerical_cross(mp_obj_t _a, mp_obj_t _b) {
     }
     ndarray_obj_t *a = MP_OBJ_TO_PTR(_a);
     ndarray_obj_t *b = MP_OBJ_TO_PTR(_b);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(a->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(b->dtype)
     if((a->ndim != 1) || (b->ndim != 1) || (a->len != b->len) || (a->len != 3)) {
         mp_raise_ValueError(translate("cross is defined for 1D arrays of length 3"));
     }
@@ -873,6 +919,7 @@ mp_obj_t numerical_diff(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
     }
 
     ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
     int8_t ax = args[2].u_int;
     if(ax < 0) ax += ndarray->ndim;
 
@@ -891,13 +938,12 @@ mp_obj_t numerical_diff(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
 
     int8_t *stencil = m_new(int8_t, N+1);
     stencil[0] = 1;
-    for(uint8_t i=1; i < N+1; i++) {
+    for(uint8_t i = 1; i < N+1; i++) {
         stencil[i] = -stencil[i-1]*(N-i+1)/i;
     }
 
-    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
-    memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
-    for(uint8_t i=0; i < ULAB_MAX_DIMS; i++) {
+    size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+    for(uint8_t i = 0; i < ULAB_MAX_DIMS; i++) {
         shape[i] = ndarray->shape[i];
         if(i == index) {
             shape[i] -= N;
@@ -908,8 +954,7 @@ mp_obj_t numerical_diff(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
     uint8_t *rarray = (uint8_t *)results->array;
 
     memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
-    int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
-    memset(strides, 0, sizeof(int32_t)*ULAB_MAX_DIMS);
+    int32_t *strides = m_new0(int32_t, ULAB_MAX_DIMS);
     numerical_reduce_axes(ndarray, ax, shape, strides);
 
     if(ndarray->dtype == NDARRAY_UINT8) {
@@ -956,17 +1001,14 @@ mp_obj_t numerical_flip(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
     ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
     if(args[1].u_obj == mp_const_none) { // flip the flattened array
         results = ndarray_new_linear_array(ndarray->len, ndarray->dtype);
-        ndarray_copy_array(ndarray, results);
+        ndarray_copy_array(ndarray, results, 0);
         uint8_t *rarray = (uint8_t *)results->array;
         rarray += (results->len - 1) * results->itemsize;
         results->array = rarray;
         results->strides[ULAB_MAX_DIMS - 1] = -results->strides[ULAB_MAX_DIMS - 1];
     } else if(mp_obj_is_int(args[1].u_obj)){
-        int8_t ax = mp_obj_get_int(args[1].u_obj);
-        if(ax < 0) ax += ndarray->ndim;
-        if((ax < 0) || (ax > ndarray->ndim - 1)) {
-            mp_raise_ValueError(translate("index out of range"));
-        }
+        int8_t ax = tools_get_axis(args[1].u_obj, ndarray->ndim);
+
         ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
         int32_t offset = (ndarray->shape[ax] - 1) * ndarray->strides[ax];
         results = ndarray_new_view(ndarray, ndarray->ndim, ndarray->shape, ndarray->strides, offset);
@@ -1044,17 +1086,16 @@ mp_obj_t numerical_median(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_
         }
         return mp_obj_new_float(median);
     } else {
-        int8_t ax = mp_obj_get_int(args[1].u_obj);
-        if(ax < 0) ax += ndarray->ndim;
-        // here we can save the exception, because if the axis is out of range,
-        // then numerical_sort_helper has already taken care of the issue
-        size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
-        memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
-        int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
-        memset(strides, 0, sizeof(uint32_t)*ULAB_MAX_DIMS);
+        int8_t ax = tools_get_axis(args[1].u_obj, ndarray->ndim);
+
+        size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+        int32_t *strides = m_new0(int32_t, ULAB_MAX_DIMS);
         numerical_reduce_axes(ndarray, ax, shape, strides);
+
         ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
         ndarray_obj_t *results = ndarray_new_dense_ndarray(ndarray->ndim-1, shape, NDARRAY_FLOAT);
+        m_del(size_t, shape, ULAB_MAX_DIMS);
+
         mp_float_t *rarray = (mp_float_t *)results->array;
 
         uint8_t *array = (uint8_t *)ndarray->array;
@@ -1200,21 +1241,14 @@ mp_obj_t numerical_roll(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
         } while(i <  ndarray->shape[ULAB_MAX_DIMS - 4]);
         #endif
     } else if(mp_obj_is_int(args[2].u_obj)){
-        int8_t ax = mp_obj_get_int(args[2].u_obj);
-        if(ax < 0) ax += ndarray->ndim;
-        if((ax < 0) || (ax > ndarray->ndim - 1)) {
-            mp_raise_ValueError(translate("index out of range"));
-        }
-        size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
-        memset(shape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
-        int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
-        memset(strides, 0, sizeof(int32_t)*ULAB_MAX_DIMS);
+        int8_t ax = tools_get_axis(args[2].u_obj, ndarray->ndim);
+
+        size_t *shape = m_new0(size_t, ULAB_MAX_DIMS);
+        int32_t *strides = m_new0(int32_t, ULAB_MAX_DIMS);
         numerical_reduce_axes(ndarray, ax, shape, strides);
 
-        size_t *rshape = m_new(size_t, ULAB_MAX_DIMS);
-        memset(rshape, 0, sizeof(size_t)*ULAB_MAX_DIMS);
-        int32_t *rstrides = m_new(int32_t, ULAB_MAX_DIMS);
-        memset(rstrides, 0, sizeof(int32_t)*ULAB_MAX_DIMS);
+        size_t *rshape = m_new0(size_t, ULAB_MAX_DIMS);
+        int32_t *rstrides = m_new0(int32_t, ULAB_MAX_DIMS);
         numerical_reduce_axes(results, ax, rshape, rstrides);
 
         ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
@@ -1275,9 +1309,16 @@ mp_obj_t numerical_roll(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
             i++;
         } while(i < shape[ULAB_MAX_DIMS - 3]);
         #endif
+
+        m_del(size_t, shape, ULAB_MAX_DIMS);
+        m_del(int32_t, strides, ULAB_MAX_DIMS);
+        m_del(size_t, rshape, ULAB_MAX_DIMS);
+        m_del(int32_t, rstrides, ULAB_MAX_DIMS);
+
     } else {
         mp_raise_TypeError(translate("wrong axis index"));
     }
+
     return results;
 }
 
diff --git a/python/port/mod/ulab/numpy/numerical.h b/python/port/mod/ulab/numpy/numerical.h
index 8d2971cd4..186c817b0 100644
--- a/python/port/mod/ulab/numpy/numerical.h
+++ b/python/port/mod/ulab/numpy/numerical.h
@@ -155,6 +155,7 @@
     type *_array = (type *)array;\
     type tmp;\
     uint16_t itmp, c, q = (N), p, r = (N) >> 1;\
+    assert(N);\
     for (;;) {\
         if (r > 0) {\
             r--;\
diff --git a/python/port/mod/ulab/numpy/numpy.c b/python/port/mod/ulab/numpy/numpy.c
index a6559ff8a..e1de9e6dd 100644
--- a/python/port/mod/ulab/numpy/numpy.c
+++ b/python/port/mod/ulab/numpy/numpy.c
@@ -8,7 +8,7 @@
  *
  * Copyright (c) 2020 Jeff Epler for Adafruit Industries
  *               2020 Scott Shawcroft for Adafruit Industries
- *               2020-2021 Zoltán Vörös
+ *               2020-2022 Zoltán Vörös
  *               2020 Taku Fukada
 */
 
@@ -17,11 +17,13 @@
 #include "py/runtime.h"
 
 #include "numpy.h"
-#include "../ulab_create.h"
 #include "approx.h"
+#include "carray/carray.h"
 #include "compare.h"
+#include "create.h"
 #include "fft/fft.h"
 #include "filter.h"
+#include "io/io.h"
 #include "linalg/linalg.h"
 #include "numerical.h"
 #include "stats.h"
@@ -125,6 +127,9 @@ static const mp_rom_map_elem_t ulab_numpy_globals_table[] = {
     { MP_ROM_QSTR(MP_QSTR_uint16), MP_ROM_INT(NDARRAY_UINT16) },
     { MP_ROM_QSTR(MP_QSTR_int16), MP_ROM_INT(NDARRAY_INT16) },
     { MP_ROM_QSTR(MP_QSTR_float), MP_ROM_INT(NDARRAY_FLOAT) },
+    #if ULAB_SUPPORTS_COMPLEX
+        { MP_ROM_QSTR(MP_QSTR_complex), MP_ROM_INT(NDARRAY_COMPLEX) },
+    #endif
     // modules of numpy
     #if ULAB_NUMPY_HAS_FFT_MODULE
         { MP_ROM_QSTR(MP_QSTR_fft), MP_ROM_PTR(&ulab_fft_module) },
@@ -142,9 +147,15 @@ static const mp_rom_map_elem_t ulab_numpy_globals_table[] = {
     #if ULAB_NUMPY_HAS_ARANGE
         { MP_ROM_QSTR(MP_QSTR_arange), (mp_obj_t)&create_arange_obj },
     #endif
+    #if ULAB_NUMPY_HAS_COMPRESS
+        { MP_ROM_QSTR(MP_QSTR_compress), (mp_obj_t)&transform_compress_obj },
+    #endif
     #if ULAB_NUMPY_HAS_CONCATENATE
         { MP_ROM_QSTR(MP_QSTR_concatenate), (mp_obj_t)&create_concatenate_obj },
     #endif
+    #if ULAB_NUMPY_HAS_DELETE
+        { MP_ROM_QSTR(MP_QSTR_delete), (mp_obj_t)&transform_delete_obj },
+    #endif
     #if ULAB_NUMPY_HAS_DIAG
         #if ULAB_MAX_DIMS > 1
             { MP_ROM_QSTR(MP_QSTR_diag), (mp_obj_t)&create_diag_obj },
@@ -224,6 +235,9 @@ static const mp_rom_map_elem_t ulab_numpy_globals_table[] = {
     #if ULAB_NUMPY_HAS_ARGSORT
         { MP_OBJ_NEW_QSTR(MP_QSTR_argsort), (mp_obj_t)&numerical_argsort_obj },
     #endif
+    #if ULAB_NUMPY_HAS_ASARRAY
+        { MP_OBJ_NEW_QSTR(MP_QSTR_asarray), (mp_obj_t)&create_asarray_obj },
+    #endif
     #if ULAB_NUMPY_HAS_CROSS
         { MP_OBJ_NEW_QSTR(MP_QSTR_cross), (mp_obj_t)&numerical_cross_obj },
     #endif
@@ -243,6 +257,12 @@ static const mp_rom_map_elem_t ulab_numpy_globals_table[] = {
     #if ULAB_NUMPY_HAS_FLIP
         { MP_OBJ_NEW_QSTR(MP_QSTR_flip), (mp_obj_t)&numerical_flip_obj },
     #endif
+    #if ULAB_NUMPY_HAS_LOAD
+        { MP_OBJ_NEW_QSTR(MP_QSTR_load), (mp_obj_t)&io_load_obj },
+    #endif
+    #if ULAB_NUMPY_HAS_LOADTXT
+        { MP_OBJ_NEW_QSTR(MP_QSTR_loadtxt), (mp_obj_t)&io_loadtxt_obj },
+    #endif
     #if ULAB_NUMPY_HAS_MINMAX
         { MP_OBJ_NEW_QSTR(MP_QSTR_max), (mp_obj_t)&numerical_max_obj },
     #endif
@@ -258,6 +278,15 @@ static const mp_rom_map_elem_t ulab_numpy_globals_table[] = {
     #if ULAB_NUMPY_HAS_ROLL
         { MP_OBJ_NEW_QSTR(MP_QSTR_roll), (mp_obj_t)&numerical_roll_obj },
     #endif
+    #if ULAB_NUMPY_HAS_SAVE
+        { MP_OBJ_NEW_QSTR(MP_QSTR_save), (mp_obj_t)&io_save_obj },
+    #endif
+    #if ULAB_NUMPY_HAS_SAVETXT
+        { MP_OBJ_NEW_QSTR(MP_QSTR_savetxt), (mp_obj_t)&io_savetxt_obj },
+    #endif
+    #if ULAB_NUMPY_HAS_SIZE
+        { MP_OBJ_NEW_QSTR(MP_QSTR_size), (mp_obj_t)&transform_size_obj },
+    #endif
     #if ULAB_NUMPY_HAS_SORT
         { MP_OBJ_NEW_QSTR(MP_QSTR_sort), (mp_obj_t)&numerical_sort_obj },
     #endif
@@ -276,81 +305,94 @@ static const mp_rom_map_elem_t ulab_numpy_globals_table[] = {
     #endif
     // functions of the vector sub-module
     #if ULAB_NUMPY_HAS_ACOS
-    { MP_OBJ_NEW_QSTR(MP_QSTR_acos), (mp_obj_t)&vectorise_acos_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_acos), (mp_obj_t)&vector_acos_obj },
     #endif
     #if ULAB_NUMPY_HAS_ACOSH
-    { MP_OBJ_NEW_QSTR(MP_QSTR_acosh), (mp_obj_t)&vectorise_acosh_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_acosh), (mp_obj_t)&vector_acosh_obj },
     #endif
     #if ULAB_NUMPY_HAS_ARCTAN2
-    { MP_OBJ_NEW_QSTR(MP_QSTR_arctan2), (mp_obj_t)&vectorise_arctan2_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_arctan2), (mp_obj_t)&vector_arctan2_obj },
     #endif
     #if ULAB_NUMPY_HAS_AROUND
-    { MP_OBJ_NEW_QSTR(MP_QSTR_around), (mp_obj_t)&vectorise_around_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_around), (mp_obj_t)&vector_around_obj },
     #endif
     #if ULAB_NUMPY_HAS_ASIN
-    { MP_OBJ_NEW_QSTR(MP_QSTR_asin), (mp_obj_t)&vectorise_asin_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_asin), (mp_obj_t)&vector_asin_obj },
     #endif
     #if ULAB_NUMPY_HAS_ASINH
-    { MP_OBJ_NEW_QSTR(MP_QSTR_asinh), (mp_obj_t)&vectorise_asinh_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_asinh), (mp_obj_t)&vector_asinh_obj },
     #endif
     #if ULAB_NUMPY_HAS_ATAN
-    { MP_OBJ_NEW_QSTR(MP_QSTR_atan), (mp_obj_t)&vectorise_atan_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_atan), (mp_obj_t)&vector_atan_obj },
     #endif
     #if ULAB_NUMPY_HAS_ATANH
-    { MP_OBJ_NEW_QSTR(MP_QSTR_atanh), (mp_obj_t)&vectorise_atanh_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_atanh), (mp_obj_t)&vector_atanh_obj },
     #endif
     #if ULAB_NUMPY_HAS_CEIL
-    { MP_OBJ_NEW_QSTR(MP_QSTR_ceil), (mp_obj_t)&vectorise_ceil_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_ceil), (mp_obj_t)&vector_ceil_obj },
     #endif
     #if ULAB_NUMPY_HAS_COS
-    { MP_OBJ_NEW_QSTR(MP_QSTR_cos), (mp_obj_t)&vectorise_cos_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_cos), (mp_obj_t)&vector_cos_obj },
     #endif
     #if ULAB_NUMPY_HAS_COSH
-    { MP_OBJ_NEW_QSTR(MP_QSTR_cosh), (mp_obj_t)&vectorise_cosh_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_cosh), (mp_obj_t)&vector_cosh_obj },
     #endif
     #if ULAB_NUMPY_HAS_DEGREES
-    { MP_OBJ_NEW_QSTR(MP_QSTR_degrees), (mp_obj_t)&vectorise_degrees_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_degrees), (mp_obj_t)&vector_degrees_obj },
     #endif
     #if ULAB_NUMPY_HAS_EXP
-    { MP_OBJ_NEW_QSTR(MP_QSTR_exp), (mp_obj_t)&vectorise_exp_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_exp), (mp_obj_t)&vector_exp_obj },
     #endif
     #if ULAB_NUMPY_HAS_EXPM1
-    { MP_OBJ_NEW_QSTR(MP_QSTR_expm1), (mp_obj_t)&vectorise_expm1_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_expm1), (mp_obj_t)&vector_expm1_obj },
     #endif
     #if ULAB_NUMPY_HAS_FLOOR
-    { MP_OBJ_NEW_QSTR(MP_QSTR_floor), (mp_obj_t)&vectorise_floor_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_floor), (mp_obj_t)&vector_floor_obj },
     #endif
     #if ULAB_NUMPY_HAS_LOG
-    { MP_OBJ_NEW_QSTR(MP_QSTR_log), (mp_obj_t)&vectorise_log_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_log), (mp_obj_t)&vector_log_obj },
     #endif
     #if ULAB_NUMPY_HAS_LOG10
-    { MP_OBJ_NEW_QSTR(MP_QSTR_log10), (mp_obj_t)&vectorise_log10_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_log10), (mp_obj_t)&vector_log10_obj },
     #endif
     #if ULAB_NUMPY_HAS_LOG2
-    { MP_OBJ_NEW_QSTR(MP_QSTR_log2), (mp_obj_t)&vectorise_log2_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_log2), (mp_obj_t)&vector_log2_obj },
     #endif
     #if ULAB_NUMPY_HAS_RADIANS
-    { MP_OBJ_NEW_QSTR(MP_QSTR_radians), (mp_obj_t)&vectorise_radians_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_radians), (mp_obj_t)&vector_radians_obj },
     #endif
     #if ULAB_NUMPY_HAS_SIN
-    { MP_OBJ_NEW_QSTR(MP_QSTR_sin), (mp_obj_t)&vectorise_sin_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_sin), (mp_obj_t)&vector_sin_obj },
     #endif
     #if ULAB_NUMPY_HAS_SINH
-    { MP_OBJ_NEW_QSTR(MP_QSTR_sinh), (mp_obj_t)&vectorise_sinh_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_sinh), (mp_obj_t)&vector_sinh_obj },
     #endif
     #if ULAB_NUMPY_HAS_SQRT
-    { MP_OBJ_NEW_QSTR(MP_QSTR_sqrt), (mp_obj_t)&vectorise_sqrt_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_sqrt), (mp_obj_t)&vector_sqrt_obj },
     #endif
     #if ULAB_NUMPY_HAS_TAN
-    { MP_OBJ_NEW_QSTR(MP_QSTR_tan), (mp_obj_t)&vectorise_tan_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_tan), (mp_obj_t)&vector_tan_obj },
     #endif
     #if ULAB_NUMPY_HAS_TANH
-    { MP_OBJ_NEW_QSTR(MP_QSTR_tanh), (mp_obj_t)&vectorise_tanh_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_tanh), (mp_obj_t)&vector_tanh_obj },
     #endif
     #if ULAB_NUMPY_HAS_VECTORIZE
-    { MP_OBJ_NEW_QSTR(MP_QSTR_vectorize), (mp_obj_t)&vectorise_vectorize_obj },
+    { MP_OBJ_NEW_QSTR(MP_QSTR_vectorize), (mp_obj_t)&vector_vectorize_obj },
+    #endif
+    #if ULAB_SUPPORTS_COMPLEX
+        #if ULAB_NUMPY_HAS_REAL
+        { MP_OBJ_NEW_QSTR(MP_QSTR_real), (mp_obj_t)&carray_real_obj },
+        #endif
+        #if ULAB_NUMPY_HAS_IMAG
+        { MP_OBJ_NEW_QSTR(MP_QSTR_imag), (mp_obj_t)&carray_imag_obj },
+        #endif
+        #if ULAB_NUMPY_HAS_CONJUGATE
+            { MP_ROM_QSTR(MP_QSTR_conjugate), (mp_obj_t)&carray_conjugate_obj },
+        #endif
+        #if ULAB_NUMPY_HAS_SORT_COMPLEX
+            { MP_ROM_QSTR(MP_QSTR_sort_complex), (mp_obj_t)&carray_sort_complex_obj },
+        #endif
     #endif
-
 };
 
 static MP_DEFINE_CONST_DICT(mp_module_ulab_numpy_globals, ulab_numpy_globals_table);
diff --git a/python/port/mod/ulab/numpy/poly.c b/python/port/mod/ulab/numpy/poly.c
index 7ea7feb1a..97ee5c75f 100644
--- a/python/port/mod/ulab/numpy/poly.c
+++ b/python/port/mod/ulab/numpy/poly.c
@@ -19,6 +19,7 @@
 #include "../ulab.h"
 #include "linalg/linalg_tools.h"
 #include "../ulab_tools.h"
+#include "carray/carray_tools.h"
 #include "poly.h"
 
 #if ULAB_NUMPY_HAS_POLYFIT
@@ -27,6 +28,12 @@ mp_obj_t poly_polyfit(size_t n_args, const mp_obj_t *args) {
     if(!ndarray_object_is_array_like(args[0])) {
         mp_raise_ValueError(translate("input data must be an iterable"));
     }
+    #if ULAB_SUPPORTS_COMPLEX
+    if(mp_obj_is_type(args[0], &ulab_ndarray_type)) {
+        ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0]);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+    }
+    #endif
     size_t lenx = 0, leny = 0;
     uint8_t deg = 0;
     mp_float_t *x, *XT, *y, *prod;
@@ -142,6 +149,17 @@ mp_obj_t poly_polyval(mp_obj_t o_p, mp_obj_t o_x) {
     if(!ndarray_object_is_array_like(o_p) || !ndarray_object_is_array_like(o_x)) {
         mp_raise_TypeError(translate("inputs are not iterable"));
     }
+    #if ULAB_SUPPORTS_COMPLEX
+    ndarray_obj_t *input;
+    if(mp_obj_is_type(o_p, &ulab_ndarray_type)) {
+        input = MP_OBJ_TO_PTR(o_p);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(input->dtype)
+    }
+    if(mp_obj_is_type(o_x, &ulab_ndarray_type)) {
+        input = MP_OBJ_TO_PTR(o_x);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(input->dtype)
+    }
+    #endif
     // p had better be a one-dimensional standard iterable
     uint8_t plen = mp_obj_get_int(mp_obj_len_maybe(o_p));
     mp_float_t *p = m_new(mp_float_t, plen);
@@ -164,7 +182,7 @@ mp_obj_t poly_polyval(mp_obj_t o_p, mp_obj_t o_x) {
 
         mp_float_t (*func)(void *) = ndarray_get_float_function(source->dtype);
 
-        // TODO: these loops are really nothing, but the re-implementation of
+        // TODO: these loops are really nothing, but the re-implementation of
         // ITERATE_VECTOR from vectorise.c. We could pass a function pointer here
         #if ULAB_MAX_DIMS > 3
         size_t i = 0;
diff --git a/python/port/mod/ulab/numpy/stats.c b/python/port/mod/ulab/numpy/stats.c
index a63964fea..2d3488937 100644
--- a/python/port/mod/ulab/numpy/stats.c
+++ b/python/port/mod/ulab/numpy/stats.c
@@ -21,6 +21,7 @@
 
 #include "../ulab.h"
 #include "../ulab_tools.h"
+#include "carray/carray_tools.h"
 #include "stats.h"
 
 #if ULAB_MAX_DIMS > 1
@@ -36,6 +37,7 @@
 
 static mp_obj_t stats_trace(mp_obj_t oin) {
     ndarray_obj_t *ndarray = tools_object_is_square(oin);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
     mp_float_t trace = 0.0;
     for(size_t i=0; i < ndarray->shape[ULAB_MAX_DIMS - 1]; i++) {
         int32_t pos = i * (ndarray->strides[ULAB_MAX_DIMS - 1] + ndarray->strides[ULAB_MAX_DIMS - 2]);
diff --git a/python/port/mod/ulab/numpy/transform.c b/python/port/mod/ulab/numpy/transform.c
index 2c2d2dbdd..4f27ef346 100644
--- a/python/port/mod/ulab/numpy/transform.c
+++ b/python/port/mod/ulab/numpy/transform.c
@@ -9,17 +9,337 @@
  *
 */
 
+#include <unistd.h>
+#include <math.h>
 #include <stdlib.h>
 #include <string.h>
-#include <math.h>
 #include "py/obj.h"
 #include "py/runtime.h"
 #include "py/misc.h"
 
 #include "../ulab.h"
 #include "../ulab_tools.h"
+#include "carray/carray_tools.h"
+#include "numerical.h"
 #include "transform.h"
 
+#if ULAB_NUMPY_HAS_COMPRESS
+/*
+ * compress(condition, a, *, axis=None)
+ *
+ * Copies into a new ndarray those entries of `a`, for which the matching entry
+ * of the iterable `condition` is truthy.
+ *  - axis=None: `condition` must hold exactly a.len entries, and the result is
+ *    a linear array of the selected elements.
+ *  - axis given: `condition` must hold a.shape[axis] entries, and the result
+ *    has the shape of `a`, except along `axis`, where only the selected
+ *    entries are kept.
+ *
+ * Raises TypeError, if the second argument is not an ndarray, and ValueError,
+ * if the length of `condition` does not match.
+ */
+static mp_obj_t transform_compress(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    mp_obj_t condition = args[0].u_obj;
+
+    if(!mp_obj_is_type(args[1].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("wrong input type"));
+    }
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[1].u_obj);
+    uint8_t *array = (uint8_t *)ndarray->array;
+
+    mp_obj_t axis = args[2].u_obj;
+
+    size_t len = MP_OBJ_SMALL_INT_VALUE(mp_obj_len_maybe(condition));
+    // position of the requested axis in the ULAB_MAX_DIMS-long shape/strides arrays
+    int8_t shift_ax = 0;
+
+    if(axis != mp_const_none) {
+        int8_t ax = tools_get_axis(axis, ndarray->ndim);
+        shift_ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
+    }
+
+    if(((axis == mp_const_none) && (len != ndarray->len)) ||
+        ((axis != mp_const_none) && (len != ndarray->shape[shift_ax]))) {
+        mp_raise_ValueError(translate("wrong length of condition array"));
+    }
+
+    // first pass over the condition: the number of truthy entries fixes the
+    // size of the result
+    size_t true_count = 0;
+    mp_obj_iter_buf_t iter_buf;
+    mp_obj_t item, iterable = mp_getiter(condition, &iter_buf);
+    while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
+        if(mp_obj_is_true(item)) {
+            true_count++;
+        }
+    }
+
+    // restart the iteration for the copy pass
+    iterable = mp_getiter(condition, &iter_buf);
+
+    ndarray_obj_t *result = NULL;
+
+    // work on copies of shape and strides, because they are permuted below
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    memcpy(shape, ndarray->shape, ULAB_MAX_DIMS * sizeof(size_t));
+
+    size_t *rshape = m_new(size_t, ULAB_MAX_DIMS);
+    memcpy(rshape, ndarray->shape, ULAB_MAX_DIMS * sizeof(size_t));
+
+    int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
+    memcpy(strides, ndarray->strides, ULAB_MAX_DIMS * sizeof(int32_t));
+
+    int32_t *rstrides = m_new0(int32_t, ULAB_MAX_DIMS);
+
+    if(axis == mp_const_none) {
+        result = ndarray_new_linear_array(true_count, ndarray->dtype);
+
+        // the output pointer only ever moves forward by one item: all other
+        // result strides stay 0, and the zero length cancels the rewind of the
+        // innermost axis
+        rstrides[ULAB_MAX_DIMS - 1] = ndarray->itemsize;
+        rshape[ULAB_MAX_DIMS - 1] = 0;
+    } else {
+        rshape[shift_ax] = true_count;
+
+        result = ndarray_new_dense_ndarray(ndarray->ndim, rshape, ndarray->dtype);
+
+        // move the compressed axis to the innermost position, so that the
+        // innermost loop runs along it
+        SWAP(size_t, shape[shift_ax], shape[ULAB_MAX_DIMS - 1]);
+        SWAP(size_t, rshape[shift_ax], rshape[ULAB_MAX_DIMS - 1]);
+        SWAP(int32_t, strides[shift_ax], strides[ULAB_MAX_DIMS - 1]);
+
+        memcpy(rstrides, result->strides, ULAB_MAX_DIMS * sizeof(int32_t));
+        SWAP(int32_t, rstrides[shift_ax], rstrides[ULAB_MAX_DIMS - 1]);
+    }
+
+    uint8_t *rarray = (uint8_t *)result->array;
+
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                if(axis != mp_const_none) {
+                    // the same condition is applied to each run along the axis
+                    iterable = mp_getiter(condition, &iter_buf);
+                }
+                do {
+                    item = mp_iternext(iterable);
+                    // the body runs once even on an axis of length 0, in which
+                    // case the iterator is already exhausted
+                    if((item != MP_OBJ_STOP_ITERATION) && mp_obj_is_true(item)) {
+                        memcpy(rarray, array, ndarray->itemsize);
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                    }
+                    array += strides[ULAB_MAX_DIMS - 1];
+                    l++;
+                } while(l < shape[ULAB_MAX_DIMS - 1]);
+            #if ULAB_MAX_DIMS > 1
+                array -= strides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS - 1];
+                array += strides[ULAB_MAX_DIMS - 2];
+                rarray -= rstrides[ULAB_MAX_DIMS - 1] * rshape[ULAB_MAX_DIMS - 1];
+                rarray += rstrides[ULAB_MAX_DIMS - 2];
+                k++;
+            } while(k < shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 2
+            array -= strides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS - 2];
+            array += strides[ULAB_MAX_DIMS - 3];
+            rarray -= rstrides[ULAB_MAX_DIMS - 2] * rshape[ULAB_MAX_DIMS - 2];
+            rarray += rstrides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j < shape[ULAB_MAX_DIMS - 3]);
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        array -= strides[ULAB_MAX_DIMS - 3] * shape[ULAB_MAX_DIMS - 3];
+        array += strides[ULAB_MAX_DIMS - 4];
+        // rewind the third axis and step along the fourth, exactly as is done
+        // for the input pointer above
+        rarray -= rstrides[ULAB_MAX_DIMS - 3] * rshape[ULAB_MAX_DIMS - 3];
+        rarray += rstrides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i < shape[ULAB_MAX_DIMS - 4]);
+    #endif
+
+    m_del(size_t, shape, ULAB_MAX_DIMS);
+    m_del(size_t, rshape, ULAB_MAX_DIMS);
+    m_del(int32_t, strides, ULAB_MAX_DIMS);
+    m_del(int32_t, rstrides, ULAB_MAX_DIMS);
+
+    return MP_OBJ_FROM_PTR(result);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(transform_compress_obj, 2, transform_compress);
+#endif /* ULAB_NUMPY_HAS_COMPRESS */
+
+#if ULAB_NUMPY_HAS_DELETE
+/*
+ * delete(a, indices, *, axis=None)
+ *
+ * Returns a new ndarray with the entries of `a` at the given positions left
+ * out. `indices` is either a single integer, or an object with a length that
+ * yields integers; negative values count from the end.
+ *  - axis=None: the positions refer to the a.len elements in iteration order,
+ *    and the result is a linear array.
+ *  - axis given: the positions refer to `axis`, and the result has the shape
+ *    of `a`, except along `axis`.
+ * An index that is listed more than once is removed only once.
+ *
+ * Raises TypeError, if the first argument is not an ndarray, or `indices` has
+ * no length, and ValueError, if there are more indices than entries, or an
+ * index is out of bounds.
+ */
+static mp_obj_t transform_delete(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    if(!mp_obj_is_type(args[0].u_obj, &ulab_ndarray_type)) {
+        mp_raise_TypeError(translate("first argument must be an ndarray"));
+    }
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[0].u_obj);
+    uint8_t *array = (uint8_t *)ndarray->array;
+
+    mp_obj_t indices = args[1].u_obj;
+
+    mp_obj_t axis = args[2].u_obj;
+
+    // position of the requested axis in the ULAB_MAX_DIMS-long shape/strides
+    // arrays; only meaningful, if an axis was supplied
+    int8_t shift_ax = 0;
+
+    // number of positions that the indices can refer to
+    size_t axis_len;
+
+    if(axis != mp_const_none) {
+        int8_t ax = tools_get_axis(axis, ndarray->ndim);
+        shift_ax = ULAB_MAX_DIMS - ndarray->ndim + ax;
+        axis_len = ndarray->shape[shift_ax];
+    } else {
+        axis_len = ndarray->len;
+    }
+
+    size_t index_len;
+    if(mp_obj_is_int(indices)) {
+        index_len = 1;
+    } else {
+        mp_obj_t len_obj = mp_obj_len_maybe(indices);
+        if(len_obj == MP_OBJ_NULL) {
+            mp_raise_TypeError(translate("wrong index type"));
+        }
+        index_len = MP_OBJ_SMALL_INT_VALUE(len_obj);
+    }
+
+    if(index_len > axis_len) {
+        mp_raise_ValueError(translate("wrong length of index array"));
+    }
+
+    // index_array always points to the start of the buffer; `filled` counts
+    // the entries that have been written
+    size_t *index_array = m_new(size_t, index_len);
+    size_t filled = 0;
+
+    if(mp_obj_is_int(indices)) {
+        ssize_t value = (ssize_t)mp_obj_get_int(indices);
+        if(value < 0) {
+            value += (ssize_t)axis_len;
+        }
+        // valid positions are 0 ... axis_len - 1
+        if((value < 0) || (value >= (ssize_t)axis_len)) {
+            mp_raise_ValueError(translate("index is out of bounds"));
+        }
+        index_array[filled++] = (size_t)value;
+    } else {
+        mp_obj_iter_buf_t iter_buf;
+        mp_obj_t item, iterable = mp_getiter(indices, &iter_buf);
+        while((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
+            ssize_t value = (ssize_t)mp_obj_get_int(item);
+            if(value < 0) {
+                value += (ssize_t)axis_len;
+            }
+            if((value < 0) || (value >= (ssize_t)axis_len)) {
+                mp_raise_ValueError(translate("index is out of bounds"));
+            }
+            // do not trust the iterator to stop at the reported length
+            if(filled == index_len) {
+                mp_raise_ValueError(translate("wrong length of index array"));
+            }
+            index_array[filled++] = (size_t)value;
+        }
+    }
+
+    // sort the array, since it is not guaranteed that the input is sorted;
+    // there is nothing to sort, if fewer than two entries were supplied
+    if(filled > 1) {
+        HEAPSORT1(size_t, index_array, 1, filled);
+    }
+
+    // drop repeated indices: an entry can be removed only once, and the size
+    // of the result must match the number of entries that are actually skipped
+    size_t index_count = 0;
+    for(size_t n = 0; n < filled; n++) {
+        if((index_count == 0) || (index_array[n] != index_array[index_count - 1])) {
+            index_array[index_count++] = index_array[n];
+        }
+    }
+
+    // work on copies of shape and strides, because they are permuted below
+    size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
+    memcpy(shape, ndarray->shape, ULAB_MAX_DIMS * sizeof(size_t));
+
+    size_t *rshape = m_new(size_t, ULAB_MAX_DIMS);
+    memcpy(rshape, ndarray->shape, ULAB_MAX_DIMS * sizeof(size_t));
+
+    int32_t *strides = m_new(int32_t, ULAB_MAX_DIMS);
+    memcpy(strides, ndarray->strides, ULAB_MAX_DIMS * sizeof(int32_t));
+
+    int32_t *rstrides = m_new0(int32_t, ULAB_MAX_DIMS);
+
+    ndarray_obj_t *result = NULL;
+
+    if(axis == mp_const_none) {
+        result = ndarray_new_linear_array(ndarray->len - index_count, ndarray->dtype);
+        // the output pointer only ever moves forward by one item: with a
+        // zeroed shape, and all other strides 0, the rewinds below are no-ops
+        rstrides[ULAB_MAX_DIMS - 1] = ndarray->itemsize;
+        memset(rshape, 0, sizeof(size_t) * ULAB_MAX_DIMS);
+    } else {
+        rshape[shift_ax] = shape[shift_ax] - index_count;
+
+        result = ndarray_new_dense_ndarray(ndarray->ndim, rshape, ndarray->dtype);
+
+        // move the affected axis to the innermost position, so that the
+        // innermost loop runs along it
+        SWAP(size_t, shape[shift_ax], shape[ULAB_MAX_DIMS - 1]);
+        SWAP(size_t, rshape[shift_ax], rshape[ULAB_MAX_DIMS - 1]);
+        SWAP(int32_t, strides[shift_ax], strides[ULAB_MAX_DIMS - 1]);
+
+        memcpy(rstrides, result->strides, ULAB_MAX_DIMS * sizeof(int32_t));
+        SWAP(int32_t, rstrides[shift_ax], rstrides[ULAB_MAX_DIMS - 1]);
+    }
+
+    uint8_t *rarray = (uint8_t *)result->array;
+    // next entry of index_array that is still to be skipped
+    size_t index_pos = 0;
+    // position of the current element: along the axis, if one was given,
+    // otherwise in the whole array
+    size_t count = 0;
+
+    #if ULAB_MAX_DIMS > 3
+    size_t i = 0;
+    do {
+    #endif
+        #if ULAB_MAX_DIMS > 2
+        size_t j = 0;
+        do {
+        #endif
+            #if ULAB_MAX_DIMS > 1
+            size_t k = 0;
+            do {
+            #endif
+                size_t l = 0;
+                do {
+                    if((index_pos < index_count) && (count == index_array[index_pos])) {
+                        // this element is deleted: skip the copy
+                        index_pos++;
+                    } else {
+                        memcpy(rarray, array, ndarray->itemsize);
+                        rarray += rstrides[ULAB_MAX_DIMS - 1];
+                    }
+                    array += strides[ULAB_MAX_DIMS - 1];
+                    l++;
+                    count++;
+                } while(l < shape[ULAB_MAX_DIMS - 1]);
+                if(axis != mp_const_none) {
+                    // the same indices apply to each run along the axis
+                    index_pos = 0;
+                    count = 0;
+                }
+            #if ULAB_MAX_DIMS > 1
+                array -= strides[ULAB_MAX_DIMS - 1] * shape[ULAB_MAX_DIMS - 1];
+                array += strides[ULAB_MAX_DIMS - 2];
+                rarray -= rstrides[ULAB_MAX_DIMS - 1] * rshape[ULAB_MAX_DIMS - 1];
+                rarray += rstrides[ULAB_MAX_DIMS - 2];
+                k++;
+            } while(k < shape[ULAB_MAX_DIMS - 2]);
+            #endif
+        #if ULAB_MAX_DIMS > 2
+            array -= strides[ULAB_MAX_DIMS - 2] * shape[ULAB_MAX_DIMS - 2];
+            array += strides[ULAB_MAX_DIMS - 3];
+            rarray -= rstrides[ULAB_MAX_DIMS - 2] * rshape[ULAB_MAX_DIMS - 2];
+            rarray += rstrides[ULAB_MAX_DIMS - 3];
+            j++;
+        } while(j < shape[ULAB_MAX_DIMS - 3]);
+        #endif
+    #if ULAB_MAX_DIMS > 3
+        array -= strides[ULAB_MAX_DIMS - 3] * shape[ULAB_MAX_DIMS - 3];
+        array += strides[ULAB_MAX_DIMS - 4];
+        rarray -= rstrides[ULAB_MAX_DIMS - 3] * rshape[ULAB_MAX_DIMS - 3];
+        rarray += rstrides[ULAB_MAX_DIMS - 4];
+        i++;
+    } while(i < shape[ULAB_MAX_DIMS - 4]);
+    #endif
+
+    // TODO: deleting shape generates a seg fault
+    // NOTE(review): possibly a side effect of the index buffer having been
+    // sorted out of bounds in an earlier revision - confirm before re-enabling
+    // m_del(size_t, shape, ULAB_MAX_DIMS);
+    m_del(size_t, index_array, index_len);
+    m_del(size_t, rshape, ULAB_MAX_DIMS);
+    m_del(int32_t, strides, ULAB_MAX_DIMS);
+    m_del(int32_t, rstrides, ULAB_MAX_DIMS);
+
+    return MP_OBJ_FROM_PTR(result);
+}
+
+MP_DEFINE_CONST_FUN_OBJ_KW(transform_delete_obj, 2, transform_delete);
+#endif /* ULAB_NUMPY_HAS_DELETE */
+
+
 #if ULAB_MAX_DIMS > 1
 #if ULAB_NUMPY_HAS_DOT
 //| def dot(m1: ulab.numpy.ndarray, m2: ulab.numpy.ndarray) -> Union[ulab.numpy.ndarray, _float]:
@@ -39,6 +359,9 @@ mp_obj_t transform_dot(mp_obj_t _m1, mp_obj_t _m2) {
     }
     ndarray_obj_t *m1 = MP_OBJ_TO_PTR(_m1);
     ndarray_obj_t *m2 = MP_OBJ_TO_PTR(_m2);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(m1->dtype)
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(m2->dtype)
+
     uint8_t *array1 = (uint8_t *)m1->array;
     uint8_t *array2 = (uint8_t *)m2->array;
 
@@ -86,5 +409,42 @@ mp_obj_t transform_dot(mp_obj_t _m1, mp_obj_t _m2) {
 }
 
 MP_DEFINE_CONST_FUN_OBJ_2(transform_dot_obj, transform_dot);
+#endif /* ULAB_NUMPY_HAS_DOT */
+#endif /* ULAB_MAX_DIMS > 1 */
+
+#if ULAB_NUMPY_HAS_SIZE
+/*
+ * size(a, *, axis=None)
+ *
+ * Returns 1 for a scalar, the length of any other array-like object that is
+ * not an ndarray, and, for an ndarray, either the total number of elements
+ * (axis=None), or the length along the given axis.
+ * Raises TypeError, if the argument is neither a scalar, nor array-like.
+ */
+static mp_obj_t transform_size(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+        { MP_QSTR_axis, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } },
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    mp_obj_t operand = args[0].u_obj;
+    mp_obj_t axis = args[1].u_obj;
+
+    // scalars count as a single element
+    if(ulab_tools_mp_obj_is_scalar(operand)) {
+        return mp_obj_new_int(1);
+    }
+
+    if(!ndarray_object_is_array_like(operand)) {
+        mp_raise_TypeError(translate("first argument must be an ndarray"));
+    }
+
+    // array-like objects other than ndarrays simply report their length;
+    // the axis keyword is not consulted for them
+    if(!mp_obj_is_type(operand, &ulab_ndarray_type)) {
+        return mp_obj_len_maybe(operand);
+    }
+
+    // only ndarrays are left at this point
+    ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(operand);
+    if(axis == mp_const_none) {
+        return mp_obj_new_int(ndarray->len);
+    }
+
+    // the shape array is ULAB_MAX_DIMS long, with the used axes at its end
+    int8_t ax = tools_get_axis(axis, ndarray->ndim);
+    return mp_obj_new_int(ndarray->shape[ULAB_MAX_DIMS - ndarray->ndim + ax]);
+}
+MP_DEFINE_CONST_FUN_OBJ_KW(transform_size_obj, 1, transform_size);
 #endif
-#endif
\ No newline at end of file
diff --git a/python/port/mod/ulab/numpy/transform.h b/python/port/mod/ulab/numpy/transform.h
index f4a09b8ef..bfb4482cc 100644
--- a/python/port/mod/ulab/numpy/transform.h
+++ b/python/port/mod/ulab/numpy/transform.h
@@ -21,8 +21,10 @@
 
 #include "../ulab.h"
 #include "../ulab_tools.h"
-#include "transform.h"
 
+// Function objects defined in transform.c.
+// NOTE(review): each definition there is wrapped in its own ULAB_NUMPY_HAS_*
+// switch, while the declarations below are unconditional.
+MP_DECLARE_CONST_FUN_OBJ_KW(transform_compress_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(transform_delete_obj);
 MP_DECLARE_CONST_FUN_OBJ_2(transform_dot_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(transform_size_obj);
 
 #endif
diff --git a/python/port/mod/ulab/numpy/vector.c b/python/port/mod/ulab/numpy/vector.c
index ceba25598..97ab66d21 100644
--- a/python/port/mod/ulab/numpy/vector.c
+++ b/python/port/mod/ulab/numpy/vector.c
@@ -22,6 +22,7 @@
 
 #include "../ulab.h"
 #include "../ulab_tools.h"
+#include "carray/carray_tools.h"
 #include "vector.h"
 
 //| """Element-by-element functions
@@ -31,7 +32,7 @@
 //| much more efficient than expressing the same operation as a Python loop."""
 //|
 
-static mp_obj_t vectorise_generic_vector(mp_obj_t o_in, mp_float_t (*f)(mp_float_t)) {
+static mp_obj_t vector_generic_vector(mp_obj_t o_in, mp_float_t (*f)(mp_float_t)) {
     // Return a single value, if o_in is not iterable
     if(mp_obj_is_float(o_in) || mp_obj_is_int(o_in)) {
         return mp_obj_new_float(f(mp_obj_get_float(o_in)));
@@ -39,6 +40,7 @@ static mp_obj_t vectorise_generic_vector(mp_obj_t o_in, mp_float_t (*f)(mp_float
     ndarray_obj_t *ndarray = NULL;
     if(mp_obj_is_type(o_in, &ulab_ndarray_type)) {
         ndarray_obj_t *source = MP_OBJ_TO_PTR(o_in);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(source->dtype)
         uint8_t *sarray = (uint8_t *)source->array;
         ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT);
         mp_float_t *array = (mp_float_t *)ndarray->array;
@@ -99,10 +101,10 @@ static mp_obj_t vectorise_generic_vector(mp_obj_t o_in, mp_float_t (*f)(mp_float
         #endif /* ULAB_VECTORISE_USES_FUN_POINTER */
     } else {
         ndarray = ndarray_from_mp_obj(o_in, 0);
-        mp_float_t *array = (mp_float_t *)ndarray->array;
+        mp_float_t *narray = (mp_float_t *)ndarray->array;
         for(size_t i = 0; i < ndarray->len; i++) {
-            *array = f(*array);
-            array++;
+            *narray = f(*narray);
+            narray++;
         }
     }
     return MP_OBJ_FROM_PTR(ndarray);
@@ -115,7 +117,7 @@ static mp_obj_t vectorise_generic_vector(mp_obj_t o_in, mp_float_t (*f)(mp_float
 //|
 
 MATH_FUN_1(acos, acos);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_acos_obj, vectorise_acos);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_acos_obj, vector_acos);
 #endif
 
 #if ULAB_NUMPY_HAS_ACOSH
@@ -125,7 +127,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_acos_obj, vectorise_acos);
 //|
 
 MATH_FUN_1(acosh, acosh);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_acosh_obj, vectorise_acosh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_acosh_obj, vector_acosh);
 #endif
 
 #if ULAB_NUMPY_HAS_ASIN
@@ -135,7 +137,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_acosh_obj, vectorise_acosh);
 //|
 
 MATH_FUN_1(asin, asin);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_asin_obj, vectorise_asin);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_asin_obj, vector_asin);
 #endif
 
 #if ULAB_NUMPY_HAS_ASINH
@@ -145,7 +147,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_asin_obj, vectorise_asin);
 //|
 
 MATH_FUN_1(asinh, asinh);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_asinh_obj, vectorise_asinh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_asinh_obj, vector_asinh);
 #endif
 
 #if ULAB_NUMPY_HAS_AROUND
@@ -155,7 +157,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_asinh_obj, vectorise_asinh);
 //|    ...
 //|
 
-mp_obj_t vectorise_around(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+mp_obj_t vector_around(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
     static const mp_arg_t allowed_args[] = {
         { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} },
         { MP_QSTR_decimals, MP_ARG_KW_ONLY | MP_ARG_INT, {.u_int = 0 } }
@@ -169,6 +171,7 @@ mp_obj_t vectorise_around(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_
     int8_t n = args[1].u_int;
     mp_float_t mul = MICROPY_FLOAT_C_FUN(pow)(10.0, n);
     ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0].u_obj);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(source->dtype)
     ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_FLOAT);
     mp_float_t *narray = (mp_float_t *)ndarray->array;
     uint8_t *sarray = (uint8_t *)source->array;
@@ -215,7 +218,7 @@ mp_obj_t vectorise_around(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_
     return MP_OBJ_FROM_PTR(ndarray);
 }
 
-MP_DEFINE_CONST_FUN_OBJ_KW(vectorise_around_obj, 1, vectorise_around);
+MP_DEFINE_CONST_FUN_OBJ_KW(vector_around_obj, 1, vector_around);
 #endif
 
 #if ULAB_NUMPY_HAS_ATAN
@@ -226,7 +229,7 @@ MP_DEFINE_CONST_FUN_OBJ_KW(vectorise_around_obj, 1, vectorise_around);
 //|
 
 MATH_FUN_1(atan, atan);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_atan_obj, vectorise_atan);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_atan_obj, vector_atan);
 #endif
 
 #if ULAB_NUMPY_HAS_ARCTAN2
@@ -236,9 +239,12 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_atan_obj, vectorise_atan);
 //|    ...
 //|
 
-mp_obj_t vectorise_arctan2(mp_obj_t y, mp_obj_t x) {
+mp_obj_t vector_arctan2(mp_obj_t y, mp_obj_t x) {
     ndarray_obj_t *ndarray_x = ndarray_from_mp_obj(x, 0);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray_x->dtype)
+
     ndarray_obj_t *ndarray_y = ndarray_from_mp_obj(y, 0);
+    COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray_y->dtype)
 
     uint8_t ndim = 0;
     size_t *shape = m_new(size_t, ULAB_MAX_DIMS);
@@ -309,7 +315,7 @@ mp_obj_t vectorise_arctan2(mp_obj_t y, mp_obj_t x) {
     return MP_OBJ_FROM_PTR(results);
 }
 
-MP_DEFINE_CONST_FUN_OBJ_2(vectorise_arctan2_obj, vectorise_arctan2);
+MP_DEFINE_CONST_FUN_OBJ_2(vector_arctan2_obj, vector_arctan2);
 #endif /* ULAB_VECTORISE_HAS_ARCTAN2 */
 
 #if ULAB_NUMPY_HAS_ATANH
@@ -319,7 +325,7 @@ MP_DEFINE_CONST_FUN_OBJ_2(vectorise_arctan2_obj, vectorise_arctan2);
 //|
 
 MATH_FUN_1(atanh, atanh);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_atanh_obj, vectorise_atanh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_atanh_obj, vector_atanh);
 #endif
 
 #if ULAB_NUMPY_HAS_CEIL
@@ -329,7 +335,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_atanh_obj, vectorise_atanh);
 //|
 
 MATH_FUN_1(ceil, ceil);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_ceil_obj, vectorise_ceil);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_ceil_obj, vector_ceil);
 #endif
 
 #if ULAB_NUMPY_HAS_COS
@@ -339,7 +345,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_ceil_obj, vectorise_ceil);
 //|
 
 MATH_FUN_1(cos, cos);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_cos_obj, vectorise_cos);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_cos_obj, vector_cos);
 #endif
 
 #if ULAB_NUMPY_HAS_COSH
@@ -349,7 +355,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_cos_obj, vectorise_cos);
 //|
 
 MATH_FUN_1(cosh, cosh);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_cosh_obj, vectorise_cosh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_cosh_obj, vector_cosh);
 #endif
 
 #if ULAB_NUMPY_HAS_DEGREES
@@ -358,15 +364,15 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_cosh_obj, vectorise_cosh);
 //|    ...
 //|
 
-static mp_float_t vectorise_degrees_(mp_float_t value) {
+static mp_float_t vector_degrees_(mp_float_t value) {
     return value * MICROPY_FLOAT_CONST(180.0) / MP_PI;
 }
 
-static mp_obj_t vectorise_degrees(mp_obj_t x_obj) {
-    return vectorise_generic_vector(x_obj, vectorise_degrees_);
+static mp_obj_t vector_degrees(mp_obj_t x_obj) {
+    return vector_generic_vector(x_obj, vector_degrees_);
 }
 
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_degrees_obj, vectorise_degrees);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_degrees_obj, vector_degrees);
 #endif
 
 #if ULAB_SCIPY_SPECIAL_HAS_ERF
@@ -376,7 +382,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_degrees_obj, vectorise_degrees);
 //|
 
 MATH_FUN_1(erf, erf);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_erf_obj, vectorise_erf);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_erf_obj, vector_erf);
 #endif
 
 #if ULAB_SCIPY_SPECIAL_HAS_ERFC
@@ -386,7 +392,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_erf_obj, vectorise_erf);
 //|
 
 MATH_FUN_1(erfc, erfc);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_erfc_obj, vectorise_erfc);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_erfc_obj, vector_erfc);
 #endif
 
 #if ULAB_NUMPY_HAS_EXP
@@ -395,8 +401,69 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_erfc_obj, vectorise_erfc);
 //|    ...
 //|
 
-MATH_FUN_1(exp, exp);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_exp_obj, vectorise_exp);
+// exp() with optional complex support. If ULAB_SUPPORTS_COMPLEX is enabled,
+// complex scalars and ndarrays of dtype NDARRAY_COMPLEX are evaluated here as
+// exp(a + ib) = exp(a) * (cos(b) + i * sin(b)); every other input is handed
+// over to vector_generic_vector with the C exp function.
+static mp_obj_t vector_exp(mp_obj_t o_in) {
+    #if ULAB_SUPPORTS_COMPLEX
+    if(mp_obj_is_type(o_in, &mp_type_complex)) {
+        mp_float_t real, imag;
+        mp_obj_get_complex(o_in, &real, &imag);
+        mp_float_t exp_real = MICROPY_FLOAT_C_FUN(exp)(real);
+        return mp_obj_new_complex(exp_real * MICROPY_FLOAT_C_FUN(cos)(imag), exp_real * MICROPY_FLOAT_C_FUN(sin)(imag));
+    } else if(mp_obj_is_type(o_in, &ulab_ndarray_type)) {
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(o_in);
+        if(source->dtype == NDARRAY_COMPLEX) {
+            uint8_t *sarray = (uint8_t *)source->array;
+            ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_COMPLEX);
+            mp_float_t *array = (mp_float_t *)ndarray->array;
+            // size of one float: the imaginary part is read this many bytes
+            // after the real part
+            uint8_t itemsize = sizeof(mp_float_t);
+
+            // the source is walked by its strides, while the output is written
+            // sequentially, as (real, imaginary) pairs
+            #if ULAB_MAX_DIMS > 3
+            size_t i = 0;
+            do {
+            #endif
+                #if ULAB_MAX_DIMS > 2
+                size_t j = 0;
+                do {
+                #endif
+                    #if ULAB_MAX_DIMS > 1
+                    size_t k = 0;
+                    do {
+                    #endif
+                        size_t l = 0;
+                        do {
+                            mp_float_t real = *(mp_float_t *)sarray;
+                            mp_float_t imag = *(mp_float_t *)(sarray + itemsize);
+                            mp_float_t exp_real = MICROPY_FLOAT_C_FUN(exp)(real);
+                            *array++ = exp_real * MICROPY_FLOAT_C_FUN(cos)(imag);
+                            *array++ = exp_real * MICROPY_FLOAT_C_FUN(sin)(imag);
+                            sarray += source->strides[ULAB_MAX_DIMS - 1];
+                            l++;
+                        } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+                    #if ULAB_MAX_DIMS > 1
+                        // rewind the innermost axis, then step along the next one
+                        sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                        sarray += source->strides[ULAB_MAX_DIMS - 2];
+                        k++;
+                    } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+                    #endif /* ULAB_MAX_DIMS > 1 */
+                #if ULAB_MAX_DIMS > 2
+                    sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                    sarray += source->strides[ULAB_MAX_DIMS - 3];
+                    j++;
+                } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+                #endif /* ULAB_MAX_DIMS > 2 */
+            #if ULAB_MAX_DIMS > 3
+                sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+                sarray += source->strides[ULAB_MAX_DIMS - 4];
+                i++;
+            } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+            #endif /* ULAB_MAX_DIMS > 3 */
+            return MP_OBJ_FROM_PTR(ndarray);
+        }
+    }
+    #endif
+    // real-valued input (or complex support compiled out)
+    return vector_generic_vector(o_in, MICROPY_FLOAT_C_FUN(exp));
+}
+
+MP_DEFINE_CONST_FUN_OBJ_1(vector_exp_obj, vector_exp);
 #endif
 
 #if ULAB_NUMPY_HAS_EXPM1
@@ -406,7 +473,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_exp_obj, vectorise_exp);
 //|
 
 MATH_FUN_1(expm1, expm1);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_expm1_obj, vectorise_expm1);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_expm1_obj, vector_expm1);
 #endif
 
 #if ULAB_NUMPY_HAS_FLOOR
@@ -416,7 +483,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_expm1_obj, vectorise_expm1);
 //|
 
 MATH_FUN_1(floor, floor);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_floor_obj, vectorise_floor);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_floor_obj, vector_floor);
 #endif
 
 #if ULAB_SCIPY_SPECIAL_HAS_GAMMA
@@ -426,7 +493,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_floor_obj, vectorise_floor);
 //|
 
 MATH_FUN_1(gamma, tgamma);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_gamma_obj, vectorise_gamma);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_gamma_obj, vector_gamma);
 #endif
 
 #if ULAB_SCIPY_SPECIAL_HAS_GAMMALN
@@ -436,7 +503,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_gamma_obj, vectorise_gamma);
 //|
 
 MATH_FUN_1(lgamma, lgamma);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_lgamma_obj, vectorise_lgamma);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_lgamma_obj, vector_lgamma);
 #endif
 
 #if ULAB_NUMPY_HAS_LOG
@@ -446,7 +513,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_lgamma_obj, vectorise_lgamma);
 //|
 
 MATH_FUN_1(log, log);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_log_obj, vectorise_log);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_log_obj, vector_log);
 #endif
 
 #if ULAB_NUMPY_HAS_LOG10
@@ -456,7 +523,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_log_obj, vectorise_log);
 //|
 
 MATH_FUN_1(log10, log10);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_log10_obj, vectorise_log10);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_log10_obj, vector_log10);
 #endif
 
 #if ULAB_NUMPY_HAS_LOG2
@@ -466,7 +533,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_log10_obj, vectorise_log10);
 //|
 
 MATH_FUN_1(log2, log2);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_log2_obj, vectorise_log2);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_log2_obj, vector_log2);
 #endif
 
 #if ULAB_NUMPY_HAS_RADIANS
@@ -475,15 +542,15 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_log2_obj, vectorise_log2);
 //|    ...
 //|
 
-static mp_float_t vectorise_radians_(mp_float_t value) {
+static mp_float_t vector_radians_(mp_float_t value) {
     return value * MP_PI / MICROPY_FLOAT_CONST(180.0);
 }
 
-static mp_obj_t vectorise_radians(mp_obj_t x_obj) {
-    return vectorise_generic_vector(x_obj, vectorise_radians_);
+static mp_obj_t vector_radians(mp_obj_t x_obj) {
+    return vector_generic_vector(x_obj, vector_radians_);
 }
 
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_radians_obj, vectorise_radians);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_radians_obj, vector_radians);
 #endif
 
 #if ULAB_NUMPY_HAS_SIN
@@ -493,7 +560,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_radians_obj, vectorise_radians);
 //|
 
 MATH_FUN_1(sin, sin);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_sin_obj, vectorise_sin);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_sin_obj, vector_sin);
 #endif
 
 #if ULAB_NUMPY_HAS_SINH
@@ -503,18 +570,158 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_sin_obj, vectorise_sin);
 //|
 
 MATH_FUN_1(sinh, sinh);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_sinh_obj, vectorise_sinh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_sinh_obj, vector_sinh);
 #endif
 
+
 #if ULAB_NUMPY_HAS_SQRT
 //| def sqrt(a: _ArrayLike) -> ulab.numpy.ndarray:
 //|    """Computes the square root"""
 //|    ...
 //|
 
+#if ULAB_SUPPORTS_COMPLEX
+mp_obj_t vector_sqrt(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) { // square root of a scalar or an ndarray; the dtype keyword selects a float or complex result
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, { .u_rom_obj = mp_const_none } }, // the value(s) to take the root of
+        { MP_QSTR_dtype, MP_ARG_KW_ONLY | MP_ARG_OBJ, { .u_rom_obj = MP_ROM_INT(NDARRAY_FLOAT) } }, // keyword-only; defaults to NDARRAY_FLOAT
+    };
+
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args, pos_args, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    mp_obj_t o_in = args[0].u_obj;
+    uint8_t dtype = mp_obj_get_int(args[1].u_obj); // requested output dtype code
+    if((dtype != NDARRAY_FLOAT) && (dtype != NDARRAY_COMPLEX)) { // any other output dtype is rejected
+        mp_raise_TypeError(translate("dtype must be float, or complex"));
+    }
+
+    if(mp_obj_is_type(o_in, &mp_type_complex)) { // complex scalar: compute the root in polar form
+        mp_float_t real, imag;
+        mp_obj_get_complex(o_in, &real, &imag);
+        mp_float_t sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(real * real + imag * imag); // |z|
+        sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(sqrt_abs); // sqrt(|z|), the modulus of the result
+        mp_float_t theta = MICROPY_FLOAT_CONST(0.5) * MICROPY_FLOAT_C_FUN(atan2)(imag, real); // half of the argument of z
+        return mp_obj_new_complex(sqrt_abs * MICROPY_FLOAT_C_FUN(cos)(theta), sqrt_abs * MICROPY_FLOAT_C_FUN(sin)(theta));
+    } else if(mp_obj_is_type(o_in, &ulab_ndarray_type)) {
+        ndarray_obj_t *source = MP_OBJ_TO_PTR(o_in);
+        if((source->dtype == NDARRAY_COMPLEX) && (dtype == NDARRAY_FLOAT)) { // a complex input cannot produce a float output
+            mp_raise_TypeError(translate("can't convert complex to float"));
+        }
+
+        if(dtype == NDARRAY_COMPLEX) { // complex output requested; float output falls through to the generic path at the end
+            if(source->dtype == NDARRAY_COMPLEX) { // complex input -> complex output
+                uint8_t *sarray = (uint8_t *)source->array; // byte pointer, advanced by the source strides below
+                ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_COMPLEX);
+                mp_float_t *array = (mp_float_t *)ndarray->array; // output written as consecutive (real, imag) pairs
+                uint8_t itemsize = sizeof(mp_float_t); // byte offset from the real part to the imaginary part of a source element
+
+                #if ULAB_MAX_DIMS > 3
+                size_t i = 0;
+                do {
+                #endif
+                    #if ULAB_MAX_DIMS > 2
+                    size_t j = 0;
+                    do {
+                    #endif
+                        #if ULAB_MAX_DIMS > 1
+                        size_t k = 0;
+                        do {
+                        #endif
+                            size_t l = 0;
+                            do {
+                                mp_float_t real = *(mp_float_t *)sarray;
+                                mp_float_t imag = *(mp_float_t *)(sarray + itemsize);
+                                mp_float_t sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(real * real + imag * imag); // same polar-form computation as the scalar case
+                                sqrt_abs = MICROPY_FLOAT_C_FUN(sqrt)(sqrt_abs);
+                                mp_float_t theta = MICROPY_FLOAT_CONST(0.5) * MICROPY_FLOAT_C_FUN(atan2)(imag, real);
+                                *array++ = sqrt_abs * MICROPY_FLOAT_C_FUN(cos)(theta); // real part of the result
+                                *array++ = sqrt_abs * MICROPY_FLOAT_C_FUN(sin)(theta); // imaginary part of the result
+                                sarray += source->strides[ULAB_MAX_DIMS - 1]; // next element along the last axis
+                                l++;
+                            } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+                        #if ULAB_MAX_DIMS > 1
+                            sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1]; // rewind the last axis ...
+                            sarray += source->strides[ULAB_MAX_DIMS - 2]; // ... and step once along the next outer axis
+                            k++;
+                        } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+                        #endif /* ULAB_MAX_DIMS > 1 */
+                    #if ULAB_MAX_DIMS > 2
+                        sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                        sarray += source->strides[ULAB_MAX_DIMS - 3];
+                        j++;
+                    } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+                    #endif /* ULAB_MAX_DIMS > 2 */
+                #if ULAB_MAX_DIMS > 3
+                    sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+                    sarray += source->strides[ULAB_MAX_DIMS - 4];
+                    i++;
+                } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+                #endif /* ULAB_MAX_DIMS > 3 */
+                return MP_OBJ_FROM_PTR(ndarray);
+            } else if(source->dtype == NDARRAY_FLOAT) { // float input -> complex output
+                uint8_t *sarray = (uint8_t *)source->array;
+                ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, NDARRAY_COMPLEX);
+                mp_float_t *array = (mp_float_t *)ndarray->array; // NOTE(review): skipped slots below keep their initial value - presumably zero; confirm ndarray_new_dense_ndarray zero-fills
+
+                #if ULAB_MAX_DIMS > 3
+                size_t i = 0;
+                do {
+                #endif
+                    #if ULAB_MAX_DIMS > 2
+                    size_t j = 0;
+                    do {
+                    #endif
+                        #if ULAB_MAX_DIMS > 1
+                        size_t k = 0;
+                        do {
+                        #endif
+                            size_t l = 0;
+                            do {
+                                mp_float_t value = *(mp_float_t *)sarray;
+                                if(value >= MICROPY_FLOAT_CONST(0.0)) {
+                                    *array++ = MICROPY_FLOAT_C_FUN(sqrt)(value); // non-negative input: the root goes into the real slot
+                                    array++; // imaginary slot is skipped, not written
+                                } else {
+                                    array++; // real slot is skipped, not written
+                                    *array++ = MICROPY_FLOAT_C_FUN(sqrt)(-value); // negative input: sqrt(-value) goes into the imaginary slot
+                                }
+                                sarray += source->strides[ULAB_MAX_DIMS - 1];
+                                l++;
+                            } while(l < source->shape[ULAB_MAX_DIMS - 1]);
+                        #if ULAB_MAX_DIMS > 1
+                            sarray -= source->strides[ULAB_MAX_DIMS - 1] * source->shape[ULAB_MAX_DIMS-1];
+                            sarray += source->strides[ULAB_MAX_DIMS - 2];
+                            k++;
+                        } while(k < source->shape[ULAB_MAX_DIMS - 2]);
+                        #endif /* ULAB_MAX_DIMS > 1 */
+                    #if ULAB_MAX_DIMS > 2
+                        sarray -= source->strides[ULAB_MAX_DIMS - 2] * source->shape[ULAB_MAX_DIMS-2];
+                        sarray += source->strides[ULAB_MAX_DIMS - 3];
+                        j++;
+                    } while(j < source->shape[ULAB_MAX_DIMS - 3]);
+                    #endif /* ULAB_MAX_DIMS > 2 */
+                #if ULAB_MAX_DIMS > 3
+                    sarray -= source->strides[ULAB_MAX_DIMS - 3] * source->shape[ULAB_MAX_DIMS-3];
+                    sarray += source->strides[ULAB_MAX_DIMS - 4];
+                    i++;
+                } while(i < source->shape[ULAB_MAX_DIMS - 4]);
+                #endif /* ULAB_MAX_DIMS > 3 */
+                return MP_OBJ_FROM_PTR(ndarray);
+            } else { // complex output is only produced from float or complex ndarrays
+                mp_raise_TypeError(translate("input dtype must be float or complex"));
+            }
+        }
+    }
+    return vector_generic_vector(o_in, MICROPY_FLOAT_C_FUN(sqrt)); // reached for dtype=float ndarrays and for inputs that are neither complex scalars nor ndarrays
+}
+MP_DEFINE_CONST_FUN_OBJ_KW(vector_sqrt_obj, 1, vector_sqrt);
+#else
 MATH_FUN_1(sqrt, sqrt);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_sqrt_obj, vectorise_sqrt);
-#endif
+MP_DEFINE_CONST_FUN_OBJ_1(vector_sqrt_obj, vector_sqrt);
+#endif /* ULAB_SUPPORTS_COMPLEX */
+
+#endif /* ULAB_NUMPY_HAS_SQRT */
 
 #if ULAB_NUMPY_HAS_TAN
 //| def tan(a: _ArrayLike) -> ulab.numpy.ndarray:
@@ -523,7 +730,7 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_sqrt_obj, vectorise_sqrt);
 //|
 
 MATH_FUN_1(tan, tan);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_tan_obj, vectorise_tan);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_tan_obj, vector_tan);
 #endif
 
 #if ULAB_NUMPY_HAS_TANH
@@ -532,11 +739,11 @@ MP_DEFINE_CONST_FUN_OBJ_1(vectorise_tan_obj, vectorise_tan);
 //|    ...
 
 MATH_FUN_1(tanh, tanh);
-MP_DEFINE_CONST_FUN_OBJ_1(vectorise_tanh_obj, vectorise_tanh);
+MP_DEFINE_CONST_FUN_OBJ_1(vector_tanh_obj, vector_tanh);
 #endif
 
 #if ULAB_NUMPY_HAS_VECTORIZE
-static mp_obj_t vectorise_vectorized_function_call(mp_obj_t self_in, size_t n_args, size_t n_kw, const mp_obj_t *args) {
+static mp_obj_t vector_vectorized_function_call(mp_obj_t self_in, size_t n_args, size_t n_kw, const mp_obj_t *args) {
     (void) n_args;
     (void) n_kw;
     vectorized_function_obj_t *self = MP_OBJ_TO_PTR(self_in);
@@ -544,6 +751,7 @@ static mp_obj_t vectorise_vectorized_function_call(mp_obj_t self_in, size_t n_ar
     mp_obj_t fvalue;
     if(mp_obj_is_type(args[0], &ulab_ndarray_type)) {
         ndarray_obj_t *source = MP_OBJ_TO_PTR(args[0]);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(source->dtype)
         ndarray_obj_t *ndarray = ndarray_new_dense_ndarray(source->ndim, source->shape, self->otypes);
         for(size_t i=0; i < source->len; i++) {
             avalue[0] = mp_binary_get_val_array(source->dtype, source->array, i);
@@ -575,12 +783,12 @@ static mp_obj_t vectorise_vectorized_function_call(mp_obj_t self_in, size_t n_ar
     return mp_const_none;
 }
 
-const mp_obj_type_t vectorise_function_type = {
+const mp_obj_type_t vector_function_type = {
     { &mp_type_type },
     .flags = MP_TYPE_FLAG_EXTENDED,
     .name = MP_QSTR_,
     MP_TYPE_EXTENDED_FIELDS(
-    .call = vectorise_vectorized_function_call,
+    .call = vector_vectorized_function_call,
     )
 };
 
@@ -598,7 +806,7 @@ const mp_obj_type_t vectorise_function_type = {
 //|    ...
 //|
 
-static mp_obj_t vectorise_vectorize(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+static mp_obj_t vector_vectorize(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
     static const mp_arg_t allowed_args[] = {
         { MP_QSTR_, MP_ARG_REQUIRED | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} },
         { MP_QSTR_otypes, MP_ARG_KW_ONLY | MP_ARG_OBJ, {.u_rom_obj = mp_const_none} }
@@ -625,12 +833,12 @@ static mp_obj_t vectorise_vectorize(size_t n_args, const mp_obj_t *pos_args, mp_
         mp_raise_ValueError(translate("wrong output type"));
     }
     vectorized_function_obj_t *function = m_new_obj(vectorized_function_obj_t);
-    function->base.type = &vectorise_function_type;
+    function->base.type = &vector_function_type;
     function->otypes = otypes;
     function->fun = args[0].u_obj;
     function->type = type;
     return MP_OBJ_FROM_PTR(function);
 }
 
-MP_DEFINE_CONST_FUN_OBJ_KW(vectorise_vectorize_obj, 1, vectorise_vectorize);
+MP_DEFINE_CONST_FUN_OBJ_KW(vector_vectorize_obj, 1, vector_vectorize);
 #endif
diff --git a/python/port/mod/ulab/numpy/vector.h b/python/port/mod/ulab/numpy/vector.h
index dbd0b33ea..ea38b0fdc 100644
--- a/python/port/mod/ulab/numpy/vector.h
+++ b/python/port/mod/ulab/numpy/vector.h
@@ -15,35 +15,39 @@
 #include "../ulab.h"
 #include "../ndarray.h"
 
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_acos_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_acosh_obj);
-MP_DECLARE_CONST_FUN_OBJ_2(vectorise_arctan2_obj);
-MP_DECLARE_CONST_FUN_OBJ_KW(vectorise_around_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_asin_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_asinh_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_atan_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_atanh_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_ceil_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_cos_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_cosh_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_degrees_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_erf_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_erfc_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_exp_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_expm1_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_floor_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_gamma_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_lgamma_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_log_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_log10_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_log2_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_radians_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_sin_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_sinh_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_sqrt_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_tan_obj);
-MP_DECLARE_CONST_FUN_OBJ_1(vectorise_tanh_obj);
-MP_DECLARE_CONST_FUN_OBJ_KW(vectorise_vectorize_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_acos_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_acosh_obj);
+MP_DECLARE_CONST_FUN_OBJ_2(vector_arctan2_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(vector_around_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_asin_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_asinh_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_atan_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_atanh_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_ceil_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_cos_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_cosh_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_degrees_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_erf_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_erfc_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_exp_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_expm1_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_floor_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_gamma_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_lgamma_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_log_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_log10_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_log2_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_radians_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_sin_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_sinh_obj);
+#if ULAB_SUPPORTS_COMPLEX
+MP_DECLARE_CONST_FUN_OBJ_KW(vector_sqrt_obj);
+#else
+MP_DECLARE_CONST_FUN_OBJ_1(vector_sqrt_obj);
+#endif
+MP_DECLARE_CONST_FUN_OBJ_1(vector_tan_obj);
+MP_DECLARE_CONST_FUN_OBJ_1(vector_tanh_obj);
+MP_DECLARE_CONST_FUN_OBJ_KW(vector_vectorize_obj);
 
 typedef struct _vectorized_function_obj_t {
     mp_obj_base_t base;
@@ -53,12 +57,13 @@ typedef struct _vectorized_function_obj_t {
 } vectorized_function_obj_t;
 
 #if ULAB_HAS_FUNCTION_ITERATOR
-#define ITERATE_VECTOR(type, array, source, sarray)\
+#define ITERATE_VECTOR(type, array, source, sarray, shift)\
 ({\
     size_t *scoords = ndarray_new_coords((source)->ndim);\
     for(size_t i=0; i < (source)->len/(source)->shape[ULAB_MAX_DIMS -1]; i++) {\
         for(size_t l=0; l < (source)->shape[ULAB_MAX_DIMS - 1]; l++) {\
-            *(array)++ = f(*((type *)(sarray)));\
+            *(array) = f(*((type *)(sarray)));\
+            (array) += (shift);\
             (sarray) += (source)->strides[ULAB_MAX_DIMS - 1];\
         }\
         ndarray_rewind_array((source)->ndim, sarray, (source)->shape, (source)->strides, scoords);\
@@ -149,8 +154,8 @@ typedef struct _vectorized_function_obj_t {
 #endif /* ULAB_HAS_FUNCTION_ITERATOR */
 
 #define MATH_FUN_1(py_name, c_name) \
-    static mp_obj_t vectorise_ ## py_name(mp_obj_t x_obj) { \
-        return vectorise_generic_vector(x_obj, MICROPY_FLOAT_C_FUN(c_name)); \
+    static mp_obj_t vector_ ## py_name(mp_obj_t x_obj) { \
+        return vector_generic_vector(x_obj, MICROPY_FLOAT_C_FUN(c_name)); \
 }
 
 #endif /* _VECTOR_ */
diff --git a/python/port/mod/ulab/scipy/optimize/optimize.c b/python/port/mod/ulab/scipy/optimize/optimize.c
index 30b61283d..c2ed6fff8 100644
--- a/python/port/mod/ulab/scipy/optimize/optimize.c
+++ b/python/port/mod/ulab/scipy/optimize/optimize.c
@@ -121,7 +121,7 @@ MP_DEFINE_CONST_FUN_OBJ_KW(optimize_bisect_obj, 3, optimize_bisect);
 //|     Find a minimum of the function ``f(x)`` using the downhill simplex method.
 //|     The located ``x`` is within ``fxtol`` of the actual minimum, and ``f(x)``
 //|     is within ``fatol`` of the actual minimum unless more than ``maxiter``
-//|     steps are required."""
+//|     steps are required."""
 //|     ...
 //|
 
@@ -344,7 +344,7 @@ MP_DEFINE_CONST_FUN_OBJ_KW(optimize_curve_fit_obj, 2, optimize_curve_fit);
 //|
 //|     Find a solution (zero) of the function ``f(x)`` using Newton's Method.
 //|     The result is accurate to within ``xtol * rtol * |f(x)|`` unless more than
-//|     ``maxiter`` steps are required."""
+//|     ``maxiter`` steps are required."""
 //|     ...
 //|
 
diff --git a/python/port/mod/ulab/scipy/scipy.c b/python/port/mod/ulab/scipy/scipy.c
index c37aa4ee8..a7683e2d3 100644
--- a/python/port/mod/ulab/scipy/scipy.c
+++ b/python/port/mod/ulab/scipy/scipy.c
@@ -48,4 +48,4 @@ mp_obj_module_t ulab_scipy_module = {
     .base = { &mp_type_module },
     .globals = (mp_obj_dict_t*)&mp_module_ulab_scipy_globals,
 };
-#endif
+#endif /* ULAB_HAS_SCIPY */
diff --git a/python/port/mod/ulab/scipy/signal/signal.c b/python/port/mod/ulab/scipy/signal/signal.c
index cc559b598..60dbad073 100644
--- a/python/port/mod/ulab/scipy/signal/signal.c
+++ b/python/port/mod/ulab/scipy/signal/signal.c
@@ -18,32 +18,9 @@
 
 #include "../../ulab.h"
 #include "../../ndarray.h"
-#include "../../numpy/fft/fft_tools.h"
+#include "../../numpy/carray/carray_tools.h"
 
-#if ULAB_SCIPY_SIGNAL_HAS_SPECTROGRAM
-//| import ulab.numpy
-//|
-//| def spectrogram(r: ulab.numpy.ndarray) -> ulab.numpy.ndarray:
-//|     """
-//|     :param ulab.numpy.ndarray r: A 1-dimension array of values whose size is a power of 2
-//|
-//|     Computes the spectrum of the input signal.  This is the absolute value of the (complex-valued) fft of the signal.
-//|     This function is similar to scipy's ``scipy.signal.spectrogram``."""
-//|     ...
-//|
-
-mp_obj_t signal_spectrogram(size_t n_args, const mp_obj_t *args) {
-    if(n_args == 2) {
-        return fft_fft_ifft_spectrogram(n_args, args[0], args[1], FFT_SPECTROGRAM);
-    } else {
-        return fft_fft_ifft_spectrogram(n_args, args[0], mp_const_none, FFT_SPECTROGRAM);
-    }
-}
-
-MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(signal_spectrogram_obj, 1, 2, signal_spectrogram);
-#endif /* ULAB_SCIPY_SIGNAL_HAS_SPECTROGRAM */
-
-#if ULAB_SCIPY_SIGNAL_HAS_SOSFILT
+#if ULAB_SCIPY_SIGNAL_HAS_SOSFILT & ULAB_MAX_DIMS > 1
 static void signal_sosfilt_array(mp_float_t *x, const mp_float_t *coeffs, mp_float_t *zf, const size_t len) {
     for(size_t i=0; i < len; i++) {
         mp_float_t xn = *x;
@@ -68,6 +45,12 @@ mp_obj_t signal_sosfilt(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
     if(!ndarray_object_is_array_like(args[0].u_obj) || !ndarray_object_is_array_like(args[1].u_obj)) {
         mp_raise_TypeError(translate("sosfilt requires iterable arguments"));
     }
+    #if ULAB_SUPPORTS_COMPLEX
+    if(mp_obj_is_type(args[1].u_obj, &ulab_ndarray_type)) {
+        ndarray_obj_t *ndarray = MP_OBJ_TO_PTR(args[1].u_obj);
+        COMPLEX_DTYPE_NOT_IMPLEMENTED(ndarray->dtype)
+    }
+    #endif
     size_t lenx = (size_t)mp_obj_get_int(mp_obj_len_maybe(args[1].u_obj));
     ndarray_obj_t *y = ndarray_new_linear_array(lenx, NDARRAY_FLOAT);
     mp_float_t *yarray = (mp_float_t *)y->array;
@@ -102,7 +85,7 @@ mp_obj_t signal_sosfilt(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_ar
             mp_raise_TypeError(translate("zi must be an ndarray"));
         } else {
             ndarray_obj_t *zi = MP_OBJ_TO_PTR(args[2].u_obj);
-            if((zi->shape[ULAB_MAX_DIMS - 1] != lensos) || (zi->shape[ULAB_MAX_DIMS - 1] != 2)) {
+            if((zi->shape[ULAB_MAX_DIMS - 2] != lensos) || (zi->shape[ULAB_MAX_DIMS - 1] != 2)) {
                 mp_raise_ValueError(translate("zi must be of shape (n_section, 2)"));
             }
             if(zi->dtype != NDARRAY_FLOAT) {
@@ -139,10 +122,7 @@ MP_DEFINE_CONST_FUN_OBJ_KW(signal_sosfilt_obj, 2, signal_sosfilt);
 
 static const mp_rom_map_elem_t ulab_scipy_signal_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_signal) },
-    #if ULAB_SCIPY_SIGNAL_HAS_SPECTROGRAM
-        { MP_OBJ_NEW_QSTR(MP_QSTR_spectrogram), (mp_obj_t)&signal_spectrogram_obj },
-    #endif
-    #if ULAB_SCIPY_SIGNAL_HAS_SOSFILT
+    #if ULAB_SCIPY_SIGNAL_HAS_SOSFILT & ULAB_MAX_DIMS > 1
         { MP_OBJ_NEW_QSTR(MP_QSTR_sosfilt), (mp_obj_t)&signal_sosfilt_obj },
     #endif
 };
diff --git a/python/port/mod/ulab/scipy/signal/signal.h b/python/port/mod/ulab/scipy/signal/signal.h
index d33220e62..3c2343a82 100644
--- a/python/port/mod/ulab/scipy/signal/signal.h
+++ b/python/port/mod/ulab/scipy/signal/signal.h
@@ -18,7 +18,6 @@
 
 extern mp_obj_module_t ulab_scipy_signal_module;
 
-MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(signal_spectrogram_obj);
 MP_DECLARE_CONST_FUN_OBJ_KW(signal_sosfilt_obj);
 
 #endif /* _SCIPY_SIGNAL_ */
diff --git a/python/port/mod/ulab/scipy/special/special.c b/python/port/mod/ulab/scipy/special/special.c
index 82b53247d..79d9b77f3 100644
--- a/python/port/mod/ulab/scipy/special/special.c
+++ b/python/port/mod/ulab/scipy/special/special.c
@@ -21,16 +21,16 @@
 static const mp_rom_map_elem_t ulab_scipy_special_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_special) },
     #if ULAB_SCIPY_SPECIAL_HAS_ERF
-		{ MP_OBJ_NEW_QSTR(MP_QSTR_erf), (mp_obj_t)&vectorise_erf_obj },
+		{ MP_OBJ_NEW_QSTR(MP_QSTR_erf), (mp_obj_t)&vector_erf_obj },
     #endif
 	#if ULAB_SCIPY_SPECIAL_HAS_ERFC
-		{ MP_OBJ_NEW_QSTR(MP_QSTR_erfc), (mp_obj_t)&vectorise_erfc_obj },
+		{ MP_OBJ_NEW_QSTR(MP_QSTR_erfc), (mp_obj_t)&vector_erfc_obj },
 	#endif
 	#if ULAB_SCIPY_SPECIAL_HAS_GAMMA
-		{ MP_OBJ_NEW_QSTR(MP_QSTR_gamma), (mp_obj_t)&vectorise_gamma_obj },
+		{ MP_OBJ_NEW_QSTR(MP_QSTR_gamma), (mp_obj_t)&vector_gamma_obj },
 	#endif
 	#if ULAB_SCIPY_SPECIAL_HAS_GAMMALN
-		{ MP_OBJ_NEW_QSTR(MP_QSTR_gammaln), (mp_obj_t)&vectorise_lgamma_obj },
+		{ MP_OBJ_NEW_QSTR(MP_QSTR_gammaln), (mp_obj_t)&vector_lgamma_obj },
 	#endif
 };
 
diff --git a/python/port/mod/ulab/ulab.c b/python/port/mod/ulab/ulab.c
index 2b9ebd73f..9785a8dd2 100644
--- a/python/port/mod/ulab/ulab.c
+++ b/python/port/mod/ulab/ulab.c
@@ -20,9 +20,9 @@
 #include "py/objarray.h"
 
 #include "ulab.h"
-#include "ulab_create.h"
 #include "ndarray.h"
 #include "ndarray_properties.h"
+#include "numpy/create.h"
 #include "numpy/ndarray/ndarray_iter.h"
 
 #include "numpy/numpy.h"
@@ -33,13 +33,21 @@
 #include "user/user.h"
 #include "utils/utils.h"
 
-#define ULAB_VERSION 3.3.8
+#define ULAB_VERSION 5.0.7
 #define xstr(s) str(s)
 #define str(s) #s
+
+#if ULAB_SUPPORTS_COMPLEX
+#define ULAB_VERSION_STRING xstr(ULAB_VERSION) xstr(-) xstr(ULAB_MAX_DIMS) xstr(D-c)
+#else
 #define ULAB_VERSION_STRING xstr(ULAB_VERSION) xstr(-) xstr(ULAB_MAX_DIMS) xstr(D)
+#endif
 
 STATIC MP_DEFINE_STR_OBJ(ulab_version_obj, ULAB_VERSION_STRING);
 
+#ifdef ULAB_HASH
+STATIC MP_DEFINE_STR_OBJ(ulab_sha_obj, xstr(ULAB_HASH));
+#endif
 
 STATIC const mp_rom_map_elem_t ulab_ndarray_locals_dict_table[] = {
     #if ULAB_MAX_DIMS > 1
@@ -62,6 +70,9 @@ STATIC const mp_rom_map_elem_t ulab_ndarray_locals_dict_table[] = {
     #if NDARRAY_HAS_TOBYTES
         { MP_ROM_QSTR(MP_QSTR_tobytes), MP_ROM_PTR(&ndarray_tobytes_obj) },
     #endif
+    #if NDARRAY_HAS_TOLIST
+        { MP_ROM_QSTR(MP_QSTR_tolist), MP_ROM_PTR(&ndarray_tolist_obj) },
+    #endif
     #if NDARRAY_HAS_SORT
         { MP_ROM_QSTR(MP_QSTR_sort), MP_ROM_PTR(&numerical_sort_inplace_obj) },
     #endif
@@ -141,6 +152,9 @@ const mp_obj_type_t ndarray_flatiter_type = {
 STATIC const mp_map_elem_t ulab_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_ulab) },
     { MP_ROM_QSTR(MP_QSTR___version__), MP_ROM_PTR(&ulab_version_obj) },
+    #ifdef ULAB_HASH
+    { MP_ROM_QSTR(MP_QSTR___sha__), MP_ROM_PTR(&ulab_sha_obj) },
+    #endif
     #if ULAB_HAS_DTYPE_OBJECT
         { MP_OBJ_NEW_QSTR(MP_QSTR_dtype), (mp_obj_t)&ulab_dtype_type },
     #else
@@ -174,4 +188,10 @@ const mp_obj_module_t ulab_user_cmodule = {
     .globals = (mp_obj_dict_t*)&mp_module_ulab_globals,
 };
 
+// Use old three-argument MP_REGISTER_MODULE for
+// MicroPython <= v1.18.0: (1 << 16) | (18 << 8) | 0
+#if MICROPY_VERSION <= 70144
 MP_REGISTER_MODULE(MP_QSTR_ulab, ulab_user_cmodule, MODULE_ULAB_ENABLED);
+#else
+MP_REGISTER_MODULE(MP_QSTR_ulab, ulab_user_cmodule);
+#endif
diff --git a/python/port/mod/ulab/ulab.h b/python/port/mod/ulab/ulab.h
index 248047c85..abeda967f 100644
--- a/python/port/mod/ulab/ulab.h
+++ b/python/port/mod/ulab/ulab.h
@@ -6,7 +6,7 @@
  *
  * The MIT License (MIT)
  *
- * Copyright (c) 2019-2021 Zoltán Vörös
+ * Copyright (c) 2019-2022 Zoltán Vörös
 */
 
 #ifndef __ULAB__
@@ -18,9 +18,9 @@
 //
 // - how many dimensions ulab can handle
 // - which functions are included in the compiled firmware
-// - whether the python syntax is numpy-like, or modular
 // - whether arrays can be sliced and iterated over
 // - which binary/unary operators are supported
+// - whether ulab can deal with complex numbers
 //
 // A considerable amount of flash space can be saved by removing (setting
 // the corresponding constants to 0) the unnecessary functions and features.
@@ -31,6 +31,10 @@
 #include ULAB_CONFIG_FILE
 #endif
 
+// Adds support for complex ndarrays
+#ifndef ULAB_SUPPORTS_COMPLEX
+#define ULAB_SUPPORTS_COMPLEX               (0)
+#endif
 
 // Determines, whether scipy is defined in ulab. The sub-modules and functions
 // of scipy have to be defined separately
@@ -228,6 +232,10 @@
 #define NDARRAY_HAS_TOBYTES             (1)
 #endif
 
+#ifndef NDARRAY_HAS_TOLIST
+#define NDARRAY_HAS_TOLIST              (1)
+#endif
+
 #ifndef NDARRAY_HAS_TRANSPOSE
 #define NDARRAY_HAS_TRANSPOSE           (1)
 #endif
@@ -385,6 +393,15 @@
 #define ULAB_NUMPY_HAS_FFT_MODULE       (1)
 #endif
 
+// By setting this constant to 1, the FFT routine will behave in a
+// numpy-compatible way, i.e., it will output a complex array
+// This setting has no effect, if ULAB_SUPPORTS_COMPLEX is 0
+// Note that in this case, the input also must be numpythonic,
+// i.e., the real and imaginary parts cannot be passed as two arguments
+#ifndef ULAB_FFT_IS_NUMPY_COMPATIBLE
+#define ULAB_FFT_IS_NUMPY_COMPATIBLE    (0)
+#endif
+
 #ifndef ULAB_FFT_HAS_FFT
 #define ULAB_FFT_HAS_FFT                (1)
 #endif
@@ -409,6 +426,14 @@
 #define ULAB_NUMPY_HAS_ARGSORT          (1)
 #endif
 
+#ifndef ULAB_NUMPY_HAS_ASARRAY
+#define ULAB_NUMPY_HAS_ASARRAY          (0)
+#endif
+
+#ifndef ULAB_NUMPY_HAS_COMPRESS
+#define ULAB_NUMPY_HAS_COMPRESS         (1)
+#endif
+
 #ifndef ULAB_NUMPY_HAS_CONVOLVE
 #define ULAB_NUMPY_HAS_CONVOLVE         (1)
 #endif
@@ -417,6 +442,10 @@
 #define ULAB_NUMPY_HAS_CROSS            (1)
 #endif
 
+#ifndef ULAB_NUMPY_HAS_DELETE
+#define ULAB_NUMPY_HAS_DELETE           (1)
+#endif
+
 #ifndef ULAB_NUMPY_HAS_DIFF
 #define ULAB_NUMPY_HAS_DIFF             (1)
 #endif
@@ -433,6 +462,14 @@
 #define ULAB_NUMPY_HAS_INTERP           (1)
 #endif
 
+#ifndef ULAB_NUMPY_HAS_LOAD
+#define ULAB_NUMPY_HAS_LOAD             (0)
+#endif
+
+#ifndef ULAB_NUMPY_HAS_LOADTXT
+#define ULAB_NUMPY_HAS_LOADTXT          (0)
+#endif
+
 #ifndef ULAB_NUMPY_HAS_MEAN
 #define ULAB_NUMPY_HAS_MEAN             (1)
 #endif
@@ -457,6 +494,18 @@
 #define ULAB_NUMPY_HAS_ROLL             (1)
 #endif
 
+#ifndef ULAB_NUMPY_HAS_SAVE
+#define ULAB_NUMPY_HAS_SAVE             (0)
+#endif
+
+#ifndef ULAB_NUMPY_HAS_SAVETXT
+#define ULAB_NUMPY_HAS_SAVETXT          (0)
+#endif
+
+#ifndef ULAB_NUMPY_HAS_SIZE
+#define ULAB_NUMPY_HAS_SIZE             (1)
+#endif
+
 #ifndef ULAB_NUMPY_HAS_SORT
 #define ULAB_NUMPY_HAS_SORT             (1)
 #endif
@@ -579,6 +628,25 @@
 #define ULAB_NUMPY_HAS_VECTORIZE        (1)
 #endif
 
+// Complex functions. The implementations are compiled into
+// the firmware, only if ULAB_SUPPORTS_COMPLEX is set to 1
+#ifndef ULAB_NUMPY_HAS_CONJUGATE
+#define ULAB_NUMPY_HAS_CONJUGATE        (1)
+#endif
+
+#ifndef ULAB_NUMPY_HAS_IMAG
+#define ULAB_NUMPY_HAS_IMAG             (1)
+#endif
+
+#ifndef ULAB_NUMPY_HAS_REAL
+#define ULAB_NUMPY_HAS_REAL             (1)
+#endif
+
+#ifndef ULAB_NUMPY_HAS_SORT_COMPLEX
+#define ULAB_NUMPY_HAS_SORT_COMPLEX     (0)
+#endif
+
+// scipy modules
 #ifndef ULAB_SCIPY_HAS_LINALG_MODULE
 #define ULAB_SCIPY_HAS_LINALG_MODULE        (1)
 #endif
@@ -595,10 +663,6 @@
 #define ULAB_SCIPY_HAS_SIGNAL_MODULE        (1)
 #endif
 
-#ifndef ULAB_SCIPY_SIGNAL_HAS_SPECTROGRAM
-#define ULAB_SCIPY_SIGNAL_HAS_SPECTROGRAM   (1)
-#endif
-
 #ifndef ULAB_SCIPY_SIGNAL_HAS_SOSFILT
 #define ULAB_SCIPY_SIGNAL_HAS_SOSFILT       (1)
 #endif
@@ -643,12 +707,7 @@
 #define ULAB_SCIPY_SPECIAL_HAS_GAMMALN      (1)
 #endif
 
-// user-defined module; source of the module and
-// its sub-modules should be placed in code/user/
-#ifndef ULAB_HAS_USER_MODULE
-#define ULAB_HAS_USER_MODULE                (0)
-#endif
-
+// functions of the utils module
 #ifndef ULAB_HAS_UTILS_MODULE
 #define ULAB_HAS_UTILS_MODULE               (1)
 #endif
@@ -669,4 +728,14 @@
 #define ULAB_UTILS_HAS_FROM_UINT32_BUFFER   (1)
 #endif
 
+#ifndef ULAB_UTILS_HAS_SPECTROGRAM
+#define ULAB_UTILS_HAS_SPECTROGRAM          (1)
+#endif
+
+// user-defined module; source of the module and
+// its sub-modules should be placed in code/user/
+#ifndef ULAB_HAS_USER_MODULE
+#define ULAB_HAS_USER_MODULE                (0)
+#endif
+
 #endif
diff --git a/python/port/mod/ulab/ulab_tools.c b/python/port/mod/ulab/ulab_tools.c
index acd3d8a58..68f14bf55 100644
--- a/python/port/mod/ulab/ulab_tools.c
+++ b/python/port/mod/ulab/ulab_tools.c
@@ -5,7 +5,7 @@
  *
  * The MIT License (MIT)
  *
- * Copyright (c) 2020-2021 Zoltán Vörös
+ * Copyright (c) 2020-2022 Zoltán Vörös
  */
 
 
@@ -216,6 +216,14 @@ shape_strides tools_reduce_axes(ndarray_obj_t *ndarray, mp_obj_t axis) {
     return _shape_strides;
 }
 
+int8_t tools_get_axis(mp_obj_t axis, uint8_t ndim) { // converts a (possibly negative) axis object into an index in [0, ndim), or raises ValueError
+    mp_int_t ax = mp_obj_get_int(axis); // keep the full integer width: narrowing to int8_t before the check would let e.g. 256 wrap to 0 and be accepted
+    if(ax < 0) ax += ndim; // negative values count back from the last axis
+    if((ax < 0) || (ax > ndim - 1)) {
+        mp_raise_ValueError(translate("axis is out of bounds"));
+    }
+    return (int8_t)ax; // validated above, so 0 <= ax < ndim here
+}
 
 #if ULAB_MAX_DIMS > 1
 ndarray_obj_t *tools_object_is_square(mp_obj_t obj) {
@@ -231,3 +239,38 @@ ndarray_obj_t *tools_object_is_square(mp_obj_t obj) {
     return ndarray;
 }
 #endif
+
+uint8_t ulab_binary_get_size(uint8_t dtype) { // size in bytes of one array element of the given dtype code
+    #if ULAB_SUPPORTS_COMPLEX
+    if(dtype == NDARRAY_COMPLEX) {
+        return 2 * (uint8_t)sizeof(mp_float_t); // a complex element is sized as two floats
+    }
+    #endif
+    return dtype == NDARRAY_BOOL ? 1 : mp_binary_get_size('@', dtype, NULL); // bool is special-cased to 1 byte; all other dtypes are delegated to MicroPython's mp_binary_get_size
+}
+
+#if ULAB_SUPPORTS_COMPLEX
+void ulab_rescale_float_strides(int32_t *strides) { // modifies the first ULAB_MAX_DIMS entries of strides in place
+    // re-scale the strides by sizeof(mp_float_t), so that we can work with floats (mp_float_t pointers), when iterating
+    uint8_t sz = sizeof(mp_float_t);
+    for(uint8_t i = 0; i < ULAB_MAX_DIMS; i++) {
+        strides[i] /= sz; // integer division
+    }
+}
+#endif
+
+bool ulab_tools_mp_obj_is_scalar(mp_obj_t obj) { // true if obj is an int or a float (or a complex, in complex-enabled builds)
+    #if ULAB_SUPPORTS_COMPLEX
+    if(mp_obj_is_int(obj) || mp_obj_is_float(obj) || mp_obj_is_type(obj, &mp_type_complex)) { // complex objects count as scalars only when ULAB_SUPPORTS_COMPLEX is set
+        return true;
+    } else {
+        return false;
+    }
+    #else
+    if(mp_obj_is_int(obj) || mp_obj_is_float(obj)) { // without complex support only ints and floats qualify
+        return true;
+    } else {
+        return false;
+    }
+    #endif
+}
\ No newline at end of file
diff --git a/python/port/mod/ulab/ulab_tools.h b/python/port/mod/ulab/ulab_tools.h
index 378e4f0ca..5ae99df90 100644
--- a/python/port/mod/ulab/ulab_tools.h
+++ b/python/port/mod/ulab/ulab_tools.h
@@ -5,7 +5,7 @@
  *
  * The MIT License (MIT)
  *
- * Copyright (c) 2020-2021 Zoltán Vörös
+ * Copyright (c) 2020-2022 Zoltán Vörös
 */
 
 #ifndef _TOOLS_
@@ -33,5 +33,14 @@ uint8_t ndarray_upcast_dtype(uint8_t , uint8_t );
 void *ndarray_set_float_function(uint8_t );
 
 shape_strides tools_reduce_axes(ndarray_obj_t *, mp_obj_t );
+int8_t tools_get_axis(mp_obj_t , uint8_t );
 ndarray_obj_t *tools_object_is_square(mp_obj_t );
+
+uint8_t ulab_binary_get_size(uint8_t );
+
+#if ULAB_SUPPORTS_COMPLEX
+void ulab_rescale_float_strides(int32_t *);
+#endif
+
+bool ulab_tools_mp_obj_is_scalar(mp_obj_t );
 #endif
diff --git a/python/port/mod/ulab/user/user.c b/python/port/mod/ulab/user/user.c
index f69089da5..835c091c7 100644
--- a/python/port/mod/ulab/user/user.c
+++ b/python/port/mod/ulab/user/user.c
@@ -74,7 +74,7 @@ static mp_obj_t user_square(mp_obj_t arg) {
             *rarray++ = (*array) * (*array);
         }
     }
-    // at the end, return a micropython object
+    // at the end, return a micropython object
     return MP_OBJ_FROM_PTR(results);
 }
 
diff --git a/python/port/mod/ulab/utils/utils.c b/python/port/mod/ulab/utils/utils.c
index 2b7dc093c..31b6893c7 100644
--- a/python/port/mod/ulab/utils/utils.c
+++ b/python/port/mod/ulab/utils/utils.c
@@ -16,6 +16,8 @@
 #include "py/misc.h"
 #include "utils.h"
 
+#include "../numpy/fft/fft_tools.h"
+
 #if ULAB_HAS_UTILS_MODULE
 
 enum UTILS_BUFFER_TYPE {
@@ -187,8 +189,41 @@ static mp_obj_t utils_from_uint32_buffer(size_t n_args, const mp_obj_t *pos_args
 MP_DEFINE_CONST_FUN_OBJ_KW(utils_from_uint32_buffer_obj, 1, utils_from_uint32_buffer);
 #endif
 
+#endif /* ULAB_UTILS_HAS_FROM_INT16_BUFFER | ULAB_UTILS_HAS_FROM_UINT16_BUFFER | ULAB_UTILS_HAS_FROM_INT32_BUFFER | ULAB_UTILS_HAS_FROM_UINT32_BUFFER */
+
+#if ULAB_UTILS_HAS_SPECTROGRAM
+//| import ulab.numpy
+//|
+//| def spectrogram(r: ulab.numpy.ndarray) -> ulab.numpy.ndarray:
+//|     """
+//|     :param ulab.numpy.ndarray r: A 1-dimension array of values whose size is a power of 2
+//|
+//|     Computes the spectrum of the input signal.  This is the absolute value of the (complex-valued) fft of the signal.
+//|     This function is similar to scipy's ``scipy.signal.welch`` https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.welch.html."""
+//|     ...
+//|
+
+mp_obj_t utils_spectrogram(size_t n_args, const mp_obj_t *args) {
+    // Thin wrapper that calls the shared FFT helper in FFT_SPECTROGRAM mode.
+    #if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+    // in this configuration the helper takes only the input object
+    return fft_fft_ifft_spectrogram(args[0], FFT_SPECTROGRAM);
+    #else
+    // in this configuration the helper also takes the argument count and a second object (None when omitted)
+    mp_obj_t second = (n_args == 2) ? args[1] : mp_const_none;
+    return fft_fft_ifft_spectrogram(n_args, args[0], second, FFT_SPECTROGRAM);
+    #endif
+}
+
+#if ULAB_SUPPORTS_COMPLEX & ULAB_FFT_IS_NUMPY_COMPATIBLE
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(utils_spectrogram_obj, 1, 1, utils_spectrogram);
+#else
+MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(utils_spectrogram_obj, 1, 2, utils_spectrogram);
 #endif
 
+#endif /* ULAB_UTILS_HAS_SPECTROGRAM */
+
+
 static const mp_rom_map_elem_t ulab_utils_globals_table[] = {
     { MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR_utils) },
     #if ULAB_UTILS_HAS_FROM_INT16_BUFFER
@@ -203,6 +238,9 @@ static const mp_rom_map_elem_t ulab_utils_globals_table[] = {
     #if ULAB_UTILS_HAS_FROM_UINT32_BUFFER
         { MP_OBJ_NEW_QSTR(MP_QSTR_from_uint32_buffer), (mp_obj_t)&utils_from_uint32_buffer_obj },
     #endif
+    #if ULAB_UTILS_HAS_SPECTROGRAM
+        { MP_OBJ_NEW_QSTR(MP_QSTR_spectrogram), (mp_obj_t)&utils_spectrogram_obj },
+    #endif
 };
 
 static MP_DEFINE_CONST_DICT(mp_module_ulab_utils_globals, ulab_utils_globals_table);
@@ -212,4 +250,4 @@ mp_obj_module_t ulab_utils_module = {
     .globals = (mp_obj_dict_t*)&mp_module_ulab_utils_globals,
 };
 
-#endif
+#endif /* ULAB_HAS_UTILS_MODULE */
diff --git a/python/port/mpconfigport.h b/python/port/mpconfigport.h
index 26f813ba9..9040132fe 100644
--- a/python/port/mpconfigport.h
+++ b/python/port/mpconfigport.h
@@ -1,6 +1,9 @@
 #include <stdint.h>
 #include <alloca.h>
+// Include helpers when this is not a MicroPython build.
+#ifdef EPSILON_VERSION
 #include "helpers.h"
+#endif
 
 /* MicroPython configuration options
  * We're not listing the default options as defined in mpconfig.h */
@@ -16,6 +19,11 @@
  * are therefore erased prematurely. */
 #define MICROPY_ENABLE_PYSTACK (0)
 
+// Whether to encode None/False/True as immediate objects instead of pointers to
+// real objects.  Reduces code size by a decent amount without hurting
+// performance, for all representations except D on some architectures.
+#define MICROPY_OBJ_IMMEDIATE_OBJS 0
+
 // Maximum length of a path in the filesystem
 #define MICROPY_ALLOC_PATH_MAX (32)
 
@@ -139,40 +147,6 @@ typedef long mp_off_t;
 
 #define MP_STATE_PORT MP_STATE_VM
 
-extern const struct _mp_obj_module_t modion_module;
-extern const struct _mp_obj_module_t modkandinsky_module;
-extern const struct _mp_obj_module_t modmatplotlib_module;
-extern const struct _mp_obj_module_t modpyplot_module;
-extern const struct _mp_obj_module_t modtime_module;
-extern const struct _mp_obj_module_t modos_module;
-extern const struct _mp_obj_module_t modturtle_module;
-
-#if !defined(INCLUDE_ULAB)
-
-#define MICROPY_PORT_BUILTIN_MODULES \
-    { MP_ROM_QSTR(MP_QSTR_ion), MP_ROM_PTR(&modion_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_kandinsky), MP_ROM_PTR(&modkandinsky_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_matplotlib), MP_ROM_PTR(&modmatplotlib_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_matplotlib_dot_pyplot), MP_ROM_PTR(&modpyplot_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_time), MP_ROM_PTR(&modtime_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_os), MP_ROM_PTR(&modos_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_turtle), MP_ROM_PTR(&modturtle_module) }, \
-
-#else
-extern const struct _mp_obj_module_t ulab_user_cmodule;
-
-#define MICROPY_PORT_BUILTIN_MODULES \
-    { MP_ROM_QSTR(MP_QSTR_ion), MP_ROM_PTR(&modion_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_kandinsky), MP_ROM_PTR(&modkandinsky_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_matplotlib), MP_ROM_PTR(&modmatplotlib_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_matplotlib_dot_pyplot), MP_ROM_PTR(&modpyplot_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_time), MP_ROM_PTR(&modtime_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_os), MP_ROM_PTR(&modos_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_turtle), MP_ROM_PTR(&modturtle_module) }, \
-    { MP_ROM_QSTR(MP_QSTR_ulab), MP_ROM_PTR(&ulab_user_cmodule) },        \
-
-#endif
-
 
 // Enable setjmp in debug mode. This is to avoid some optimizations done
 // specifically for x86_64 using inline assembly, which makes the debug binary
diff --git a/python/src/extmod/moduplatform.h b/python/src/extmod/moduplatform.h
new file mode 100644
index 000000000..740a745b6
--- /dev/null
+++ b/python/src/extmod/moduplatform.h
@@ -0,0 +1,7 @@
+// This file is needed, because MicroPython includes it in the modsys.c file.
+// https://github.com/micropython/micropython/commit/402df833fe6da5233c83c58421e81493cda54f67#diff-99822946f0f35edf3fa262e9a3b213da739cfd30f5c8c0c44ef0eb67d7d7b4b0R39
+// It is just a stub, a hack so that the include resolves.
+// The real moduplatform module will be needed if the sys module is enabled.
+#if MICROPY_PY_SYS
+#warning "To build the sys module, you need to use the real moduplatform module from MicroPython."
+#endif
diff --git a/python/src/py/asmarm.c b/python/src/py/asmarm.c
index 4ba93d080..42724e4d4 100644
--- a/python/src/py/asmarm.c
+++ b/python/src/py/asmarm.c
@@ -304,6 +304,11 @@ void asm_arm_ldrh_reg_reg(asm_arm_t *as, uint rd, uint rn) {
     emit_al(as, 0x1d000b0 | (rn << 16) | (rd << 12));
 }
 
+void asm_arm_ldrh_reg_reg_offset(asm_arm_t *as, uint rd, uint rn, uint byte_offset) {
+    // ldrh rd, [rn, #off] -- plain offset addressing: W (bit 21) must stay clear so rn is not written back; only the low 8 bits of byte_offset are encoded
+    emit_al(as, 0x1d000b0 | (rn << 16) | (rd << 12) | ((byte_offset & 0xf0) << 4) | (byte_offset & 0xf));
+}
+
 void asm_arm_ldrb_reg_reg(asm_arm_t *as, uint rd, uint rn) {
     // ldrb rd, [rn]
     emit_al(as, 0x5d00000 | (rn << 16) | (rd << 12));
diff --git a/python/src/py/asmarm.h b/python/src/py/asmarm.h
index 0e029f20e..561d69a4b 100644
--- a/python/src/py/asmarm.h
+++ b/python/src/py/asmarm.h
@@ -109,6 +109,7 @@ void asm_arm_asr_reg_reg(asm_arm_t *as, uint rd, uint rs);
 // memory
 void asm_arm_ldr_reg_reg(asm_arm_t *as, uint rd, uint rn, uint byte_offset);
 void asm_arm_ldrh_reg_reg(asm_arm_t *as, uint rd, uint rn);
+void asm_arm_ldrh_reg_reg_offset(asm_arm_t *as, uint rd, uint rn, uint byte_offset);
 void asm_arm_ldrb_reg_reg(asm_arm_t *as, uint rd, uint rn);
 void asm_arm_str_reg_reg(asm_arm_t *as, uint rd, uint rm, uint byte_offset);
 void asm_arm_strh_reg_reg(asm_arm_t *as, uint rd, uint rm);
@@ -203,6 +204,7 @@ void asm_arm_bx_reg(asm_arm_t *as, uint reg_src);
 #define ASM_LOAD_REG_REG_OFFSET(as, reg_dest, reg_base, word_offset) asm_arm_ldr_reg_reg((as), (reg_dest), (reg_base), 4 * (word_offset))
 #define ASM_LOAD8_REG_REG(as, reg_dest, reg_base) asm_arm_ldrb_reg_reg((as), (reg_dest), (reg_base))
 #define ASM_LOAD16_REG_REG(as, reg_dest, reg_base) asm_arm_ldrh_reg_reg((as), (reg_dest), (reg_base))
+#define ASM_LOAD16_REG_REG_OFFSET(as, reg_dest, reg_base, uint16_offset) asm_arm_ldrh_reg_reg_offset((as), (reg_dest), (reg_base), 2 * (uint16_offset))
 #define ASM_LOAD32_REG_REG(as, reg_dest, reg_base) asm_arm_ldr_reg_reg((as), (reg_dest), (reg_base), 0)
 
 #define ASM_STORE_REG_REG(as, reg_value, reg_base) asm_arm_str_reg_reg((as), (reg_value), (reg_base), 0)
diff --git a/python/src/py/asmbase.c b/python/src/py/asmbase.c
index 344e03e7a..4a3fd089c 100644
--- a/python/src/py/asmbase.c
+++ b/python/src/py/asmbase.c
@@ -61,7 +61,8 @@ void mp_asm_base_start_pass(mp_asm_base_t *as, int pass) {
 // all functions must go through this one to emit bytes
 // if as->pass < MP_ASM_PASS_EMIT, then this function just counts the number
 // of bytes needed and returns NULL, and callers should not store any data
-uint8_t *mp_asm_base_get_cur_to_write_bytes(mp_asm_base_t *as, size_t num_bytes_to_write) {
+uint8_t *mp_asm_base_get_cur_to_write_bytes(void *as_in, size_t num_bytes_to_write) {
+    mp_asm_base_t *as = as_in;
     uint8_t *c = NULL;
     if (as->pass == MP_ASM_PASS_EMIT) {
         assert(as->code_offset + num_bytes_to_write <= as->code_size);
diff --git a/python/src/py/asmbase.h b/python/src/py/asmbase.h
index 24c3af867..960be7685 100644
--- a/python/src/py/asmbase.h
+++ b/python/src/py/asmbase.h
@@ -45,7 +45,7 @@ typedef struct _mp_asm_base_t {
 void mp_asm_base_init(mp_asm_base_t *as, size_t max_num_labels);
 void mp_asm_base_deinit(mp_asm_base_t *as, bool free_code);
 void mp_asm_base_start_pass(mp_asm_base_t *as, int pass);
-uint8_t *mp_asm_base_get_cur_to_write_bytes(mp_asm_base_t *as, size_t num_bytes_to_write);
+uint8_t *mp_asm_base_get_cur_to_write_bytes(void *as, size_t num_bytes_to_write);
 void mp_asm_base_label_assign(mp_asm_base_t *as, size_t label);
 void mp_asm_base_align(mp_asm_base_t *as, unsigned int align);
 void mp_asm_base_data(mp_asm_base_t *as, unsigned int bytesize, uintptr_t val);
diff --git a/python/src/py/asmthumb.c b/python/src/py/asmthumb.c
index db4520ce1..49574c43a 100644
--- a/python/src/py/asmthumb.c
+++ b/python/src/py/asmthumb.c
@@ -34,9 +34,25 @@
 #if MICROPY_EMIT_THUMB || MICROPY_EMIT_INLINE_THUMB
 
 #include "py/mpstate.h"
-#include "py/persistentcode.h"
 #include "py/asmthumb.h"
 
+#ifdef _MSC_VER
+#include <intrin.h>
+
+static uint32_t mp_clz(uint32_t x) {
+    unsigned long top_bit = 0; // receives the index of the highest set bit from the intrinsic
+    return !_BitScanReverse(&top_bit, x) ? 0 : (sizeof(x) * 8 - 1) - top_bit; // yields 0 when no bit is set
+}
+
+static uint32_t mp_ctz(uint32_t x) {
+    unsigned long low_bit = 0; // receives the index of the lowest set bit from the intrinsic
+    return !_BitScanForward(&low_bit, x) ? 0 : low_bit; // yields 0 when no bit is set
+}
+#else
+#define mp_clz(x) __builtin_clz(x)
+#define mp_ctz(x) __builtin_ctz(x)
+#endif
+
 #define UNSIGNED_FIT5(x) ((uint32_t)(x) < 32)
 #define UNSIGNED_FIT7(x) ((uint32_t)(x) < 128)
 #define UNSIGNED_FIT8(x) (((x) & 0xffffff00) == 0)
@@ -46,7 +62,6 @@
 #define SIGNED_FIT12(x) (((x) & 0xfffff800) == 0) || (((x) & 0xfffff800) == 0xfffff800)
 #define SIGNED_FIT23(x) (((x) & 0xffc00000) == 0) || (((x) & 0xffc00000) == 0xffc00000)
 
-#if MICROPY_EMIT_THUMB_ARMV7M
 // Note: these actually take an imm12 but the high-bit is not encoded here
 #define OP_ADD_W_RRI_HI(reg_src) (0xf200 | (reg_src))
 #define OP_ADD_W_RRI_LO(reg_dest, imm11) ((imm11 << 4 & 0x7000) | reg_dest << 8 | (imm11 & 0xff))
@@ -55,7 +70,9 @@
 
 #define OP_LDR_W_HI(reg_base) (0xf8d0 | (reg_base))
 #define OP_LDR_W_LO(reg_dest, imm12) ((reg_dest) << 12 | (imm12))
-#endif
+
+#define OP_LDRH_W_HI(reg_base) (0xf8b0 | (reg_base))
+#define OP_LDRH_W_LO(reg_dest, imm12) ((reg_dest) << 12 | (imm12))
 
 static inline byte *asm_thumb_get_cur_to_write_bytes(asm_thumb_t *as, int n) {
     return mp_asm_base_get_cur_to_write_bytes(&as->base, n);
@@ -158,21 +175,21 @@ void asm_thumb_entry(asm_thumb_t *as, int num_locals) {
     }
     asm_thumb_op16(as, OP_PUSH_RLIST_LR(reglist));
     if (stack_adjust > 0) {
-        #if MICROPY_EMIT_THUMB_ARMV7M
-        if (UNSIGNED_FIT7(stack_adjust)) {
-            asm_thumb_op16(as, OP_SUB_SP(stack_adjust));
+        if (asm_thumb_allow_armv7m(as)) {
+            if (UNSIGNED_FIT7(stack_adjust)) {
+                asm_thumb_op16(as, OP_SUB_SP(stack_adjust));
+            } else {
+                asm_thumb_op32(as, OP_SUB_W_RRI_HI(ASM_THUMB_REG_SP), OP_SUB_W_RRI_LO(ASM_THUMB_REG_SP, stack_adjust * 4));
+            }
         } else {
-            asm_thumb_op32(as, OP_SUB_W_RRI_HI(ASM_THUMB_REG_SP), OP_SUB_W_RRI_LO(ASM_THUMB_REG_SP, stack_adjust * 4));
+            int adj = stack_adjust;
+            // we don't expect the stack_adjust to be massive
+            while (!UNSIGNED_FIT7(adj)) {
+                asm_thumb_op16(as, OP_SUB_SP(127));
+                adj -= 127;
+            }
+            asm_thumb_op16(as, OP_SUB_SP(adj));
         }
-        #else
-        int adj = stack_adjust;
-        // we don't expect the stack_adjust to be massive
-        while (!UNSIGNED_FIT7(adj)) {
-            asm_thumb_op16(as, OP_SUB_SP(127));
-            adj -= 127;
-        }
-        asm_thumb_op16(as, OP_SUB_SP(adj));
-        #endif
     }
     as->push_reglist = reglist;
     as->stack_adjust = stack_adjust;
@@ -180,21 +197,21 @@ void asm_thumb_entry(asm_thumb_t *as, int num_locals) {
 
 void asm_thumb_exit(asm_thumb_t *as) {
     if (as->stack_adjust > 0) {
-        #if MICROPY_EMIT_THUMB_ARMV7M
-        if (UNSIGNED_FIT7(as->stack_adjust)) {
-            asm_thumb_op16(as, OP_ADD_SP(as->stack_adjust));
+        if (asm_thumb_allow_armv7m(as)) {
+            if (UNSIGNED_FIT7(as->stack_adjust)) {
+                asm_thumb_op16(as, OP_ADD_SP(as->stack_adjust));
+            } else {
+                asm_thumb_op32(as, OP_ADD_W_RRI_HI(ASM_THUMB_REG_SP), OP_ADD_W_RRI_LO(ASM_THUMB_REG_SP, as->stack_adjust * 4));
+            }
         } else {
-            asm_thumb_op32(as, OP_ADD_W_RRI_HI(ASM_THUMB_REG_SP), OP_ADD_W_RRI_LO(ASM_THUMB_REG_SP, as->stack_adjust * 4));
+            int adj = as->stack_adjust;
+            // we don't expect the stack_adjust to be massive
+            while (!UNSIGNED_FIT7(adj)) {
+                asm_thumb_op16(as, OP_ADD_SP(127));
+                adj -= 127;
+            }
+            asm_thumb_op16(as, OP_ADD_SP(adj));
         }
-        #else
-        int adj = as->stack_adjust;
-        // we don't expect the stack_adjust to be massive
-        while (!UNSIGNED_FIT7(adj)) {
-            asm_thumb_op16(as, OP_ADD_SP(127));
-            adj -= 127;
-        }
-        asm_thumb_op16(as, OP_ADD_SP(adj));
-        #endif
     }
     asm_thumb_op16(as, OP_POP_RLIST_PC(as->push_reglist));
 }
@@ -248,27 +265,19 @@ void asm_thumb_mov_reg_reg(asm_thumb_t *as, uint reg_dest, uint reg_src) {
     asm_thumb_op16(as, 0x4600 | op_lo);
 }
 
-#if MICROPY_EMIT_THUMB_ARMV7M
-
 // if loading lo half with movw, the i16 value will be zero extended into the r32 register!
-size_t asm_thumb_mov_reg_i16(asm_thumb_t *as, uint mov_op, uint reg_dest, int i16_src) {
+void asm_thumb_mov_reg_i16(asm_thumb_t *as, uint mov_op, uint reg_dest, int i16_src) {
     assert(reg_dest < ASM_THUMB_REG_R15);
-    size_t loc = mp_asm_base_get_code_pos(&as->base);
     // mov[wt] reg_dest, #i16_src
     asm_thumb_op32(as, mov_op | ((i16_src >> 1) & 0x0400) | ((i16_src >> 12) & 0xf), ((i16_src << 4) & 0x7000) | (reg_dest << 8) | (i16_src & 0xff));
-    return loc;
 }
 
-#else
-
-void asm_thumb_mov_rlo_i16(asm_thumb_t *as, uint rlo_dest, int i16_src) {
+static void asm_thumb_mov_rlo_i16(asm_thumb_t *as, uint rlo_dest, int i16_src) {
     asm_thumb_mov_rlo_i8(as, rlo_dest, (i16_src >> 8) & 0xff);
     asm_thumb_lsl_rlo_rlo_i5(as, rlo_dest, rlo_dest, 8);
     asm_thumb_add_rlo_i8(as, rlo_dest, i16_src & 0xff);
 }
 
-#endif
-
 #define OP_B_N(byte_offset) (0xe000 | (((byte_offset) >> 1) & 0x07ff))
 
 bool asm_thumb_b_n_label(asm_thumb_t *as, uint label) {
@@ -292,14 +301,12 @@ bool asm_thumb_bcc_nw_label(asm_thumb_t *as, int cond, uint label, bool wide) {
     if (!wide) {
         asm_thumb_op16(as, OP_BCC_N(cond, rel));
         return as->base.pass != MP_ASM_PASS_EMIT || SIGNED_FIT9(rel);
-    } else {
-        #if MICROPY_EMIT_THUMB_ARMV7M
+    } else if (asm_thumb_allow_armv7m(as)) {
         asm_thumb_op32(as, OP_BCC_W_HI(cond, rel), OP_BCC_W_LO(rel));
         return true;
-        #else
+    } else {
         // this method should not be called for ARMV6M
         return false;
-        #endif
     }
 }
 
@@ -320,30 +327,30 @@ size_t asm_thumb_mov_reg_i32(asm_thumb_t *as, uint reg_dest, mp_uint_t i32) {
 
     size_t loc = mp_asm_base_get_code_pos(&as->base);
 
-    #if MICROPY_EMIT_THUMB_ARMV7M
-    asm_thumb_mov_reg_i16(as, ASM_THUMB_OP_MOVW, reg_dest, i32);
-    asm_thumb_mov_reg_i16(as, ASM_THUMB_OP_MOVT, reg_dest, i32 >> 16);
-    #else
-    // should only be called with lo reg for ARMV6M
-    assert(reg_dest < ASM_THUMB_REG_R8);
+    if (asm_thumb_allow_armv7m(as)) {
+        asm_thumb_mov_reg_i16(as, ASM_THUMB_OP_MOVW, reg_dest, i32);
+        asm_thumb_mov_reg_i16(as, ASM_THUMB_OP_MOVT, reg_dest, i32 >> 16);
+    } else {
+        // should only be called with lo reg for ARMV6M
+        assert(reg_dest < ASM_THUMB_REG_R8);
 
-    // sanity check that generated code is aligned
-    assert(!as->base.code_base || !(3u & (uintptr_t)as->base.code_base));
+        // sanity check that generated code is aligned
+        assert(!as->base.code_base || !(3u & (uintptr_t)as->base.code_base));
 
-    // basically:
-    //        (nop)
-    //        ldr reg_dest, _data
-    //        b 1f
-    // _data: .word i32
-    //  1:
-    if (as->base.code_offset & 2u) {
-        asm_thumb_op16(as, ASM_THUMB_OP_NOP);
+        // basically:
+        //        (nop)
+        //        ldr reg_dest, _data
+        //        b 1f
+        // _data: .word i32
+        //  1:
+        if (as->base.code_offset & 2u) {
+            asm_thumb_op16(as, ASM_THUMB_OP_NOP);
+        }
+        asm_thumb_ldr_rlo_pcrel_i8(as, reg_dest, 0);
+        asm_thumb_op16(as, OP_B_N(2));
+        asm_thumb_op16(as, i32 & 0xffff);
+        asm_thumb_op16(as, i32 >> 16);
     }
-    asm_thumb_ldr_rlo_pcrel_i8(as, reg_dest, 0);
-    asm_thumb_op16(as, OP_B_N(2));
-    asm_thumb_op16(as, i32 & 0xffff);
-    asm_thumb_op16(as, i32 >> 16);
-    #endif
 
     return loc;
 }
@@ -351,14 +358,13 @@ size_t asm_thumb_mov_reg_i32(asm_thumb_t *as, uint reg_dest, mp_uint_t i32) {
 void asm_thumb_mov_reg_i32_optimised(asm_thumb_t *as, uint reg_dest, int i32) {
     if (reg_dest < 8 && UNSIGNED_FIT8(i32)) {
         asm_thumb_mov_rlo_i8(as, reg_dest, i32);
-    } else {
-        #if MICROPY_EMIT_THUMB_ARMV7M
+    } else if (asm_thumb_allow_armv7m(as)) {
         if (UNSIGNED_FIT16(i32)) {
             asm_thumb_mov_reg_i16(as, ASM_THUMB_OP_MOVW, reg_dest, i32);
         } else {
             asm_thumb_mov_reg_i32(as, reg_dest, i32);
         }
-        #else
+    } else {
         uint rlo_dest = reg_dest;
         assert(rlo_dest < ASM_THUMB_REG_R8); // should never be called for ARMV6M
 
@@ -367,8 +373,8 @@ void asm_thumb_mov_reg_i32_optimised(asm_thumb_t *as, uint reg_dest, int i32) {
             i32 = -i32;
         }
 
-        uint clz = __builtin_clz(i32);
-        uint ctz = i32 ? __builtin_ctz(i32) : 0;
+        uint clz = mp_clz(i32);
+        uint ctz = i32 ? mp_ctz(i32) : 0;
         assert(clz + ctz <= 32);
         if (clz + ctz >= 24) {
             asm_thumb_mov_rlo_i8(as, rlo_dest, (i32 >> ctz) & 0xff);
@@ -386,7 +392,6 @@ void asm_thumb_mov_reg_i32_optimised(asm_thumb_t *as, uint reg_dest, int i32) {
         if (negate) {
             asm_thumb_neg_rlo_rlo(as, rlo_dest, rlo_dest);
         }
-        #endif
     }
 }
 
@@ -429,62 +434,76 @@ void asm_thumb_mov_reg_pcrel(asm_thumb_t *as, uint rlo_dest, uint label) {
     mp_uint_t dest = get_label_dest(as, label);
     mp_int_t rel = dest - as->base.code_offset;
     rel |= 1; // to stay in Thumb state when jumping to this address
-    #if MICROPY_EMIT_THUMB_ARMV7M
-    rel -= 4 + 4; // adjust for mov_reg_i16 and then PC+4 prefetch of add_reg_reg
-    asm_thumb_mov_reg_i16(as, ASM_THUMB_OP_MOVW, rlo_dest, rel); // 4 bytes
-    #else
-    rel -= 8 + 4; // adjust for four instructions and then PC+4 prefetch of add_reg_reg
-    // 6 bytes
-    asm_thumb_mov_rlo_i16(as, rlo_dest, rel);
-    // 2 bytes - not always needed, but we want to keep the size the same
-    asm_thumb_sxth_rlo_rlo(as, rlo_dest, rlo_dest);
-    #endif
+    if (asm_thumb_allow_armv7m(as)) {
+        rel -= 6 + 4; // adjust for mov_reg_i16, sxth_rlo_rlo and then PC+4 prefetch of add_reg_reg
+        asm_thumb_mov_reg_i16(as, ASM_THUMB_OP_MOVW, rlo_dest, rel); // 4 bytes
+        asm_thumb_sxth_rlo_rlo(as, rlo_dest, rlo_dest); // 2 bytes
+    } else {
+        rel -= 8 + 4; // adjust for four instructions and then PC+4 prefetch of add_reg_reg
+        // 6 bytes
+        asm_thumb_mov_rlo_i16(as, rlo_dest, rel);
+        // 2 bytes - not always needed, but we want to keep the size the same
+        asm_thumb_sxth_rlo_rlo(as, rlo_dest, rlo_dest);
+    }
     asm_thumb_add_reg_reg(as, rlo_dest, ASM_THUMB_REG_R15); // 2 bytes
 }
 
-#if MICROPY_EMIT_THUMB_ARMV7M
+// ARMv7-M only
 static inline void asm_thumb_ldr_reg_reg_i12(asm_thumb_t *as, uint reg_dest, uint reg_base, uint word_offset) {
     asm_thumb_op32(as, OP_LDR_W_HI(reg_base), OP_LDR_W_LO(reg_dest, word_offset * 4));
 }
-#endif
+
+// emits code for: reg_dest = reg_base + (offset << offset_shift); only implemented for registers below r8
+static void asm_thumb_add_reg_reg_offset(asm_thumb_t *as, uint reg_dest, uint reg_base, uint offset, uint offset_shift) {
+    if (reg_dest < ASM_THUMB_REG_R8 && reg_base < ASM_THUMB_REG_R8) {
+        if (offset << offset_shift < 256) { // shifted offset fits an 8-bit immediate add
+            if (reg_dest != reg_base) {
+                asm_thumb_mov_reg_reg(as, reg_dest, reg_base);
+            }
+            asm_thumb_add_rlo_i8(as, reg_dest, offset << offset_shift);
+        } else if (UNSIGNED_FIT8(offset) && reg_dest != reg_base) { // build the offset in reg_dest: mov, shift, then add the base
+            asm_thumb_mov_rlo_i8(as, reg_dest, offset);
+            asm_thumb_lsl_rlo_rlo_i5(as, reg_dest, reg_dest, offset_shift);
+            asm_thumb_add_rlo_rlo_rlo(as, reg_dest, reg_dest, reg_base);
+        } else if (reg_dest != reg_base) { // larger offset: load it as a 16-bit immediate into reg_dest
+            asm_thumb_mov_rlo_i16(as, reg_dest, offset << offset_shift);
+            asm_thumb_add_rlo_rlo_rlo(as, reg_dest, reg_dest, reg_base); // add the base; adding reg_dest to itself would drop reg_base
+        } else { // reg_dest == reg_base: borrow a scratch register and preserve its value with push/pop
+            uint reg_other = reg_dest ^ 7; // always a different register below r8
+            asm_thumb_op16(as, OP_PUSH_RLIST((1 << reg_other)));
+            asm_thumb_mov_rlo_i16(as, reg_other, offset << offset_shift);
+            asm_thumb_add_rlo_rlo_rlo(as, reg_dest, reg_dest, reg_other);
+            asm_thumb_op16(as, OP_POP_RLIST((1 << reg_other)));
+        }
+    } else {
+        assert(0); // should never be called for ARMV6M
+    }
+}
 
 void asm_thumb_ldr_reg_reg_i12_optimised(asm_thumb_t *as, uint reg_dest, uint reg_base, uint word_offset) {
     if (reg_dest < ASM_THUMB_REG_R8 && reg_base < ASM_THUMB_REG_R8 && UNSIGNED_FIT5(word_offset)) {
         asm_thumb_ldr_rlo_rlo_i5(as, reg_dest, reg_base, word_offset);
-    } else {
-        #if MICROPY_EMIT_THUMB_ARMV7M
+    } else if (asm_thumb_allow_armv7m(as)) {
         asm_thumb_ldr_reg_reg_i12(as, reg_dest, reg_base, word_offset);
-        #else
-        word_offset -= 31;
-        if (reg_dest < ASM_THUMB_REG_R8 && reg_base < ASM_THUMB_REG_R8) {
-            if (UNSIGNED_FIT8(word_offset) && (word_offset < 64 || reg_dest != reg_base)) {
-                if (word_offset < 64) {
-                    if (reg_dest != reg_base) {
-                        asm_thumb_mov_reg_reg(as, reg_dest, reg_base);
-                    }
-                    asm_thumb_add_rlo_i8(as, reg_dest, word_offset * 4);
-                } else {
-                    asm_thumb_mov_rlo_i8(as, reg_dest, word_offset);
-                    asm_thumb_lsl_rlo_rlo_i5(as, reg_dest, reg_dest, 2);
-                    asm_thumb_add_rlo_rlo_rlo(as, reg_dest, reg_dest, reg_base);
-                }
-            } else {
-                if (reg_dest != reg_base) {
-                    asm_thumb_mov_rlo_i16(as, reg_dest, word_offset * 4);
-                    asm_thumb_add_rlo_rlo_rlo(as, reg_dest, reg_dest, reg_dest);
-                } else {
-                    uint reg_other = reg_dest ^ 7;
-                    asm_thumb_op16(as, OP_PUSH_RLIST((1 << reg_other)));
-                    asm_thumb_mov_rlo_i16(as, reg_other, word_offset * 4);
-                    asm_thumb_add_rlo_rlo_rlo(as, reg_dest, reg_dest, reg_other);
-                    asm_thumb_op16(as, OP_POP_RLIST((1 << reg_other)));
-                }
-            }
-        } else {
-            assert(0); // should never be called for ARMV6M
-        }
+    } else {
+        asm_thumb_add_reg_reg_offset(as, reg_dest, reg_base, word_offset - 31, 2);
         asm_thumb_ldr_rlo_rlo_i5(as, reg_dest, reg_dest, 31);
-        #endif
+    }
+}
+
+// ARMv7-M only: emits the 32-bit LDRH form; the halfword index uint16_offset is doubled to give the encoded offset
+static inline void asm_thumb_ldrh_reg_reg_i12(asm_thumb_t *as, uint reg_dest, uint reg_base, uint uint16_offset) {
+    asm_thumb_op32(as, OP_LDRH_W_HI(reg_base), OP_LDRH_W_LO(reg_dest, uint16_offset * 2));
+}
+
+void asm_thumb_ldrh_reg_reg_i12_optimised(asm_thumb_t *as, uint reg_dest, uint reg_base, uint uint16_offset) {
+    if (reg_dest < ASM_THUMB_REG_R8 && reg_base < ASM_THUMB_REG_R8 && UNSIGNED_FIT5(uint16_offset)) {
+        asm_thumb_ldrh_rlo_rlo_i5(as, reg_dest, reg_base, uint16_offset);
+    } else if (asm_thumb_allow_armv7m(as)) {
+        asm_thumb_ldrh_reg_reg_i12(as, reg_dest, reg_base, uint16_offset);
+    } else {
+        asm_thumb_add_reg_reg_offset(as, reg_dest, reg_base, uint16_offset - 31, 1);
+        asm_thumb_ldrh_rlo_rlo_i5(as, reg_dest, reg_dest, 31);
     }
 }
 
@@ -496,20 +515,21 @@ void asm_thumb_b_label(asm_thumb_t *as, uint label) {
     mp_uint_t dest = get_label_dest(as, label);
     mp_int_t rel = dest - as->base.code_offset;
     rel -= 4; // account for instruction prefetch, PC is 4 bytes ahead of this instruction
+
     if (dest != (mp_uint_t)-1 && rel <= -4) {
         // is a backwards jump, so we know the size of the jump on the first pass
         // calculate rel assuming 12 bit relative jump
         if (SIGNED_FIT12(rel)) {
             asm_thumb_op16(as, OP_B_N(rel));
-        } else {
-            goto large_jump;
+            return;
         }
-    } else {
-        // is a forwards jump, so need to assume it's large
-    large_jump:
-        #if MICROPY_EMIT_THUMB_ARMV7M
+    }
+
+    // is a large backwards jump, or a forwards jump (that must be assumed large)
+
+    if (asm_thumb_allow_armv7m(as)) {
         asm_thumb_op32(as, OP_BW_HI(rel), OP_BW_LO(rel));
-        #else
+    } else {
         if (SIGNED_FIT12(rel)) {
             // this code path has to be the same number of instructions irrespective of rel
             asm_thumb_op16(as, OP_B_N(rel));
@@ -520,7 +540,6 @@ void asm_thumb_b_label(asm_thumb_t *as, uint label) {
                 mp_raise_NotImplementedError(MP_ERROR_TEXT("native method too big"));
             }
         }
-        #endif
     }
 }
 
@@ -528,24 +547,24 @@ void asm_thumb_bcc_label(asm_thumb_t *as, int cond, uint label) {
     mp_uint_t dest = get_label_dest(as, label);
     mp_int_t rel = dest - as->base.code_offset;
     rel -= 4; // account for instruction prefetch, PC is 4 bytes ahead of this instruction
+
     if (dest != (mp_uint_t)-1 && rel <= -4) {
         // is a backwards jump, so we know the size of the jump on the first pass
         // calculate rel assuming 9 bit relative jump
         if (SIGNED_FIT9(rel)) {
             asm_thumb_op16(as, OP_BCC_N(cond, rel));
-        } else {
-            goto large_jump;
+            return;
         }
-    } else {
-        // is a forwards jump, so need to assume it's large
-    large_jump:
-        #if MICROPY_EMIT_THUMB_ARMV7M
+    }
+
+    // is a large backwards jump, or a forwards jump (that must be assumed large)
+
+    if (asm_thumb_allow_armv7m(as)) {
         asm_thumb_op32(as, OP_BCC_W_HI(cond, rel), OP_BCC_W_LO(rel));
-        #else
+    } else {
         // reverse the sense of the branch to jump over a longer branch
         asm_thumb_op16(as, OP_BCC_N(cond ^ 1, 0));
         asm_thumb_b_label(as, label);
-        #endif
     }
 }
 
diff --git a/python/src/py/asmthumb.h b/python/src/py/asmthumb.h
index 1a01d20c6..86b816657 100644
--- a/python/src/py/asmthumb.h
+++ b/python/src/py/asmthumb.h
@@ -29,6 +29,7 @@
 #include <assert.h>
 #include "py/misc.h"
 #include "py/asmbase.h"
+#include "py/persistentcode.h"
 
 #define ASM_THUMB_REG_R0  (0)
 #define ASM_THUMB_REG_R1  (1)
@@ -70,6 +71,21 @@ typedef struct _asm_thumb_t {
     uint32_t stack_adjust;
 } asm_thumb_t;
 
+#if MICROPY_DYNAMIC_COMPILER
+// Dynamic compiler: true when the selected native_arch lies between MP_NATIVE_ARCH_ARMV7M and MP_NATIVE_ARCH_ARMV7EMDP.
+static inline bool asm_thumb_allow_armv7m(asm_thumb_t *as) {
+    return MP_NATIVE_ARCH_ARMV7M <= mp_dynamic_compiler.native_arch
+           && mp_dynamic_compiler.native_arch <= MP_NATIVE_ARCH_ARMV7EMDP;
+}
+
+#else
+// Otherwise the answer is fixed at build time by MICROPY_EMIT_THUMB_ARMV7M; the as argument is not consulted.
+static inline bool asm_thumb_allow_armv7m(asm_thumb_t *as) {
+    return MICROPY_EMIT_THUMB_ARMV7M;
+}
+
+#endif
+
 static inline void asm_thumb_end_pass(asm_thumb_t *as) {
     (void)as;
 }
@@ -263,8 +279,8 @@ static inline void asm_thumb_str_rlo_rlo_i5(asm_thumb_t *as, uint rlo_src, uint
 static inline void asm_thumb_strb_rlo_rlo_i5(asm_thumb_t *as, uint rlo_src, uint rlo_base, uint byte_offset) {
     asm_thumb_format_9_10(as, ASM_THUMB_FORMAT_9_STR | ASM_THUMB_FORMAT_9_BYTE_TRANSFER, rlo_src, rlo_base, byte_offset);
 }
-static inline void asm_thumb_strh_rlo_rlo_i5(asm_thumb_t *as, uint rlo_src, uint rlo_base, uint byte_offset) {
-    asm_thumb_format_9_10(as, ASM_THUMB_FORMAT_10_STRH, rlo_src, rlo_base, byte_offset);
+static inline void asm_thumb_strh_rlo_rlo_i5(asm_thumb_t *as, uint rlo_src, uint rlo_base, uint uint16_offset) {
+    asm_thumb_format_9_10(as, ASM_THUMB_FORMAT_10_STRH, rlo_src, rlo_base, uint16_offset);
 }
 static inline void asm_thumb_ldr_rlo_rlo_i5(asm_thumb_t *as, uint rlo_dest, uint rlo_base, uint word_offset) {
     asm_thumb_format_9_10(as, ASM_THUMB_FORMAT_9_LDR | ASM_THUMB_FORMAT_9_WORD_TRANSFER, rlo_dest, rlo_base, word_offset);
@@ -272,8 +288,8 @@ static inline void asm_thumb_ldr_rlo_rlo_i5(asm_thumb_t *as, uint rlo_dest, uint
 static inline void asm_thumb_ldrb_rlo_rlo_i5(asm_thumb_t *as, uint rlo_dest, uint rlo_base, uint byte_offset) {
     asm_thumb_format_9_10(as, ASM_THUMB_FORMAT_9_LDR | ASM_THUMB_FORMAT_9_BYTE_TRANSFER, rlo_dest, rlo_base, byte_offset);
 }
-static inline void asm_thumb_ldrh_rlo_rlo_i5(asm_thumb_t *as, uint rlo_dest, uint rlo_base, uint byte_offset) {
-    asm_thumb_format_9_10(as, ASM_THUMB_FORMAT_10_LDRH, rlo_dest, rlo_base, byte_offset);
+static inline void asm_thumb_ldrh_rlo_rlo_i5(asm_thumb_t *as, uint rlo_dest, uint rlo_base, uint uint16_offset) {
+    asm_thumb_format_9_10(as, ASM_THUMB_FORMAT_10_LDRH, rlo_dest, rlo_base, uint16_offset);
 }
 static inline void asm_thumb_lsl_rlo_rlo_i5(asm_thumb_t *as, uint rlo_dest, uint rlo_src, uint shift) {
     asm_thumb_format_1(as, ASM_THUMB_FORMAT_1_LSL, rlo_dest, rlo_src, shift);
@@ -308,12 +324,7 @@ static inline void asm_thumb_sxth_rlo_rlo(asm_thumb_t *as, uint rlo_dest, uint r
 #define ASM_THUMB_OP_MOVT (0xf2c0)
 
 void asm_thumb_mov_reg_reg(asm_thumb_t *as, uint reg_dest, uint reg_src);
-
-#if MICROPY_EMIT_THUMB_ARMV7M
-size_t asm_thumb_mov_reg_i16(asm_thumb_t *as, uint mov_op, uint reg_dest, int i16_src);
-#else
-void asm_thumb_mov_rlo_i16(asm_thumb_t *as, uint rlo_dest, int i16_src);
-#endif
+void asm_thumb_mov_reg_i16(asm_thumb_t *as, uint mov_op, uint reg_dest, int i16_src);
 
 // these return true if the destination is in range, false otherwise
 bool asm_thumb_b_n_label(asm_thumb_t *as, uint label);
@@ -327,7 +338,8 @@ void asm_thumb_mov_reg_local(asm_thumb_t *as, uint rlo_dest, int local_num); //
 void asm_thumb_mov_reg_local_addr(asm_thumb_t *as, uint rlo_dest, int local_num); // convenience
 void asm_thumb_mov_reg_pcrel(asm_thumb_t *as, uint rlo_dest, uint label);
 
-void asm_thumb_ldr_reg_reg_i12_optimised(asm_thumb_t *as, uint reg_dest, uint reg_base, uint byte_offset); // convenience
+void asm_thumb_ldr_reg_reg_i12_optimised(asm_thumb_t *as, uint reg_dest, uint reg_base, uint word_offset); // convenience
+void asm_thumb_ldrh_reg_reg_i12_optimised(asm_thumb_t *as, uint reg_dest, uint reg_base, uint uint16_offset); // convenience
 
 void asm_thumb_b_label(asm_thumb_t *as, uint label); // convenience: picks narrow or wide branch
 void asm_thumb_bcc_label(asm_thumb_t *as, int cc, uint label); // convenience: picks narrow or wide branch
@@ -389,11 +401,6 @@ void asm_thumb_b_rel12(asm_thumb_t *as, int rel);
 
 #define ASM_MOV_LOCAL_REG(as, local_num, reg) asm_thumb_mov_local_reg((as), (local_num), (reg))
 #define ASM_MOV_REG_IMM(as, reg_dest, imm) asm_thumb_mov_reg_i32_optimised((as), (reg_dest), (imm))
-#if MICROPY_EMIT_THUMB_ARMV7M
-#define ASM_MOV_REG_IMM_FIX_U16(as, reg_dest, imm) asm_thumb_mov_reg_i16((as), ASM_THUMB_OP_MOVW, (reg_dest), (imm))
-#else
-#define ASM_MOV_REG_IMM_FIX_U16(as, reg_dest, imm) asm_thumb_mov_rlo_i16((as), (reg_dest), (imm))
-#endif
 #define ASM_MOV_REG_IMM_FIX_WORD(as, reg_dest, imm) asm_thumb_mov_reg_i32((as), (reg_dest), (imm))
 #define ASM_MOV_REG_LOCAL(as, reg_dest, local_num) asm_thumb_mov_reg_local((as), (reg_dest), (local_num))
 #define ASM_MOV_REG_REG(as, reg_dest, reg_src) asm_thumb_mov_reg_reg((as), (reg_dest), (reg_src))
@@ -414,6 +421,7 @@ void asm_thumb_b_rel12(asm_thumb_t *as, int rel);
 #define ASM_LOAD_REG_REG_OFFSET(as, reg_dest, reg_base, word_offset) asm_thumb_ldr_reg_reg_i12_optimised((as), (reg_dest), (reg_base), (word_offset))
 #define ASM_LOAD8_REG_REG(as, reg_dest, reg_base) asm_thumb_ldrb_rlo_rlo_i5((as), (reg_dest), (reg_base), 0)
 #define ASM_LOAD16_REG_REG(as, reg_dest, reg_base) asm_thumb_ldrh_rlo_rlo_i5((as), (reg_dest), (reg_base), 0)
+#define ASM_LOAD16_REG_REG_OFFSET(as, reg_dest, reg_base, uint16_offset) asm_thumb_ldrh_reg_reg_i12_optimised((as), (reg_dest), (reg_base), (uint16_offset))
 #define ASM_LOAD32_REG_REG(as, reg_dest, reg_base) asm_thumb_ldr_rlo_rlo_i5((as), (reg_dest), (reg_base), 0)
 
 #define ASM_STORE_REG_REG(as, reg_src, reg_base) asm_thumb_str_rlo_rlo_i5((as), (reg_src), (reg_base), 0)
diff --git a/python/src/py/asmx64.c b/python/src/py/asmx64.c
index 62df5c6d4..5c923a523 100644
--- a/python/src/py/asmx64.c
+++ b/python/src/py/asmx64.c
@@ -319,9 +319,7 @@ void asm_x64_mov_mem64_to_r64(asm_x64_t *as, int src_r64, int src_disp, int dest
 
 STATIC void asm_x64_lea_disp_to_r64(asm_x64_t *as, int src_r64, int src_disp, int dest_r64) {
     // use REX prefix for 64 bit operation
-    assert(src_r64 < 8);
-    assert(dest_r64 < 8);
-    asm_x64_write_byte_2(as, REX_PREFIX | REX_W, OPCODE_LEA_MEM_TO_R64);
+    asm_x64_write_byte_2(as, REX_PREFIX | REX_W | REX_R_FROM_R64(dest_r64) | REX_B_FROM_R64(src_r64), OPCODE_LEA_MEM_TO_R64);
     asm_x64_write_r64_disp(as, dest_r64, src_r64, src_disp);
 }
 
diff --git a/python/src/py/asmx64.h b/python/src/py/asmx64.h
index 1a4987f5c..d132ee193 100644
--- a/python/src/py/asmx64.h
+++ b/python/src/py/asmx64.h
@@ -207,6 +207,7 @@ void asm_x64_call_ind(asm_x64_t *as, size_t fun_id, int temp_r32);
 #define ASM_LOAD_REG_REG_OFFSET(as, reg_dest, reg_base, word_offset) asm_x64_mov_mem64_to_r64((as), (reg_base), 8 * (word_offset), (reg_dest))
 #define ASM_LOAD8_REG_REG(as, reg_dest, reg_base) asm_x64_mov_mem8_to_r64zx((as), (reg_base), 0, (reg_dest))
 #define ASM_LOAD16_REG_REG(as, reg_dest, reg_base) asm_x64_mov_mem16_to_r64zx((as), (reg_base), 0, (reg_dest))
+#define ASM_LOAD16_REG_REG_OFFSET(as, reg_dest, reg_base, uint16_offset) asm_x64_mov_mem16_to_r64zx((as), (reg_base), 2 * (uint16_offset), (reg_dest))
 #define ASM_LOAD32_REG_REG(as, reg_dest, reg_base) asm_x64_mov_mem32_to_r64zx((as), (reg_base), 0, (reg_dest))
 
 #define ASM_STORE_REG_REG(as, reg_src, reg_base) asm_x64_mov_r64_to_mem64((as), (reg_src), (reg_base), 0)
diff --git a/python/src/py/asmx86.h b/python/src/py/asmx86.h
index 8f1b06d22..e02e6c954 100644
--- a/python/src/py/asmx86.h
+++ b/python/src/py/asmx86.h
@@ -202,6 +202,7 @@ void asm_x86_call_ind(asm_x86_t *as, size_t fun_id, mp_uint_t n_args, int temp_r
 #define ASM_LOAD_REG_REG_OFFSET(as, reg_dest, reg_base, word_offset) asm_x86_mov_mem32_to_r32((as), (reg_base), 4 * (word_offset), (reg_dest))
 #define ASM_LOAD8_REG_REG(as, reg_dest, reg_base) asm_x86_mov_mem8_to_r32zx((as), (reg_base), 0, (reg_dest))
 #define ASM_LOAD16_REG_REG(as, reg_dest, reg_base) asm_x86_mov_mem16_to_r32zx((as), (reg_base), 0, (reg_dest))
+#define ASM_LOAD16_REG_REG_OFFSET(as, reg_dest, reg_base, uint16_offset) asm_x86_mov_mem16_to_r32zx((as), (reg_base), 2 * (uint16_offset), (reg_dest))
 #define ASM_LOAD32_REG_REG(as, reg_dest, reg_base) asm_x86_mov_mem32_to_r32((as), (reg_base), 0, (reg_dest))
 
 #define ASM_STORE_REG_REG(as, reg_src, reg_base) asm_x86_mov_r32_to_mem32((as), (reg_src), (reg_base), 0)
diff --git a/python/src/py/asmxtensa.c b/python/src/py/asmxtensa.c
index 0956d50f3..8ac914ec4 100644
--- a/python/src/py/asmxtensa.c
+++ b/python/src/py/asmxtensa.c
@@ -27,7 +27,7 @@
 #include <stdio.h>
 #include <assert.h>
 
-#include "py/mpconfig.h"
+#include "py/runtime.h"
 
 // wrapper around everything in this file
 #if MICROPY_EMIT_XTENSA || MICROPY_EMIT_INLINE_XTENSA || MICROPY_EMIT_XTENSAWIN
@@ -232,21 +232,33 @@ void asm_xtensa_mov_reg_pcrel(asm_xtensa_t *as, uint reg_dest, uint label) {
     asm_xtensa_op_add_n(as, reg_dest, reg_dest, ASM_XTENSA_REG_A0);
 }
 
-void asm_xtensa_call_ind(asm_xtensa_t *as, uint idx) {
-    if (idx < 16) {
-        asm_xtensa_op_l32i_n(as, ASM_XTENSA_REG_A0, ASM_XTENSA_REG_FUN_TABLE, idx);
+void asm_xtensa_l32i_optimised(asm_xtensa_t *as, uint reg_dest, uint reg_base, uint word_offset) {
+    if (word_offset < 16) {
+        asm_xtensa_op_l32i_n(as, reg_dest, reg_base, word_offset);
+    } else if (word_offset < 256) {
+        asm_xtensa_op_l32i(as, reg_dest, reg_base, word_offset);
     } else {
-        asm_xtensa_op_l32i(as, ASM_XTENSA_REG_A0, ASM_XTENSA_REG_FUN_TABLE, idx);
+        mp_raise_msg(&mp_type_RuntimeError, MP_ERROR_TEXT("asm overflow"));
     }
+}
+
+void asm_xtensa_s32i_optimised(asm_xtensa_t *as, uint reg_src, uint reg_base, uint word_offset) {
+    if (word_offset < 16) {
+        asm_xtensa_op_s32i_n(as, reg_src, reg_base, word_offset);
+    } else if (word_offset < 256) {
+        asm_xtensa_op_s32i(as, reg_src, reg_base, word_offset);
+    } else {
+        mp_raise_msg(&mp_type_RuntimeError, MP_ERROR_TEXT("asm overflow"));
+    }
+}
+
+void asm_xtensa_call_ind(asm_xtensa_t *as, uint idx) {
+    asm_xtensa_l32i_optimised(as, ASM_XTENSA_REG_A0, ASM_XTENSA_REG_FUN_TABLE, idx);
     asm_xtensa_op_callx0(as, ASM_XTENSA_REG_A0);
 }
 
 void asm_xtensa_call_ind_win(asm_xtensa_t *as, uint idx) {
-    if (idx < 16) {
-        asm_xtensa_op_l32i_n(as, ASM_XTENSA_REG_A8, ASM_XTENSA_REG_FUN_TABLE_WIN, idx);
-    } else {
-        asm_xtensa_op_l32i(as, ASM_XTENSA_REG_A8, ASM_XTENSA_REG_FUN_TABLE_WIN, idx);
-    }
+    asm_xtensa_l32i_optimised(as, ASM_XTENSA_REG_A8, ASM_XTENSA_REG_FUN_TABLE_WIN, idx);
     asm_xtensa_op_callx8(as, ASM_XTENSA_REG_A8);
 }
 
diff --git a/python/src/py/asmxtensa.h b/python/src/py/asmxtensa.h
index 43f1b608e..5aa86d3b2 100644
--- a/python/src/py/asmxtensa.h
+++ b/python/src/py/asmxtensa.h
@@ -278,6 +278,8 @@ void asm_xtensa_mov_local_reg(asm_xtensa_t *as, int local_num, uint reg_src);
 void asm_xtensa_mov_reg_local(asm_xtensa_t *as, uint reg_dest, int local_num);
 void asm_xtensa_mov_reg_local_addr(asm_xtensa_t *as, uint reg_dest, int local_num);
 void asm_xtensa_mov_reg_pcrel(asm_xtensa_t *as, uint reg_dest, uint label);
+void asm_xtensa_l32i_optimised(asm_xtensa_t *as, uint reg_dest, uint reg_base, uint word_offset);
+void asm_xtensa_s32i_optimised(asm_xtensa_t *as, uint reg_src, uint reg_base, uint word_offset);
 void asm_xtensa_call_ind(asm_xtensa_t *as, uint idx);
 void asm_xtensa_call_ind_win(asm_xtensa_t *as, uint idx);
 
@@ -393,12 +395,13 @@ void asm_xtensa_call_ind_win(asm_xtensa_t *as, uint idx);
 #define ASM_SUB_REG_REG(as, reg_dest, reg_src) asm_xtensa_op_sub((as), (reg_dest), (reg_dest), (reg_src))
 #define ASM_MUL_REG_REG(as, reg_dest, reg_src) asm_xtensa_op_mull((as), (reg_dest), (reg_dest), (reg_src))
 
-#define ASM_LOAD_REG_REG_OFFSET(as, reg_dest, reg_base, word_offset) asm_xtensa_op_l32i_n((as), (reg_dest), (reg_base), (word_offset))
+#define ASM_LOAD_REG_REG_OFFSET(as, reg_dest, reg_base, word_offset) asm_xtensa_l32i_optimised((as), (reg_dest), (reg_base), (word_offset))
 #define ASM_LOAD8_REG_REG(as, reg_dest, reg_base) asm_xtensa_op_l8ui((as), (reg_dest), (reg_base), 0)
 #define ASM_LOAD16_REG_REG(as, reg_dest, reg_base) asm_xtensa_op_l16ui((as), (reg_dest), (reg_base), 0)
+#define ASM_LOAD16_REG_REG_OFFSET(as, reg_dest, reg_base, uint16_offset) asm_xtensa_op_l16ui((as), (reg_dest), (reg_base), (uint16_offset))
 #define ASM_LOAD32_REG_REG(as, reg_dest, reg_base) asm_xtensa_op_l32i_n((as), (reg_dest), (reg_base), 0)
 
-#define ASM_STORE_REG_REG_OFFSET(as, reg_dest, reg_base, word_offset) asm_xtensa_op_s32i_n((as), (reg_dest), (reg_base), (word_offset))
+#define ASM_STORE_REG_REG_OFFSET(as, reg_dest, reg_base, word_offset) asm_xtensa_s32i_optimised((as), (reg_dest), (reg_base), (word_offset))
 #define ASM_STORE8_REG_REG(as, reg_src, reg_base) asm_xtensa_op_s8i((as), (reg_src), (reg_base), 0)
 #define ASM_STORE16_REG_REG(as, reg_src, reg_base) asm_xtensa_op_s16i((as), (reg_src), (reg_base), 0)
 #define ASM_STORE32_REG_REG(as, reg_src, reg_base) asm_xtensa_op_s32i_n((as), (reg_src), (reg_base), 0)
diff --git a/python/src/py/bc.c b/python/src/py/bc.c
index 58694b97d..e002bca26 100644
--- a/python/src/py/bc.c
+++ b/python/src/py/bc.c
@@ -29,9 +29,9 @@
 #include <string.h>
 #include <assert.h>
 
-#include "py/runtime.h"
 #include "py/bc0.h"
 #include "py/bc.h"
+#include "py/objfun.h"
 
 #if MICROPY_DEBUG_VERBOSE // print debugging info
 #define DEBUG_PRINT (1)
@@ -40,7 +40,23 @@
 #define DEBUG_printf(...) (void)0
 #endif
 
-#if !MICROPY_PERSISTENT_CODE
+void mp_encode_uint(void *env, mp_encode_uint_allocator_t allocator, mp_uint_t val) {
+    // We store each 7 bits in a separate byte, and that's how many bytes are needed
+    byte buf[MP_ENCODE_UINT_MAX_BYTES];
+    byte *p = buf + sizeof(buf);
+    // We encode in little-endian order, but store in big-endian, to help decoding
+    do {
+        *--p = val & 0x7f;
+        val >>= 7;
+    } while (val != 0);
+    byte *c = allocator(env, buf + sizeof(buf) - p);
+    if (c != NULL) {
+        while (p != buf + sizeof(buf) - 1) {
+            *c++ = *p++ | 0x80;
+        }
+        *c = *p;
+    }
+}
 
 mp_uint_t mp_decode_uint(const byte **ptr) {
     mp_uint_t unum = 0;
@@ -72,8 +88,6 @@ const byte *mp_decode_uint_skip(const byte *ptr) {
     return ptr;
 }
 
-#endif
-
 STATIC NORETURN void fun_pos_args_mismatch(mp_obj_fun_bc_t *f, size_t expected, size_t given) {
     #if MICROPY_ERROR_REPORTING <= MICROPY_ERROR_REPORTING_TERSE
     // generic message, used also for other argument issues
@@ -107,46 +121,36 @@ STATIC void dump_args(const mp_obj_t *a, size_t sz) {
 // On entry code_state should be allocated somewhere (stack/heap) and
 // contain the following valid entries:
 //    - code_state->fun_bc should contain a pointer to the function object
-//    - code_state->ip should contain the offset in bytes from the pointer
-//      code_state->fun_bc->bytecode to the entry n_state (0 for bytecode, non-zero for native)
-void mp_setup_code_state(mp_code_state_t *code_state, size_t n_args, size_t n_kw, const mp_obj_t *args) {
+//    - code_state->ip should contain a pointer to the beginning of the prelude
+//    - code_state->sp should be: &code_state->state[0] - 1
+//    - code_state->n_state should be the number of objects in the local state
+STATIC void mp_setup_code_state_helper(mp_code_state_t *code_state, size_t n_args, size_t n_kw, const mp_obj_t *args) {
     // This function is pretty complicated.  It's main aim is to be efficient in speed and RAM
     // usage for the common case of positional only args.
 
     // get the function object that we want to set up (could be bytecode or native code)
     mp_obj_fun_bc_t *self = code_state->fun_bc;
 
-    // ip comes in as an offset into bytecode, so turn it into a true pointer
-    code_state->ip = self->bytecode + (size_t)code_state->ip;
-
-    #if MICROPY_STACKLESS
-    code_state->prev = NULL;
-    #endif
-
-    #if MICROPY_PY_SYS_SETTRACE
-    code_state->prev_state = NULL;
-    code_state->frame = NULL;
-    #endif
-
     // Get cached n_state (rather than decode it again)
     size_t n_state = code_state->n_state;
 
     // Decode prelude
     size_t n_state_unused, n_exc_stack_unused, scope_flags, n_pos_args, n_kwonly_args, n_def_pos_args;
     MP_BC_PRELUDE_SIG_DECODE_INTO(code_state->ip, n_state_unused, n_exc_stack_unused, scope_flags, n_pos_args, n_kwonly_args, n_def_pos_args);
+    MP_BC_PRELUDE_SIZE_DECODE(code_state->ip);
     (void)n_state_unused;
     (void)n_exc_stack_unused;
 
-    code_state->sp = &code_state->state[0] - 1;
+    mp_obj_t *code_state_state = code_state->sp + 1;
     code_state->exc_sp_idx = 0;
 
     // zero out the local stack to begin with
-    memset(code_state->state, 0, n_state * sizeof(*code_state->state));
+    memset(code_state_state, 0, n_state * sizeof(*code_state->state));
 
     const mp_obj_t *kwargs = args + n_args;
 
     // var_pos_kw_args points to the stack where the var-args tuple, and var-kw dict, should go (if they are needed)
-    mp_obj_t *var_pos_kw_args = &code_state->state[n_state - 1 - n_pos_args - n_kwonly_args];
+    mp_obj_t *var_pos_kw_args = &code_state_state[n_state - 1 - n_pos_args - n_kwonly_args];
 
     // check positional arguments
 
@@ -169,7 +173,7 @@ void mp_setup_code_state(mp_code_state_t *code_state, size_t n_args, size_t n_kw
             if (n_args >= (size_t)(n_pos_args - n_def_pos_args)) {
                 // given enough arguments, but may need to use some default arguments
                 for (size_t i = n_args; i < n_pos_args; i++) {
-                    code_state->state[n_state - 1 - i] = self->extra_args[i - (n_pos_args - n_def_pos_args)];
+                    code_state_state[n_state - 1 - i] = self->extra_args[i - (n_pos_args - n_def_pos_args)];
                 }
             } else {
                 fun_pos_args_mismatch(self, n_pos_args - n_def_pos_args, n_args);
@@ -179,14 +183,14 @@ void mp_setup_code_state(mp_code_state_t *code_state, size_t n_args, size_t n_kw
 
     // copy positional args into state
     for (size_t i = 0; i < n_args; i++) {
-        code_state->state[n_state - 1 - i] = args[i];
+        code_state_state[n_state - 1 - i] = args[i];
     }
 
     // check keyword arguments
 
     if (n_kw != 0 || (scope_flags & MP_SCOPE_FLAG_DEFKWARGS) != 0) {
         DEBUG_printf("Initial args: ");
-        dump_args(code_state->state + n_state - n_pos_args - n_kwonly_args, n_pos_args + n_kwonly_args);
+        dump_args(code_state_state + n_state - n_pos_args - n_kwonly_args, n_pos_args + n_kwonly_args);
 
         mp_obj_t dict = MP_OBJ_NULL;
         if ((scope_flags & MP_SCOPE_FLAG_VARKEYWORDS) != 0) {
@@ -194,19 +198,25 @@ void mp_setup_code_state(mp_code_state_t *code_state, size_t n_args, size_t n_kw
             *var_pos_kw_args = dict;
         }
 
-        // get pointer to arg_names array
-        const mp_obj_t *arg_names = (const mp_obj_t *)self->const_table;
-
         for (size_t i = 0; i < n_kw; i++) {
             // the keys in kwargs are expected to be qstr objects
             mp_obj_t wanted_arg_name = kwargs[2 * i];
+
+            // get pointer to arg_names array
+            const uint8_t *arg_names = code_state->ip;
+            arg_names = mp_decode_uint_skip(arg_names);
+
             for (size_t j = 0; j < n_pos_args + n_kwonly_args; j++) {
-                if (wanted_arg_name == arg_names[j]) {
-                    if (code_state->state[n_state - 1 - j] != MP_OBJ_NULL) {
+                qstr arg_qstr = mp_decode_uint(&arg_names);
+                #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+                arg_qstr = self->context->constants.qstr_table[arg_qstr];
+                #endif
+                if (wanted_arg_name == MP_OBJ_NEW_QSTR(arg_qstr)) {
+                    if (code_state_state[n_state - 1 - j] != MP_OBJ_NULL) {
                         mp_raise_msg_varg(&mp_type_TypeError,
                             MP_ERROR_TEXT("function got multiple values for argument '%q'"), MP_OBJ_QSTR_VALUE(wanted_arg_name));
                     }
-                    code_state->state[n_state - 1 - j] = kwargs[2 * i + 1];
+                    code_state_state[n_state - 1 - j] = kwargs[2 * i + 1];
                     goto continue2;
                 }
             }
@@ -224,10 +234,10 @@ void mp_setup_code_state(mp_code_state_t *code_state, size_t n_args, size_t n_kw
         }
 
         DEBUG_printf("Args with kws flattened: ");
-        dump_args(code_state->state + n_state - n_pos_args - n_kwonly_args, n_pos_args + n_kwonly_args);
+        dump_args(code_state_state + n_state - n_pos_args - n_kwonly_args, n_pos_args + n_kwonly_args);
 
         // fill in defaults for positional args
-        mp_obj_t *d = &code_state->state[n_state - n_pos_args];
+        mp_obj_t *d = &code_state_state[n_state - n_pos_args];
         mp_obj_t *s = &self->extra_args[n_def_pos_args - 1];
         for (size_t i = n_def_pos_args; i > 0; i--, d++, s--) {
             if (*d == MP_OBJ_NULL) {
@@ -236,29 +246,37 @@ void mp_setup_code_state(mp_code_state_t *code_state, size_t n_args, size_t n_kw
         }
 
         DEBUG_printf("Args after filling default positional: ");
-        dump_args(code_state->state + n_state - n_pos_args - n_kwonly_args, n_pos_args + n_kwonly_args);
+        dump_args(code_state_state + n_state - n_pos_args - n_kwonly_args, n_pos_args + n_kwonly_args);
 
         // Check that all mandatory positional args are specified
-        while (d < &code_state->state[n_state]) {
+        while (d < &code_state_state[n_state]) {
             if (*d++ == MP_OBJ_NULL) {
                 mp_raise_msg_varg(&mp_type_TypeError,
-                    MP_ERROR_TEXT("function missing required positional argument #%d"), &code_state->state[n_state] - d);
+                    MP_ERROR_TEXT("function missing required positional argument #%d"), &code_state_state[n_state] - d);
             }
         }
 
         // Check that all mandatory keyword args are specified
         // Fill in default kw args if we have them
+        const uint8_t *arg_names = mp_decode_uint_skip(code_state->ip);
+        for (size_t i = 0; i < n_pos_args; i++) {
+            arg_names = mp_decode_uint_skip(arg_names);
+        }
         for (size_t i = 0; i < n_kwonly_args; i++) {
-            if (code_state->state[n_state - 1 - n_pos_args - i] == MP_OBJ_NULL) {
+            qstr arg_qstr = mp_decode_uint(&arg_names);
+            #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+            arg_qstr = self->context->constants.qstr_table[arg_qstr];
+            #endif
+            if (code_state_state[n_state - 1 - n_pos_args - i] == MP_OBJ_NULL) {
                 mp_map_elem_t *elem = NULL;
                 if ((scope_flags & MP_SCOPE_FLAG_DEFKWARGS) != 0) {
-                    elem = mp_map_lookup(&((mp_obj_dict_t *)MP_OBJ_TO_PTR(self->extra_args[n_def_pos_args]))->map, arg_names[n_pos_args + i], MP_MAP_LOOKUP);
+                    elem = mp_map_lookup(&((mp_obj_dict_t *)MP_OBJ_TO_PTR(self->extra_args[n_def_pos_args]))->map, MP_OBJ_NEW_QSTR(arg_qstr), MP_MAP_LOOKUP);
                 }
                 if (elem != NULL) {
-                    code_state->state[n_state - 1 - n_pos_args - i] = elem->value;
+                    code_state_state[n_state - 1 - n_pos_args - i] = elem->value;
                 } else {
                     mp_raise_msg_varg(&mp_type_TypeError,
-                        MP_ERROR_TEXT("function missing required keyword argument '%q'"), MP_OBJ_QSTR_VALUE(arg_names[n_pos_args + i]));
+                        MP_ERROR_TEXT("function missing required keyword argument '%q'"), arg_qstr);
                 }
             }
         }
@@ -273,71 +291,49 @@ void mp_setup_code_state(mp_code_state_t *code_state, size_t n_args, size_t n_kw
         }
     }
 
-    // read the size part of the prelude
-    const byte *ip = code_state->ip;
-    MP_BC_PRELUDE_SIZE_DECODE(ip);
-
-    // jump over code info (source file and line-number mapping)
-    ip += n_info;
+    // jump over code info (simple name, argument names and line-number mapping)
+    const uint8_t *ip = code_state->ip + n_info;
 
     // bytecode prelude: initialise closed over variables
     for (; n_cell; --n_cell) {
         size_t local_num = *ip++;
-        code_state->state[n_state - 1 - local_num] =
-            mp_obj_new_cell(code_state->state[n_state - 1 - local_num]);
+        code_state_state[n_state - 1 - local_num] =
+            mp_obj_new_cell(code_state_state[n_state - 1 - local_num]);
     }
 
-    #if !MICROPY_PERSISTENT_CODE
-    // so bytecode is aligned
-    ip = MP_ALIGN(ip, sizeof(mp_uint_t));
-    #endif
-
     // now that we skipped over the prelude, set the ip for the VM
     code_state->ip = ip;
 
     DEBUG_printf("Calling: n_pos_args=%d, n_kwonly_args=%d\n", n_pos_args, n_kwonly_args);
-    dump_args(code_state->state + n_state - n_pos_args - n_kwonly_args, n_pos_args + n_kwonly_args);
-    dump_args(code_state->state, n_state);
+    dump_args(code_state_state + n_state - n_pos_args - n_kwonly_args, n_pos_args + n_kwonly_args);
+    dump_args(code_state_state, n_state);
 }
 
-#if MICROPY_PERSISTENT_CODE_LOAD || MICROPY_PERSISTENT_CODE_SAVE
-
-// The following table encodes the number of bytes that a specific opcode
-// takes up.  Some opcodes have an extra byte, defined by MP_BC_MASK_EXTRA_BYTE.
-// There are 4 special opcodes that have an extra byte only when
-// MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE is enabled (and they take a qstr):
-//     MP_BC_LOAD_NAME
-//     MP_BC_LOAD_GLOBAL
-//     MP_BC_LOAD_ATTR
-//     MP_BC_STORE_ATTR
-uint mp_opcode_format(const byte *ip, size_t *opcode_size, bool count_var_uint) {
-    uint f = MP_BC_FORMAT(*ip);
-    const byte *ip_start = ip;
-    if (f == MP_BC_FORMAT_QSTR) {
-        if (MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE_DYNAMIC) {
-            if (*ip == MP_BC_LOAD_NAME
-                || *ip == MP_BC_LOAD_GLOBAL
-                || *ip == MP_BC_LOAD_ATTR
-                || *ip == MP_BC_STORE_ATTR) {
-                ip += 1;
-            }
-        }
-        ip += 3;
-    } else {
-        int extra_byte = (*ip & MP_BC_MASK_EXTRA_BYTE) == 0;
-        ip += 1;
-        if (f == MP_BC_FORMAT_VAR_UINT) {
-            if (count_var_uint) {
-                while ((*ip++ & 0x80) != 0) {
-                }
-            }
-        } else if (f == MP_BC_FORMAT_OFFSET) {
-            ip += 2;
-        }
-        ip += extra_byte;
-    }
-    *opcode_size = ip - ip_start;
-    return f;
+// On entry code_state should be allocated somewhere (stack/heap) and
+// contain the following valid entries:
+//    - code_state->fun_bc should contain a pointer to the function object
+//    - code_state->n_state should be the number of objects in the local state
+void mp_setup_code_state(mp_code_state_t *code_state, size_t n_args, size_t n_kw, const mp_obj_t *args) {
+    code_state->ip = code_state->fun_bc->bytecode;
+    code_state->sp = &code_state->state[0] - 1;
+    #if MICROPY_STACKLESS
+    code_state->prev = NULL;
+    #endif
+    #if MICROPY_PY_SYS_SETTRACE
+    code_state->prev_state = NULL;
+    code_state->frame = NULL;
+    #endif
+    mp_setup_code_state_helper(code_state, n_args, n_kw, args);
 }
 
-#endif // MICROPY_PERSISTENT_CODE_LOAD || MICROPY_PERSISTENT_CODE_SAVE
+#if MICROPY_EMIT_NATIVE
+// On entry code_state should be allocated somewhere (stack/heap) and
+// contain the following valid entries:
+//    - code_state->fun_bc should contain a pointer to the function object
+//    - code_state->ip should contain a pointer to the beginning of the prelude
+//    - code_state->n_state should be the number of objects in the local state
+void mp_setup_code_state_native(mp_code_state_native_t *code_state, size_t n_args, size_t n_kw, const mp_obj_t *args) {
+    code_state->sp = &code_state->state[0] - 1;
+    mp_setup_code_state_helper((mp_code_state_t *)code_state, n_args, n_kw, args);
+}
+#endif
diff --git a/python/src/py/bc.h b/python/src/py/bc.h
index ef5afeae1..6350eee52 100644
--- a/python/src/py/bc.h
+++ b/python/src/py/bc.h
@@ -28,7 +28,6 @@
 #define MICROPY_INCLUDED_PY_BC_H
 
 #include "py/runtime.h"
-#include "py/objfun.h"
 
 // bytecode layout:
 //
@@ -50,7 +49,9 @@
 //
 //  source info section:
 //      simple_name : var qstr
-//      source_file : var qstr
+//      argname0    : var qstr
+//      ...         : var qstr
+//      argnameN    : var qstr      N = num_pos_args + num_kwonly_args - 1
 //      <line number info>
 //
 //  closure section:
@@ -58,19 +59,16 @@
 //      ...         : byte
 //      local_numN  : byte          N = n_cells-1
 //
-//  <word alignment padding>        only needed if bytecode contains pointers
-//
 //  <bytecode>
 //
 //
 // constant table layout:
 //
-//  argname0        : obj (qstr)
-//  ...             : obj (qstr)
-//  argnameN        : obj (qstr)    N = num_pos_args + num_kwonly_args
 //  const0          : obj
 //  constN          : obj
 
+#define MP_ENCODE_UINT_MAX_BYTES ((MP_BYTES_PER_OBJ_WORD * 8 + 6) / 7)
+
 #define MP_BC_PRELUDE_SIG_ENCODE(S, E, scope, out_byte, out_env) \
     do {                                                            \
         /*// Get values to store in prelude */                      \
@@ -182,9 +180,9 @@ typedef struct _mp_bytecode_prelude_t {
     uint n_pos_args;
     uint n_kwonly_args;
     uint n_def_pos_args;
-    qstr qstr_block_name;
-    qstr qstr_source_file;
+    qstr qstr_block_name_idx;
     const byte *line_info;
+    const byte *line_info_top;
     const byte *opcodes;
 } mp_bytecode_prelude_t;
 
@@ -198,12 +196,46 @@ typedef struct _mp_exc_stack_t {
     mp_obj_base_t *prev_exc;
 } mp_exc_stack_t;
 
+// Constants associated with a module, to interface bytecode with runtime.
+typedef struct _mp_module_constants_t {
+    #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+    qstr_short_t *qstr_table;
+    #else
+    qstr source_file;
+    #endif
+    mp_obj_t *obj_table;
+} mp_module_constants_t;
+
+// State associated with a module.
+typedef struct _mp_module_context_t {
+    mp_obj_module_t module;
+    mp_module_constants_t constants;
+} mp_module_context_t;
+
+// Outer level struct defining a compiled module.
+typedef struct _mp_compiled_module_t {
+    const mp_module_context_t *context;
+    const struct _mp_raw_code_t *rc;
+    #if MICROPY_PERSISTENT_CODE_SAVE
+    bool has_native;
+    size_t n_qstr;
+    size_t n_obj;
+    #endif
+} mp_compiled_module_t;
+
+// Outer level struct defining a frozen module.
+typedef struct _mp_frozen_module_t {
+    const mp_module_constants_t constants;
+    const struct _mp_raw_code_t *rc;
+} mp_frozen_module_t;
+
+// State for an executing function.
 typedef struct _mp_code_state_t {
     // The fun_bc entry points to the underlying function object that is being executed.
     // It is needed to access the start of bytecode and the const_table.
     // It is also needed to prevent the GC from reclaiming the bytecode during execution,
     // because the ip pointer below will always point to the interior of the bytecode.
-    mp_obj_fun_bc_t *fun_bc;
+    struct _mp_obj_fun_bc_t *fun_bc;
     const byte *ip;
     mp_obj_t *sp;
     uint16_t n_state;
@@ -222,17 +254,37 @@ typedef struct _mp_code_state_t {
     // mp_exc_stack_t exc_state[0];
 } mp_code_state_t;
 
+// State for an executing native function (based on mp_code_state_t).
+typedef struct _mp_code_state_native_t {
+    struct _mp_obj_fun_bc_t *fun_bc;
+    const byte *ip;
+    mp_obj_t *sp;
+    uint16_t n_state;
+    uint16_t exc_sp_idx;
+    mp_obj_dict_t *old_globals;
+    mp_obj_t state[0];
+} mp_code_state_native_t;
+
+// Allocator may return NULL, in which case data is not stored (can be used to compute size).
+typedef uint8_t *(*mp_encode_uint_allocator_t)(void *env, size_t nbytes);
+
+void mp_encode_uint(void *env, mp_encode_uint_allocator_t allocator, mp_uint_t val);
 mp_uint_t mp_decode_uint(const byte **ptr);
 mp_uint_t mp_decode_uint_value(const byte *ptr);
 const byte *mp_decode_uint_skip(const byte *ptr);
 
-mp_vm_return_kind_t mp_execute_bytecode(mp_code_state_t *code_state, volatile mp_obj_t inject_exc);
+mp_vm_return_kind_t mp_execute_bytecode(mp_code_state_t *code_state,
+#ifndef __cplusplus
+    volatile
+#endif
+    mp_obj_t inject_exc);
 mp_code_state_t *mp_obj_fun_bc_prepare_codestate(mp_obj_t func, size_t n_args, size_t n_kw, const mp_obj_t *args);
 void mp_setup_code_state(mp_code_state_t *code_state, size_t n_args, size_t n_kw, const mp_obj_t *args);
-void mp_bytecode_print(const mp_print_t *print, const void *descr, const byte *code, mp_uint_t len, const mp_uint_t *const_table);
-void mp_bytecode_print2(const mp_print_t *print, const byte *code, size_t len, const mp_uint_t *const_table);
-const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip);
-#define mp_bytecode_print_inst(print, code, const_table) mp_bytecode_print2(print, code, 1, const_table)
+void mp_setup_code_state_native(mp_code_state_native_t *code_state, size_t n_args, size_t n_kw, const mp_obj_t *args);
+void mp_bytecode_print(const mp_print_t *print, const struct _mp_raw_code_t *rc, const mp_module_constants_t *cm);
+void mp_bytecode_print2(const mp_print_t *print, const byte *ip, size_t len, struct _mp_raw_code_t *const *child_table, const mp_module_constants_t *cm);
+const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip_start, const byte *ip, struct _mp_raw_code_t *const *child_table, const mp_module_constants_t *cm);
+#define mp_bytecode_print_inst(print, code, x_table) mp_bytecode_print2(print, code, 1, x_table)
 
 // Helper macros to access pointer with least significant bits holding flags
 #define MP_TAGPTR_PTR(x) ((void *)((uintptr_t)(x) & ~((uintptr_t)3)))
@@ -240,16 +292,26 @@ const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip);
 #define MP_TAGPTR_TAG1(x) ((uintptr_t)(x) & 2)
 #define MP_TAGPTR_MAKE(ptr, tag) ((void *)((uintptr_t)(ptr) | (tag)))
 
-#if MICROPY_PERSISTENT_CODE_LOAD || MICROPY_PERSISTENT_CODE_SAVE
+static inline void mp_module_context_alloc_tables(mp_module_context_t *context, size_t n_qstr, size_t n_obj) {
+    #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+    size_t nq = (n_qstr * sizeof(qstr_short_t) + sizeof(mp_uint_t) - 1) / sizeof(mp_uint_t);
+    size_t no = n_obj;
+    mp_uint_t *mem = m_new(mp_uint_t, nq + no);
+    context->constants.qstr_table = (qstr_short_t *)mem;
+    context->constants.obj_table = (mp_obj_t *)(mem + nq);
+    #else
+    if (n_obj == 0) {
+        context->constants.obj_table = NULL;
+    } else {
+        context->constants.obj_table = m_new(mp_obj_t, n_obj);
+    }
+    #endif
+}
 
-uint mp_opcode_format(const byte *ip, size_t *opcode_size, bool count_var_uint);
-
-#endif
-
-static inline size_t mp_bytecode_get_source_line(const byte *line_info, size_t bc_offset) {
+static inline size_t mp_bytecode_get_source_line(const byte *line_info, const byte *line_info_top, size_t bc_offset) {
     size_t source_line = 1;
-    size_t c;
-    while ((c = *line_info)) {
+    while (line_info < line_info_top) {
+        size_t c = *line_info;
         size_t b, l;
         if ((c & 0x80) == 0) {
             // 0b0LLBBBBB encoding
diff --git a/python/src/py/bc0.h b/python/src/py/bc0.h
index 842034ebf..a4a0acf93 100644
--- a/python/src/py/bc0.h
+++ b/python/src/py/bc0.h
@@ -28,6 +28,18 @@
 
 // MicroPython bytecode opcodes, grouped based on the format of the opcode
 
+// All opcodes are encoded as a byte with an optional argument.  Arguments are
+// variable-length encoded so they can be as small as possible.  The possible
+// encodings for arguments are (ip[0] is the opcode):
+//
+//  - unsigned relative bytecode offset:
+//      - if ip[1] high bit is clear then: arg = ip[1]
+//      - if ip[1] high bit is set then:   arg = ip[1] & 0x7f | ip[2] << 7
+//
+//  - signed relative bytecode offset:
+//      - if ip[1] high bit is clear then: arg = ip[1] - 0x40
+//      - if ip[1] high bit is set then:   arg = (ip[1] & 0x7f | ip[2] << 7) - 0x4000
+
 #define MP_BC_MASK_FORMAT                   (0xf0)
 #define MP_BC_MASK_EXTRA_BYTE               (0x9e)
 
@@ -101,17 +113,17 @@
 #define MP_BC_ROT_TWO                       (MP_BC_BASE_BYTE_O + 0x0a)
 #define MP_BC_ROT_THREE                     (MP_BC_BASE_BYTE_O + 0x0b)
 
-#define MP_BC_JUMP                          (MP_BC_BASE_JUMP_E + 0x02) // rel byte code offset, 16-bit signed, in excess
-#define MP_BC_POP_JUMP_IF_TRUE              (MP_BC_BASE_JUMP_E + 0x03) // rel byte code offset, 16-bit signed, in excess
-#define MP_BC_POP_JUMP_IF_FALSE             (MP_BC_BASE_JUMP_E + 0x04) // rel byte code offset, 16-bit signed, in excess
-#define MP_BC_JUMP_IF_TRUE_OR_POP           (MP_BC_BASE_JUMP_E + 0x05) // rel byte code offset, 16-bit signed, in excess
-#define MP_BC_JUMP_IF_FALSE_OR_POP          (MP_BC_BASE_JUMP_E + 0x06) // rel byte code offset, 16-bit signed, in excess
-#define MP_BC_UNWIND_JUMP                   (MP_BC_BASE_JUMP_E + 0x00) // rel byte code offset, 16-bit signed, in excess; then a byte
-#define MP_BC_SETUP_WITH                    (MP_BC_BASE_JUMP_E + 0x07) // rel byte code offset, 16-bit unsigned
-#define MP_BC_SETUP_EXCEPT                  (MP_BC_BASE_JUMP_E + 0x08) // rel byte code offset, 16-bit unsigned
-#define MP_BC_SETUP_FINALLY                 (MP_BC_BASE_JUMP_E + 0x09) // rel byte code offset, 16-bit unsigned
-#define MP_BC_POP_EXCEPT_JUMP               (MP_BC_BASE_JUMP_E + 0x0a) // rel byte code offset, 16-bit unsigned
-#define MP_BC_FOR_ITER                      (MP_BC_BASE_JUMP_E + 0x0b) // rel byte code offset, 16-bit unsigned
+#define MP_BC_UNWIND_JUMP                   (MP_BC_BASE_JUMP_E + 0x00) // signed relative bytecode offset; then a byte
+#define MP_BC_JUMP                          (MP_BC_BASE_JUMP_E + 0x02) // signed relative bytecode offset
+#define MP_BC_POP_JUMP_IF_TRUE              (MP_BC_BASE_JUMP_E + 0x03) // signed relative bytecode offset
+#define MP_BC_POP_JUMP_IF_FALSE             (MP_BC_BASE_JUMP_E + 0x04) // signed relative bytecode offset
+#define MP_BC_JUMP_IF_TRUE_OR_POP           (MP_BC_BASE_JUMP_E + 0x05) // unsigned relative bytecode offset
+#define MP_BC_JUMP_IF_FALSE_OR_POP          (MP_BC_BASE_JUMP_E + 0x06) // unsigned relative bytecode offset
+#define MP_BC_SETUP_WITH                    (MP_BC_BASE_JUMP_E + 0x07) // unsigned relative bytecode offset
+#define MP_BC_SETUP_EXCEPT                  (MP_BC_BASE_JUMP_E + 0x08) // unsigned relative bytecode offset
+#define MP_BC_SETUP_FINALLY                 (MP_BC_BASE_JUMP_E + 0x09) // unsigned relative bytecode offset
+#define MP_BC_POP_EXCEPT_JUMP               (MP_BC_BASE_JUMP_E + 0x0a) // unsigned relative bytecode offset
+#define MP_BC_FOR_ITER                      (MP_BC_BASE_JUMP_E + 0x0b) // unsigned relative bytecode offset
 #define MP_BC_WITH_CLEANUP                  (MP_BC_BASE_BYTE_O + 0x0c)
 #define MP_BC_END_FINALLY                   (MP_BC_BASE_BYTE_O + 0x0d)
 #define MP_BC_GET_ITER                      (MP_BC_BASE_BYTE_O + 0x0e)
diff --git a/python/src/py/builtin.h b/python/src/py/builtin.h
index 1e4769cd6..a6f824ca2 100644
--- a/python/src/py/builtin.h
+++ b/python/src/py/builtin.h
@@ -28,8 +28,43 @@
 
 #include "py/obj.h"
 
-mp_obj_t mp_builtin___import__(size_t n_args, const mp_obj_t *args);
+typedef enum {
+    MP_IMPORT_STAT_NO_EXIST,
+    MP_IMPORT_STAT_DIR,
+    MP_IMPORT_STAT_FILE,
+} mp_import_stat_t;
+
+#if MICROPY_VFS
+
+// Delegate to the VFS for import stat and builtin open.
+
+#define mp_builtin_open_obj mp_vfs_open_obj
+
+mp_import_stat_t mp_vfs_import_stat(const char *path);
+mp_obj_t mp_vfs_open(size_t n_args, const mp_obj_t *args, mp_map_t *kwargs);
+
+MP_DECLARE_CONST_FUN_OBJ_KW(mp_vfs_open_obj);
+
+static inline mp_import_stat_t mp_import_stat(const char *path) {
+    return mp_vfs_import_stat(path);
+}
+
+static inline mp_obj_t mp_builtin_open(size_t n_args, const mp_obj_t *args, mp_map_t *kwargs) {
+    return mp_vfs_open(n_args, args, kwargs);
+}
+
+#else
+
+// A port can provide implementations of these functions.
+mp_import_stat_t mp_import_stat(const char *path);
 mp_obj_t mp_builtin_open(size_t n_args, const mp_obj_t *args, mp_map_t *kwargs);
+
+// A port can provide this object.
+MP_DECLARE_CONST_FUN_OBJ_KW(mp_builtin_open_obj);
+
+#endif
+
+mp_obj_t mp_builtin___import__(size_t n_args, const mp_obj_t *args);
 mp_obj_t mp_micropython_mem_info(size_t n_args, const mp_obj_t *args);
 
 MP_DECLARE_CONST_FUN_OBJ_VAR(mp_builtin___build_class___obj);
@@ -76,9 +111,7 @@ MP_DECLARE_CONST_FUN_OBJ_1(mp_builtin_repr_obj);
 MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(mp_builtin_round_obj);
 MP_DECLARE_CONST_FUN_OBJ_KW(mp_builtin_sorted_obj);
 MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(mp_builtin_sum_obj);
-// Defined by a port, but declared here for simplicity
 MP_DECLARE_CONST_FUN_OBJ_VAR_BETWEEN(mp_builtin_input_obj);
-MP_DECLARE_CONST_FUN_OBJ_KW(mp_builtin_open_obj);
 
 MP_DECLARE_CONST_FUN_OBJ_2(mp_namedtuple_obj);
 
@@ -108,6 +141,7 @@ extern const mp_obj_module_t mp_module_uerrno;
 extern const mp_obj_module_t mp_module_uctypes;
 extern const mp_obj_module_t mp_module_uzlib;
 extern const mp_obj_module_t mp_module_ujson;
+extern const mp_obj_module_t mp_module_uos;
 extern const mp_obj_module_t mp_module_ure;
 extern const mp_obj_module_t mp_module_uheapq;
 extern const mp_obj_module_t mp_module_uhashlib;
@@ -124,6 +158,7 @@ extern const mp_obj_module_t mp_module_webrepl;
 extern const mp_obj_module_t mp_module_framebuf;
 extern const mp_obj_module_t mp_module_btree;
 extern const mp_obj_module_t mp_module_ubluetooth;
+extern const mp_obj_module_t mp_module_uplatform;
 
 extern const char MICROPY_PY_BUILTINS_HELP_TEXT[];
 
diff --git a/python/src/py/builtinevex.c b/python/src/py/builtinevex.c
index 800a20223..73b77b40b 100644
--- a/python/src/py/builtinevex.c
+++ b/python/src/py/builtinevex.c
@@ -54,7 +54,7 @@ STATIC mp_obj_t code_execute(mp_obj_code_t *self, mp_obj_dict_t *globals, mp_obj
     // the correct one
     if (mp_obj_is_type(self->module_fun, &mp_type_fun_bc)) {
         mp_obj_fun_bc_t *fun_bc = MP_OBJ_TO_PTR(self->module_fun);
-        fun_bc->globals = globals;
+        ((mp_module_context_t *)fun_bc->context)->module.globals = globals;
     }
 
     // execute code
@@ -103,8 +103,7 @@ STATIC mp_obj_t mp_builtin_compile(size_t n_args, const mp_obj_t *args) {
             mp_raise_ValueError(MP_ERROR_TEXT("bad compile mode"));
     }
 
-    mp_obj_code_t *code = m_new_obj(mp_obj_code_t);
-    code->base.type = &mp_type_code;
+    mp_obj_code_t *code = mp_obj_malloc(mp_obj_code_t, &mp_type_code);
     code->module_fun = mp_parse_compile_execute(lex, parse_input_kind, NULL, NULL);
     return MP_OBJ_FROM_PTR(code);
 }
diff --git a/python/src/py/builtinhelp.c b/python/src/py/builtinhelp.c
index 13735635e..84d69caf3 100644
--- a/python/src/py/builtinhelp.c
+++ b/python/src/py/builtinhelp.c
@@ -67,10 +67,10 @@ STATIC void mp_help_add_from_map(mp_obj_t list, const mp_map_t *map) {
 #if MICROPY_MODULE_FROZEN
 STATIC void mp_help_add_from_names(mp_obj_t list, const char *name) {
     while (*name) {
-        size_t l = strlen(name);
+        size_t len = strlen(name);
         // name should end in '.py' and we strip it off
-        mp_obj_list_append(list, mp_obj_new_str(name, l - 3));
-        name += l + 1;
+        mp_obj_list_append(list, mp_obj_new_str(name, len - 3));
+        name += len + 1;
     }
 }
 #endif
@@ -80,14 +80,9 @@ STATIC void mp_help_print_modules(void) {
 
     mp_help_add_from_map(list, &mp_builtin_module_map);
 
-    #if MICROPY_MODULE_FROZEN_STR
-    extern const char mp_frozen_str_names[];
-    mp_help_add_from_names(list, mp_frozen_str_names);
-    #endif
-
-    #if MICROPY_MODULE_FROZEN_MPY
-    extern const char mp_frozen_mpy_names[];
-    mp_help_add_from_names(list, mp_frozen_mpy_names);
+    #if MICROPY_MODULE_FROZEN
+    extern const char mp_frozen_names[];
+    mp_help_add_from_names(list, mp_frozen_names);
     #endif
 
     // sort the list so it's printed in alphabetical order
diff --git a/python/src/py/builtinimport.c b/python/src/py/builtinimport.c
index cdee5e407..cd9636ccd 100644
--- a/python/src/py/builtinimport.c
+++ b/python/src/py/builtinimport.c
@@ -5,6 +5,7 @@
  *
  * Copyright (c) 2013-2019 Damien P. George
  * Copyright (c) 2014 Paul Sokolovsky
+ * Copyright (c) 2021 Jim Mussared
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -46,7 +47,11 @@
 
 #if MICROPY_ENABLE_EXTERNAL_IMPORT
 
-#define PATH_SEP_CHAR '/'
+// Must be a string of one byte.
+#define PATH_SEP_CHAR "/"
+
+// Virtual sys.path entry that maps to the frozen modules.
+#define MP_FROZEN_PATH_PREFIX ".frozen/"
 
 bool mp_obj_is_package(mp_obj_t module) {
     mp_obj_t dest[2];
@@ -54,27 +59,33 @@ bool mp_obj_is_package(mp_obj_t module) {
     return dest[0] != MP_OBJ_NULL;
 }
 
-// Stat either frozen or normal module by a given path
-// (whatever is available, if at all).
-STATIC mp_import_stat_t mp_import_stat_any(const char *path) {
+// Wrapper for mp_import_stat (which is provided by the port, and typically
+// uses mp_vfs_import_stat) to also search frozen modules. Given an exact
+// path to a file or directory (e.g. "foo/bar", "foo/bar.py" or "foo/bar.mpy"),
+// will return whether the path is a file, directory, or doesn't exist.
+STATIC mp_import_stat_t stat_path_or_frozen(const char *path) {
     #if MICROPY_MODULE_FROZEN
-    mp_import_stat_t st = mp_frozen_stat(path);
-    if (st != MP_IMPORT_STAT_NO_EXIST) {
-        return st;
+    // Only try and load as a frozen module if it starts with .frozen/.
+    const int frozen_path_prefix_len = strlen(MP_FROZEN_PATH_PREFIX);
+    if (strncmp(path, MP_FROZEN_PATH_PREFIX, frozen_path_prefix_len) == 0) {
+        return mp_find_frozen_module(path + frozen_path_prefix_len, NULL, NULL);
     }
     #endif
     return mp_import_stat(path);
 }
 
+// Given a path to a .py file, try and find this path as either a .py or .mpy
+// in either the filesystem or frozen modules.
 STATIC mp_import_stat_t stat_file_py_or_mpy(vstr_t *path) {
-    mp_import_stat_t stat = mp_import_stat_any(vstr_null_terminated_str(path));
+    mp_import_stat_t stat = stat_path_or_frozen(vstr_null_terminated_str(path));
     if (stat == MP_IMPORT_STAT_FILE) {
         return stat;
     }
 
     #if MICROPY_PERSISTENT_CODE_LOAD
+    // Didn't find .py -- try the .mpy instead by inserting an 'm' into the '.py'.
     vstr_ins_byte(path, path->len - 2, 'm');
-    stat = mp_import_stat_any(vstr_null_terminated_str(path));
+    stat = stat_path_or_frozen(vstr_null_terminated_str(path));
     if (stat == MP_IMPORT_STAT_FILE) {
         return stat;
     }
@@ -83,8 +94,10 @@ STATIC mp_import_stat_t stat_file_py_or_mpy(vstr_t *path) {
     return MP_IMPORT_STAT_NO_EXIST;
 }
 
+// Given an import path (e.g. "foo/bar"), try and find "foo/bar" (a directory)
+// or "foo/bar.(m)py" in either the filesystem or frozen modules.
 STATIC mp_import_stat_t stat_dir_or_file(vstr_t *path) {
-    mp_import_stat_t stat = mp_import_stat_any(vstr_null_terminated_str(path));
+    mp_import_stat_t stat = stat_path_or_frozen(vstr_null_terminated_str(path));
     DEBUG_printf("stat %s: %d\n", vstr_str(path), stat);
     if (stat == MP_IMPORT_STAT_DIR) {
         return stat;
@@ -95,14 +108,16 @@ STATIC mp_import_stat_t stat_dir_or_file(vstr_t *path) {
     return stat_file_py_or_mpy(path);
 }
 
-STATIC mp_import_stat_t find_file(const char *file_str, uint file_len, vstr_t *dest) {
+// Given a top-level module, try and find it in each of the sys.path entries
+// via stat_dir_or_file.
+STATIC mp_import_stat_t stat_top_level_dir_or_file(qstr mod_name, vstr_t *dest) {
+    DEBUG_printf("stat_top_level_dir_or_file: '%s'\n", qstr_str(mod_name));
     #if MICROPY_PY_SYS
-    // extract the list of paths
     size_t path_num;
     mp_obj_t *path_items;
     mp_obj_list_get(mp_sys_path, &path_num, &path_items);
 
-    if (path_num != 0) {
+    if (path_num > 0) {
         // go through each path looking for a directory or file
         for (size_t i = 0; i < path_num; i++) {
             vstr_reset(dest);
@@ -110,9 +125,9 @@ STATIC mp_import_stat_t find_file(const char *file_str, uint file_len, vstr_t *d
             const char *p = mp_obj_str_get_data(path_items[i], &p_len);
             if (p_len > 0) {
                 vstr_add_strn(dest, p, p_len);
-                vstr_add_char(dest, PATH_SEP_CHAR);
+                vstr_add_char(dest, PATH_SEP_CHAR[0]);
             }
-            vstr_add_strn(dest, file_str, file_len);
+            vstr_add_str(dest, qstr_str(mod_name));
             mp_import_stat_t stat = stat_dir_or_file(dest);
             if (stat != MP_IMPORT_STAT_NO_EXIST) {
                 return stat;
@@ -124,34 +139,35 @@ STATIC mp_import_stat_t find_file(const char *file_str, uint file_len, vstr_t *d
     }
     #endif
 
-    // mp_sys_path is empty, so just use the given file name
-    vstr_add_strn(dest, file_str, file_len);
+    // mp_sys_path is empty (or not enabled), so just stat the given path
+    // directly.
+    vstr_add_str(dest, qstr_str(mod_name));
     return stat_dir_or_file(dest);
 }
 
 #if MICROPY_MODULE_FROZEN_STR || MICROPY_ENABLE_COMPILER
-STATIC void do_load_from_lexer(mp_obj_t module_obj, mp_lexer_t *lex) {
+STATIC void do_load_from_lexer(mp_module_context_t *context, mp_lexer_t *lex) {
     #if MICROPY_PY___FILE__
     qstr source_name = lex->source_name;
-    mp_store_attr(module_obj, MP_QSTR___file__, MP_OBJ_NEW_QSTR(source_name));
+    mp_store_attr(MP_OBJ_FROM_PTR(&context->module), MP_QSTR___file__, MP_OBJ_NEW_QSTR(source_name));
     #endif
 
     // parse, compile and execute the module in its context
-    mp_obj_dict_t *mod_globals = mp_obj_module_get_globals(module_obj);
+    mp_obj_dict_t *mod_globals = context->module.globals;
     mp_parse_compile_execute(lex, MP_PARSE_FILE_INPUT, mod_globals, mod_globals);
 }
 #endif
 
 #if (MICROPY_HAS_FILE_READER && MICROPY_PERSISTENT_CODE_LOAD) || MICROPY_MODULE_FROZEN_MPY
-STATIC void do_execute_raw_code(mp_obj_t module_obj, mp_raw_code_t *raw_code, const char *source_name) {
+STATIC void do_execute_raw_code(mp_module_context_t *context, const mp_raw_code_t *rc, const mp_module_context_t *mc, const char *source_name) {
     (void)source_name;
 
     #if MICROPY_PY___FILE__
-    mp_store_attr(module_obj, MP_QSTR___file__, MP_OBJ_NEW_QSTR(qstr_from_str(source_name)));
+    mp_store_attr(MP_OBJ_FROM_PTR(&context->module), MP_QSTR___file__, MP_OBJ_NEW_QSTR(qstr_from_str(source_name)));
     #endif
 
     // execute the module in its context
-    mp_obj_dict_t *mod_globals = mp_obj_module_get_globals(module_obj);
+    mp_obj_dict_t *mod_globals = context->module.globals;
 
     // save context
     mp_obj_dict_t *volatile old_globals = mp_globals_get();
@@ -163,7 +179,7 @@ STATIC void do_execute_raw_code(mp_obj_t module_obj, mp_raw_code_t *raw_code, co
 
     nlr_buf_t nlr;
     if (nlr_push(&nlr) == 0) {
-        mp_obj_t module_fun = mp_make_function_from_raw_code(raw_code, MP_OBJ_NULL, MP_OBJ_NULL);
+        mp_obj_t module_fun = mp_make_function_from_raw_code(rc, mc, NULL);
         mp_call_function_0(module_fun);
 
         // finish nlr block, restore context
@@ -179,42 +195,49 @@ STATIC void do_execute_raw_code(mp_obj_t module_obj, mp_raw_code_t *raw_code, co
 }
 #endif
 
-STATIC void do_load(mp_obj_t module_obj, vstr_t *file) {
+STATIC void do_load(mp_module_context_t *module_obj, vstr_t *file) {
     #if MICROPY_MODULE_FROZEN || MICROPY_ENABLE_COMPILER || (MICROPY_PERSISTENT_CODE_LOAD && MICROPY_HAS_FILE_READER)
-    char *file_str = vstr_null_terminated_str(file);
+    const char *file_str = vstr_null_terminated_str(file);
     #endif
 
     // If we support frozen modules (either as str or mpy) then try to find the
     // requested filename in the list of frozen module filenames.
     #if MICROPY_MODULE_FROZEN
     void *modref;
-    int frozen_type = mp_find_frozen_module(file_str, file->len, &modref);
-    #endif
+    int frozen_type;
+    const int frozen_path_prefix_len = strlen(MP_FROZEN_PATH_PREFIX);
+    if (strncmp(file_str, MP_FROZEN_PATH_PREFIX, frozen_path_prefix_len) == 0) {
+        mp_find_frozen_module(file_str + frozen_path_prefix_len, &frozen_type, &modref);
 
-    // If we support frozen str modules and the compiler is enabled, and we
-    // found the filename in the list of frozen files, then load and execute it.
-    #if MICROPY_MODULE_FROZEN_STR
-    if (frozen_type == MP_FROZEN_STR) {
-        do_load_from_lexer(module_obj, modref);
-        return;
-    }
-    #endif
+        // If we support frozen str modules and the compiler is enabled, and we
+        // found the filename in the list of frozen files, then load and execute it.
+        #if MICROPY_MODULE_FROZEN_STR
+        if (frozen_type == MP_FROZEN_STR) {
+            do_load_from_lexer(module_obj, modref);
+            return;
+        }
+        #endif
 
-    // If we support frozen mpy modules and we found a corresponding file (and
-    // its data) in the list of frozen files, execute it.
-    #if MICROPY_MODULE_FROZEN_MPY
-    if (frozen_type == MP_FROZEN_MPY) {
-        do_execute_raw_code(module_obj, modref, file_str);
-        return;
+        // If we support frozen mpy modules and we found a corresponding file (and
+        // its data) in the list of frozen files, execute it.
+        #if MICROPY_MODULE_FROZEN_MPY
+        if (frozen_type == MP_FROZEN_MPY) {
+            const mp_frozen_module_t *frozen = modref;
+            module_obj->constants = frozen->constants;
+            do_execute_raw_code(module_obj, frozen->rc, module_obj, file_str + frozen_path_prefix_len);
+            return;
+        }
+        #endif
     }
-    #endif
+
+    #endif // MICROPY_MODULE_FROZEN
 
     // If we support loading .mpy files then check if the file extension is of
     // the correct format and, if so, load and execute the file.
     #if MICROPY_HAS_FILE_READER && MICROPY_PERSISTENT_CODE_LOAD
     if (file_str[file->len - 3] == 'm') {
-        mp_raw_code_t *raw_code = mp_raw_code_load_file(file_str);
-        do_execute_raw_code(module_obj, raw_code, file_str);
+        mp_compiled_module_t cm = mp_raw_code_load_file(file_str, module_obj);
+        do_execute_raw_code(module_obj, cm.rc, cm.context, file_str);
         return;
     }
     #endif
@@ -232,15 +255,216 @@ STATIC void do_load(mp_obj_t module_obj, vstr_t *file) {
     #endif
 }
 
-STATIC void chop_component(const char *start, const char **end) {
-    const char *p = *end;
-    while (p > start) {
+// Convert a relative (to the current module) import, going up "level" levels,
+// into an absolute import.
+STATIC void evaluate_relative_import(mp_int_t level, const char **module_name, size_t *module_name_len) {
+    // What we want to do here is to take the name of the current module,
+    // remove <level> trailing components, and concatenate the passed-in
+    // module name.
+    // For example, level=3, module_name="foo.bar", __name__="a.b.c.d" --> "a.foo.bar"
+    // "Relative imports use a module's __name__ attribute to determine that
+    // module's position in the package hierarchy."
+    // http://legacy.python.org/dev/peps/pep-0328/#relative-imports-and-name
+
+    mp_obj_t current_module_name_obj = mp_obj_dict_get(MP_OBJ_FROM_PTR(mp_globals_get()), MP_OBJ_NEW_QSTR(MP_QSTR___name__));
+    assert(current_module_name_obj != MP_OBJ_NULL);
+
+    #if MICROPY_MODULE_OVERRIDE_MAIN_IMPORT && MICROPY_CPYTHON_COMPAT
+    if (MP_OBJ_QSTR_VALUE(current_module_name_obj) == MP_QSTR___main__) {
+        // This is a module loaded by -m command-line switch (e.g. unix port),
+        // and so its __name__ has been set to "__main__". Get its real name
+        // that we stored during import in the __main__ attribute.
+        current_module_name_obj = mp_obj_dict_get(MP_OBJ_FROM_PTR(mp_globals_get()), MP_OBJ_NEW_QSTR(MP_QSTR___main__));
+    }
+    #endif
+
+    // If we have a __path__ in the globals dict, then we're a package.
+    bool is_pkg = mp_map_lookup(&mp_globals_get()->map, MP_OBJ_NEW_QSTR(MP_QSTR___path__), MP_MAP_LOOKUP);
+
+    #if DEBUG_PRINT
+    DEBUG_printf("Current module/package: ");
+    mp_obj_print_helper(MICROPY_DEBUG_PRINTER, current_module_name_obj, PRINT_REPR);
+    DEBUG_printf(", is_package: %d", is_pkg);
+    DEBUG_printf("\n");
+    #endif
+
+    size_t current_module_name_len;
+    const char *current_module_name = mp_obj_str_get_data(current_module_name_obj, &current_module_name_len);
+
+    const char *p = current_module_name + current_module_name_len;
+    if (is_pkg) {
+        // If we're evaluating relative to a package, then take off one fewer
+        // level (i.e. the relative search starts inside the package, rather
+        // than as a sibling of the package).
+        --level;
+    }
+
+    // Walk back 'level' dots (or run out of path).
+    while (level && p > current_module_name) {
         if (*--p == '.') {
-            *end = p;
-            return;
+            --level;
         }
     }
-    *end = p;
+
+    // We must have some component left over to import from.
+    if (p == current_module_name) {
+        mp_raise_msg(&mp_type_ImportError, MP_ERROR_TEXT("can't perform relative import"));
+    }
+
+    // New length is len("<chopped path>.<module_name>"). Note: might be one byte
+    // more than we need if module_name is empty (for the extra . we will
+    // append).
+    uint new_module_name_len = (size_t)(p - current_module_name) + 1 + *module_name_len;
+    char *new_mod = mp_local_alloc(new_module_name_len);
+    memcpy(new_mod, current_module_name, p - current_module_name);
+
+    // Only append ".<module_name>" if there was one.
+    if (*module_name_len != 0) {
+        new_mod[p - current_module_name] = '.';
+        memcpy(new_mod + (p - current_module_name) + 1, *module_name, *module_name_len);
+    } else {
+        --new_module_name_len;
+    }
+
+    // Copy into a QSTR.
+    qstr new_mod_q = qstr_from_strn(new_mod, new_module_name_len);
+    mp_local_free(new_mod);
+
+    DEBUG_printf("Resolved base name for relative import: '%s'\n", qstr_str(new_mod_q));
+    *module_name = qstr_str(new_mod_q);
+    *module_name_len = new_module_name_len;
+}
+
+// Load a module at the specified absolute path, possibly as a submodule of the given outer module.
+// full_mod_name:    The full absolute path to this module (e.g. "foo.bar.baz").
+// level_mod_name:   The final component of the path (e.g. "baz").
+// outer_module_obj: The parent module (we need to store this module as an
+//                   attribute on it) (or MP_OBJ_NULL for top-level).
+// path:             The filesystem path where we found the parent module
+//                   (or empty for a top level module).
+// override_main:    Whether to set the __name__ to "__main__" (and use __main__
+//                   for the actual path).
+STATIC mp_obj_t process_import_at_level(qstr full_mod_name, qstr level_mod_name, mp_obj_t outer_module_obj, vstr_t *path, bool override_main) {
+    mp_import_stat_t stat = MP_IMPORT_STAT_NO_EXIST;
+
+    // Exact-match of built-in (or already-loaded) takes priority.
+    mp_obj_t module_obj = mp_module_get_loaded_or_builtin(full_mod_name);
+
+    // Even if we find the module, go through the motions of searching for it
+    // because we may actually be in the process of importing a sub-module.
+    // So we need to (re-)find the correct path in which to look for the
+    // sub-module on the next iteration of process_import_at_level.
+
+    if (outer_module_obj == MP_OBJ_NULL) {
+        DEBUG_printf("Searching for top-level module\n");
+
+        // First module in the dotted-name; search for a directory or file
+        // relative to all the locations in sys.path.
+        stat = stat_top_level_dir_or_file(full_mod_name, path);
+
+        // If the module "foo" doesn't exist on the filesystem, and it's not a
+        // builtin, try and find "ufoo" as a built-in. (This feature was
+        // formerly known as "weak links").
+        #if MICROPY_MODULE_WEAK_LINKS
+        if (stat == MP_IMPORT_STAT_NO_EXIST && module_obj == MP_OBJ_NULL) {
+            char *umodule_buf = vstr_str(path);
+            umodule_buf[0] = 'u';
+            strcpy(umodule_buf + 1, qstr_str(level_mod_name));
+            qstr umodule_name = qstr_from_str(umodule_buf);
+            module_obj = mp_module_get_builtin(umodule_name);
+        }
+        #elif MICROPY_PY_SYS
+        if (stat == MP_IMPORT_STAT_NO_EXIST && module_obj == MP_OBJ_NULL && level_mod_name == MP_QSTR_sys) {
+            module_obj = MP_OBJ_FROM_PTR(&mp_module_sys);
+        }
+        #endif
+    } else {
+        DEBUG_printf("Searching for sub-module\n");
+
+        // Add the current part of the module name to the path.
+        vstr_add_char(path, PATH_SEP_CHAR[0]);
+        vstr_add_str(path, qstr_str(level_mod_name));
+
+        // Because it's not top level, we already know which path the parent was found in.
+        stat = stat_dir_or_file(path);
+    }
+    DEBUG_printf("Current path: %.*s\n", (int)vstr_len(path), vstr_str(path));
+
+    if (module_obj == MP_OBJ_NULL) {
+        // Not a built-in and not already-loaded.
+
+        if (stat == MP_IMPORT_STAT_NO_EXIST) {
+            // And the file wasn't found -- fail.
+            #if MICROPY_ERROR_REPORTING <= MICROPY_ERROR_REPORTING_TERSE
+            mp_raise_msg(&mp_type_ImportError, MP_ERROR_TEXT("module not found"));
+            #else
+            mp_raise_msg_varg(&mp_type_ImportError, MP_ERROR_TEXT("no module named '%q'"), full_mod_name);
+            #endif
+        }
+
+        // Not a built-in but found on the filesystem, try and load it.
+
+        DEBUG_printf("Found path: %.*s\n", (int)vstr_len(path), vstr_str(path));
+
+        // Prepare for loading from the filesystem. Create a new shell module.
+        module_obj = mp_obj_new_module(full_mod_name);
+
+        #if MICROPY_MODULE_OVERRIDE_MAIN_IMPORT
+        // If this module is being loaded via -m on unix, then
+        // override __name__ to "__main__". Do this only for *modules*
+        // however - packages never have their names replaced, instead
+        // they're -m'ed using a special __main__ submodule in them. (This all
+        // apparently is done to not touch the package name itself, which is
+        // important for future imports).
+        if (override_main && stat != MP_IMPORT_STAT_DIR) {
+            mp_obj_module_t *o = MP_OBJ_TO_PTR(module_obj);
+            mp_obj_dict_store(MP_OBJ_FROM_PTR(o->globals), MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR___main__));
+            #if MICROPY_CPYTHON_COMPAT
+            // Store module as "__main__" in the dictionary of loaded modules (returned by sys.modules).
+            mp_obj_dict_store(MP_OBJ_FROM_PTR(&MP_STATE_VM(mp_loaded_modules_dict)), MP_OBJ_NEW_QSTR(MP_QSTR___main__), module_obj);
+            // Store real name in "__main__" attribute. Need this for
+            // resolving relative imports later. "__main__" was chosen
+            // semi-randomly, to reuse existing qstr's.
+            mp_obj_dict_store(MP_OBJ_FROM_PTR(o->globals), MP_OBJ_NEW_QSTR(MP_QSTR___main__), MP_OBJ_NEW_QSTR(full_mod_name));
+            #endif
+        }
+        #endif // MICROPY_MODULE_OVERRIDE_MAIN_IMPORT
+
+        if (stat == MP_IMPORT_STAT_DIR) {
+            // Directory -- execute "path/__init__.py".
+            DEBUG_printf("%.*s is dir\n", (int)vstr_len(path), vstr_str(path));
+            // Store the __path__ attribute onto this module.
+            // https://docs.python.org/3/reference/import.html
+            // "Specifically, any module that contains a __path__ attribute is considered a package."
+            mp_store_attr(module_obj, MP_QSTR___path__, mp_obj_new_str(vstr_str(path), vstr_len(path)));
+            size_t orig_path_len = path->len;
+            vstr_add_str(path, PATH_SEP_CHAR "__init__.py");
+            if (stat_file_py_or_mpy(path) == MP_IMPORT_STAT_FILE) {
+                do_load(MP_OBJ_TO_PTR(module_obj), path);
+            } else {
+                // No-op. Nothing to load.
+                // mp_warning("%s is imported as namespace package", vstr_str(&path));
+            }
+            // Remove /__init__.py suffix.
+            path->len = orig_path_len;
+        } else { // MP_IMPORT_STAT_FILE
+            // File -- execute "path.(m)py".
+            do_load(MP_OBJ_TO_PTR(module_obj), path);
+            // Note: This should be the last component in the import path.  If
+            // there are remaining components then it's an ImportError
+            // because the current path (the module that was just loaded) is
+            // not a package.  This will be caught on the next iteration
+            // because the file will not exist.
+        }
+    }
+
+    if (outer_module_obj != MP_OBJ_NULL) {
+        // If it's a sub-module (not a built-in one), then make it available on
+        // the parent module.
+        mp_store_attr(outer_module_obj, level_mod_name, module_obj);
+    }
+
+    return module_obj;
 }
 
 mp_obj_t mp_builtin___import__(size_t n_args, const mp_obj_t *args) {
@@ -248,14 +472,28 @@ mp_obj_t mp_builtin___import__(size_t n_args, const mp_obj_t *args) {
     DEBUG_printf("__import__:\n");
     for (size_t i = 0; i < n_args; i++) {
         DEBUG_printf("  ");
-        mp_obj_print(args[i], PRINT_REPR);
+        mp_obj_print_helper(MICROPY_DEBUG_PRINTER, args[i], PRINT_REPR);
         DEBUG_printf("\n");
     }
     #endif
 
-    mp_obj_t module_name = args[0];
+    // This is the import path, with any leading dots stripped.
+    // "import foo.bar" --> module_name="foo.bar"
+    // "from foo.bar import baz" --> module_name="foo.bar"
+    // "from . import foo" --> module_name=""
+    // "from ...foo.bar import baz" --> module_name="foo.bar"
+    mp_obj_t module_name_obj = args[0];
+
+    // These are the imported names.
+    // i.e. "from foo.bar import baz, zap" --> fromtuple=("baz", "zap",)
+    // Note: There's a special case on the Unix port, where this is set to mp_const_false which means that it's __main__.
     mp_obj_t fromtuple = mp_const_none;
+
+    // Level is the number of leading dots in a relative import.
+    // i.e. "from . import foo" --> level=1
+    // i.e. "from ...foo.bar import baz" --> level=3
     mp_int_t level = 0;
+
     if (n_args >= 4) {
         fromtuple = args[3];
         if (n_args >= 5) {
@@ -266,211 +504,64 @@ mp_obj_t mp_builtin___import__(size_t n_args, const mp_obj_t *args) {
         }
     }
 
-    size_t mod_len;
-    const char *mod_str = mp_obj_str_get_data(module_name, &mod_len);
+    size_t module_name_len;
+    const char *module_name = mp_obj_str_get_data(module_name_obj, &module_name_len);
 
     if (level != 0) {
-        // What we want to do here is to take name of current module,
-        // chop <level> trailing components, and concatenate with passed-in
-        // module name, thus resolving relative import name into absolute.
-        // This even appears to be correct per
-        // http://legacy.python.org/dev/peps/pep-0328/#relative-imports-and-name
-        // "Relative imports use a module's __name__ attribute to determine that
-        // module's position in the package hierarchy."
-        level--;
-        mp_obj_t this_name_q = mp_obj_dict_get(MP_OBJ_FROM_PTR(mp_globals_get()), MP_OBJ_NEW_QSTR(MP_QSTR___name__));
-        assert(this_name_q != MP_OBJ_NULL);
-        #if MICROPY_CPYTHON_COMPAT
-        if (MP_OBJ_QSTR_VALUE(this_name_q) == MP_QSTR___main__) {
-            // This is a module run by -m command-line switch, get its real name from backup attribute
-            this_name_q = mp_obj_dict_get(MP_OBJ_FROM_PTR(mp_globals_get()), MP_OBJ_NEW_QSTR(MP_QSTR___main__));
-        }
-        #endif
-        mp_map_t *globals_map = &mp_globals_get()->map;
-        mp_map_elem_t *elem = mp_map_lookup(globals_map, MP_OBJ_NEW_QSTR(MP_QSTR___path__), MP_MAP_LOOKUP);
-        bool is_pkg = (elem != NULL);
-
-        #if DEBUG_PRINT
-        DEBUG_printf("Current module/package: ");
-        mp_obj_print(this_name_q, PRINT_REPR);
-        DEBUG_printf(", is_package: %d", is_pkg);
-        DEBUG_printf("\n");
-        #endif
-
-        size_t this_name_l;
-        const char *this_name = mp_obj_str_get_data(this_name_q, &this_name_l);
-
-        const char *p = this_name + this_name_l;
-        if (!is_pkg) {
-            // We have module, but relative imports are anchored at package, so
-            // go there.
-            chop_component(this_name, &p);
-        }
-
-        while (level--) {
-            chop_component(this_name, &p);
-        }
-
-        // We must have some component left over to import from
-        if (p == this_name) {
-            mp_raise_msg(&mp_type_ImportError, MP_ERROR_TEXT("can't perform relative import"));
-        }
-
-        uint new_mod_l = (mod_len == 0 ? (size_t)(p - this_name) : (size_t)(p - this_name) + 1 + mod_len);
-        char *new_mod = mp_local_alloc(new_mod_l);
-        memcpy(new_mod, this_name, p - this_name);
-        if (mod_len != 0) {
-            new_mod[p - this_name] = '.';
-            memcpy(new_mod + (p - this_name) + 1, mod_str, mod_len);
-        }
-
-        qstr new_mod_q = qstr_from_strn(new_mod, new_mod_l);
-        mp_local_free(new_mod);
-        DEBUG_printf("Resolved base name for relative import: '%s'\n", qstr_str(new_mod_q));
-        module_name = MP_OBJ_NEW_QSTR(new_mod_q);
-        mod_str = qstr_str(new_mod_q);
-        mod_len = new_mod_l;
+        // Turn "foo.bar" into "<current module minus 3 components>.foo.bar".
+        evaluate_relative_import(level, &module_name, &module_name_len);
     }
 
-    if (mod_len == 0) {
+    if (module_name_len == 0) {
         mp_raise_ValueError(NULL);
     }
 
-    // check if module already exists
-    qstr module_name_qstr = mp_obj_str_get_qstr(module_name);
-    mp_obj_t module_obj = mp_module_get(module_name_qstr);
-    if (module_obj != MP_OBJ_NULL) {
-        DEBUG_printf("Module already loaded\n");
-        // If it's not a package, return module right away
-        char *p = strchr(mod_str, '.');
-        if (p == NULL) {
-            return module_obj;
-        }
-        // If fromlist is not empty, return leaf module
-        if (fromtuple != mp_const_none) {
-            return module_obj;
-        }
-        // Otherwise, we need to return top-level package
-        qstr pkg_name = qstr_from_strn(mod_str, p - mod_str);
-        return mp_module_get(pkg_name);
-    }
-    DEBUG_printf("Module not yet loaded\n");
+    DEBUG_printf("Starting module search for '%s'\n", module_name);
 
-    uint last = 0;
     VSTR_FIXED(path, MICROPY_ALLOC_PATH_MAX)
-    module_obj = MP_OBJ_NULL;
     mp_obj_t top_module_obj = MP_OBJ_NULL;
     mp_obj_t outer_module_obj = MP_OBJ_NULL;
-    uint i;
-    for (i = 1; i <= mod_len; i++) {
-        if (i == mod_len || mod_str[i] == '.') {
-            // create a qstr for the module name up to this depth
-            qstr mod_name = qstr_from_strn(mod_str, i);
-            DEBUG_printf("Processing module: %s\n", qstr_str(mod_name));
-            DEBUG_printf("Previous path: =%.*s=\n", vstr_len(&path), vstr_str(&path));
 
-            // find the file corresponding to the module name
-            mp_import_stat_t stat;
-            if (vstr_len(&path) == 0) {
-                // first module in the dotted-name; search for a directory or file
-                stat = find_file(mod_str, i, &path);
-            } else {
-                // latter module in the dotted-name; append to path
-                vstr_add_char(&path, PATH_SEP_CHAR);
-                vstr_add_strn(&path, mod_str + last, i - last);
-                stat = stat_dir_or_file(&path);
-            }
-            DEBUG_printf("Current path: %.*s\n", vstr_len(&path), vstr_str(&path));
+    // Search for the end of each component.
+    size_t current_component_start = 0;
+    for (size_t i = 1; i <= module_name_len; i++) {
+        if (i == module_name_len || module_name[i] == '.') {
+            // The module name up to this depth (e.g. foo.bar.baz).
+            qstr full_mod_name = qstr_from_strn(module_name, i);
+            // The current level name (e.g. baz).
+            qstr level_mod_name = qstr_from_strn(module_name + current_component_start, i - current_component_start);
 
-            if (stat == MP_IMPORT_STAT_NO_EXIST) {
-                module_obj = MP_OBJ_NULL;
-                #if MICROPY_MODULE_WEAK_LINKS
-                // check if there is a weak link to this module
-                if (i == mod_len) {
-                    module_obj = mp_module_search_umodule(mod_str);
-                    if (module_obj != MP_OBJ_NULL) {
-                        // found weak linked module
-                        mp_module_call_init(mod_name, module_obj);
-                    }
-                }
-                #endif
-                if (module_obj == MP_OBJ_NULL) {
-                    // couldn't find the file, so fail
-                    #if MICROPY_ERROR_REPORTING <= MICROPY_ERROR_REPORTING_TERSE
-                    mp_raise_msg(&mp_type_ImportError, MP_ERROR_TEXT("module not found"));
-                    #else
-                    mp_raise_msg_varg(&mp_type_ImportError, MP_ERROR_TEXT("no module named '%q'"), mod_name);
-                    #endif
-                }
-            } else {
-                // found the file, so get the module
-                module_obj = mp_module_get(mod_name);
-            }
+            DEBUG_printf("Processing module: '%s' at level '%s'\n", qstr_str(full_mod_name), qstr_str(level_mod_name));
+            DEBUG_printf("Previous path: =%.*s=\n", (int)vstr_len(&path), vstr_str(&path));
 
-            if (module_obj == MP_OBJ_NULL) {
-                // module not already loaded, so load it!
+            #if MICROPY_MODULE_OVERRIDE_MAIN_IMPORT
+            // On unix, if this is being loaded via -m (magic mp_const_false),
+            // then handle that if it's the final component.
+            bool override_main = (i == module_name_len && fromtuple == mp_const_false);
+            #else
+            bool override_main = false;
+            #endif
 
-                module_obj = mp_obj_new_module(mod_name);
+            // Import this module.
+            mp_obj_t module_obj = process_import_at_level(full_mod_name, level_mod_name, outer_module_obj, &path, override_main);
 
-                // if args[3] (fromtuple) has magic value False, set up
-                // this module for command-line "-m" option (set module's
-                // name to __main__ instead of real name). Do this only
-                // for *modules* however - packages never have their names
-                // replaced, instead they're -m'ed using a special __main__
-                // submodule in them. (This all apparently is done to not
-                // touch package name itself, which is important for future
-                // imports).
-                if (i == mod_len && fromtuple == mp_const_false && stat != MP_IMPORT_STAT_DIR) {
-                    mp_obj_module_t *o = MP_OBJ_TO_PTR(module_obj);
-                    mp_obj_dict_store(MP_OBJ_FROM_PTR(o->globals), MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(MP_QSTR___main__));
-                    #if MICROPY_CPYTHON_COMPAT
-                    // Store module as "__main__" in the dictionary of loaded modules (returned by sys.modules).
-                    mp_obj_dict_store(MP_OBJ_FROM_PTR(&MP_STATE_VM(mp_loaded_modules_dict)), MP_OBJ_NEW_QSTR(MP_QSTR___main__), module_obj);
-                    // Store real name in "__main__" attribute. Chosen semi-randonly, to reuse existing qstr's.
-                    mp_obj_dict_store(MP_OBJ_FROM_PTR(o->globals), MP_OBJ_NEW_QSTR(MP_QSTR___main__), MP_OBJ_NEW_QSTR(mod_name));
-                    #endif
-                }
-
-                if (stat == MP_IMPORT_STAT_DIR) {
-                    DEBUG_printf("%.*s is dir\n", vstr_len(&path), vstr_str(&path));
-                    // https://docs.python.org/3/reference/import.html
-                    // "Specifically, any module that contains a __path__ attribute is considered a package."
-                    mp_store_attr(module_obj, MP_QSTR___path__, mp_obj_new_str(vstr_str(&path), vstr_len(&path)));
-                    size_t orig_path_len = path.len;
-                    vstr_add_char(&path, PATH_SEP_CHAR);
-                    vstr_add_str(&path, "__init__.py");
-                    if (stat_file_py_or_mpy(&path) != MP_IMPORT_STAT_FILE) {
-                        // mp_warning("%s is imported as namespace package", vstr_str(&path));
-                    } else {
-                        do_load(module_obj, &path);
-                    }
-                    path.len = orig_path_len;
-                } else { // MP_IMPORT_STAT_FILE
-                    do_load(module_obj, &path);
-                    // This should be the last component in the import path.  If there are
-                    // remaining components then it's an ImportError because the current path
-                    // (the module that was just loaded) is not a package.  This will be caught
-                    // on the next iteration because the file will not exist.
-                }
-            }
-            if (outer_module_obj != MP_OBJ_NULL) {
-                qstr s = qstr_from_strn(mod_str + last, i - last);
-                mp_store_attr(outer_module_obj, s, module_obj);
-            }
+            // Set this as the parent module, and remember the top-level module if it's the first.
             outer_module_obj = module_obj;
             if (top_module_obj == MP_OBJ_NULL) {
                 top_module_obj = module_obj;
             }
-            last = i + 1;
+
+            current_component_start = i + 1;
         }
     }
 
-    // If fromlist is not empty, return leaf module
     if (fromtuple != mp_const_none) {
-        return module_obj;
+        // If fromtuple is not empty, return leaf module
+        return outer_module_obj;
+    } else {
+        // Otherwise, we need to return top-level package
+        return top_module_obj;
     }
-    // Otherwise, we need to return top-level package
-    return top_module_obj;
 }
 
 #else // MICROPY_ENABLE_EXTERNAL_IMPORT
@@ -483,17 +574,19 @@ mp_obj_t mp_builtin___import__(size_t n_args, const mp_obj_t *args) {
 
     // Check if module already exists, and return it if it does
     qstr module_name_qstr = mp_obj_str_get_qstr(args[0]);
-    mp_obj_t module_obj = mp_module_get(module_name_qstr);
+    mp_obj_t module_obj = mp_module_get_loaded_or_builtin(module_name_qstr);
     if (module_obj != MP_OBJ_NULL) {
         return module_obj;
     }
 
     #if MICROPY_MODULE_WEAK_LINKS
     // Check if there is a weak link to this module
-    module_obj = mp_module_search_umodule(qstr_str(module_name_qstr));
+    // Build "u<name>": zero-init plus a bounded copy truncates over-long names safely.
+    char umodule_buf[MICROPY_ALLOC_PATH_MAX] = "u";
+    strncpy(umodule_buf + 1, qstr_str(module_name_qstr), sizeof(umodule_buf) - 2);
+    qstr umodule_name_qstr = qstr_from_str(umodule_buf);
+    module_obj = mp_module_get_loaded_or_builtin(umodule_name_qstr);
     if (module_obj != MP_OBJ_NULL) {
-        // Found weak-linked module
-        mp_module_call_init(module_name_qstr, module_obj);
         return module_obj;
     }
     #endif
diff --git a/python/src/py/compile.c b/python/src/py/compile.c
index 3d5c5f21e..9cca5df40 100644
--- a/python/src/py/compile.c
+++ b/python/src/py/compile.c
@@ -35,7 +35,9 @@
 #include "py/compile.h"
 #include "py/runtime.h"
 #include "py/asmbase.h"
+#include "py/nativeglue.h"
 #include "py/persistentcode.h"
+#include "py/smallint.h"
 
 #if MICROPY_ENABLE_COMPILER
 
@@ -59,6 +61,12 @@ typedef enum {
 #undef DEF_RULE_NC
 } pn_kind_t;
 
+// Whether a mp_parse_node_struct_t that has pns->kind == PN_testlist_comp
+// corresponds to a list comprehension or generator.
+#define MP_PARSE_NODE_TESTLIST_COMP_HAS_COMP_FOR(pns) \
+    (MP_PARSE_NODE_STRUCT_NUM_NODES(pns) == 2 && \
+    MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[1], PN_comp_for))
+
 #define NEED_METHOD_TABLE MICROPY_EMIT_NATIVE
 
 #if NEED_METHOD_TABLE
@@ -82,7 +90,7 @@ typedef enum {
 #if MICROPY_EMIT_NATIVE && MICROPY_DYNAMIC_COMPILER
 
 #define NATIVE_EMITTER(f) emit_native_table[mp_dynamic_compiler.native_arch]->emit_##f
-#define NATIVE_EMITTER_TABLE emit_native_table[mp_dynamic_compiler.native_arch]
+#define NATIVE_EMITTER_TABLE (emit_native_table[mp_dynamic_compiler.native_arch])
 
 STATIC const emit_method_table_t *emit_native_table[] = {
     NULL,
@@ -115,7 +123,7 @@ STATIC const emit_method_table_t *emit_native_table[] = {
 #else
 #error "unknown native emitter"
 #endif
-#define NATIVE_EMITTER_TABLE &NATIVE_EMITTER(method_table)
+#define NATIVE_EMITTER_TABLE (&NATIVE_EMITTER(method_table))
 #endif
 
 #if MICROPY_EMIT_INLINE_ASM && MICROPY_DYNAMIC_COMPILER
@@ -156,8 +164,6 @@ STATIC const emit_inline_asm_method_table_t *emit_asm_table[] = {
 
 // elements in this struct are ordered to make it compact
 typedef struct _compiler_t {
-    qstr source_file;
-
     uint8_t is_repl;
     uint8_t pass; // holds enum type pass_kind_t
     uint8_t have_star;
@@ -188,8 +194,60 @@ typedef struct _compiler_t {
     emit_inline_asm_t *emit_inline_asm;                                   // current emitter for inline asm
     const emit_inline_asm_method_table_t *emit_inline_asm_method_table;   // current emit method table for inline asm
     #endif
+
+    mp_emit_common_t emit_common;
 } compiler_t;
 
+/******************************************************************************/
+// mp_emit_common_t helper functions
+// These are defined here so they can be inlined, to reduce code size.
+
+STATIC void mp_emit_common_init(mp_emit_common_t *emit, qstr source_file) {
+    #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+    // Set up the qstr map and register the source file name under index 0,
+    // which makes it the first entry of the qstr table.
+    mp_map_t *qstr_map = &emit->qstr_map;
+    mp_map_init(qstr_map, 1);
+    mp_map_lookup(qstr_map, MP_OBJ_NEW_QSTR(source_file), MP_MAP_LOOKUP_ADD_IF_NOT_FOUND)->value = MP_OBJ_NEW_SMALL_INT(0);
+    #endif
+    mp_obj_list_init(&emit->const_obj_list, 0);
+}
+
+STATIC void mp_emit_common_start_pass(mp_emit_common_t *emit, pass_kind_t pass) {
+    emit->pass = pass;
+    if (pass == MP_PASS_CODE_SIZE) {
+        // One m_new0 slot per child recorded in ct_cur_child; no children => NULL.
+        emit->children = (emit->ct_cur_child == 0)
+            ? NULL
+            : m_new0(mp_raw_code_t *, emit->ct_cur_child);
+    }
+    // Restart the child count for the pass that is beginning.
+    emit->ct_cur_child = 0;
+}
+
+STATIC void mp_emit_common_populate_module_context(mp_emit_common_t *emit, qstr source_file, mp_module_context_t *context) {
+    #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+    // Every filled map slot pairs a qstr (key) with its table index (value).
+    mp_map_t *qmap = &emit->qstr_map;
+    mp_module_context_alloc_tables(context, qmap->used, emit->const_obj_list.len);
+    for (size_t slot = 0; slot < qmap->alloc; ++slot) {
+        if (mp_map_slot_is_filled(qmap, slot)) {
+            size_t idx = MP_OBJ_SMALL_INT_VALUE(qmap->table[slot].value);
+            context->constants.qstr_table[idx] = MP_OBJ_QSTR_VALUE(qmap->table[slot].key);
+        }
+    }
+    #else
+    mp_module_context_alloc_tables(context, 0, emit->const_obj_list.len);
+    context->constants.source_file = source_file;
+    #endif
+    // Copy the collected constant objects into the context's object table.
+    for (size_t k = 0; k < emit->const_obj_list.len; ++k) {
+        context->constants.obj_table[k] = emit->const_obj_list.items[k];
+    }
+}
+
+/******************************************************************************/
+
 STATIC void compile_error_set_line(compiler_t *comp, mp_parse_node_t pn) {
     // if the line of the error is unknown then try to update it from the pn
     if (comp->compile_error_line == 0 && MP_PARSE_NODE_IS_STRUCT(pn)) {
@@ -240,7 +298,7 @@ STATIC void compile_decrease_except_level(compiler_t *comp) {
 }
 
 STATIC scope_t *scope_new_and_link(compiler_t *comp, scope_kind_t kind, mp_parse_node_t pn, uint emit_options) {
-    scope_t *scope = scope_new(kind, pn, comp->source_file, emit_options);
+    scope_t *scope = scope_new(kind, pn, emit_options);
     scope->parent = comp->scope_cur;
     scope->next = NULL;
     if (comp->scope_head == NULL) {
@@ -317,25 +375,13 @@ STATIC void compile_delete_id(compiler_t *comp, qstr qst) {
     }
 }
 
-STATIC void c_tuple(compiler_t *comp, mp_parse_node_t pn, mp_parse_node_struct_t *pns_list) {
-    int total = 0;
-    if (!MP_PARSE_NODE_IS_NULL(pn)) {
-        compile_node(comp, pn);
-        total += 1;
-    }
-    if (pns_list != NULL) {
-        int n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns_list);
-        for (int i = 0; i < n; i++) {
-            compile_node(comp, pns_list->nodes[i]);
-        }
-        total += n;
-    }
-    EMIT_ARG(build, total, MP_EMIT_BUILD_TUPLE);
-}
-
 STATIC void compile_generic_tuple(compiler_t *comp, mp_parse_node_struct_t *pns) {
     // a simple tuple expression
-    c_tuple(comp, MP_PARSE_NODE_NULL, pns);
+    size_t num_nodes = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
+    for (size_t i = 0; i < num_nodes; i++) {
+        compile_node(comp, pns->nodes[i]);
+    }
+    EMIT_ARG(build, num_nodes, MP_EMIT_BUILD_TUPLE);
 }
 
 STATIC void c_if_cond(compiler_t *comp, mp_parse_node_t pn, bool jump_if, int label) {
@@ -452,21 +498,14 @@ STATIC void c_assign_atom_expr(compiler_t *comp, mp_parse_node_struct_t *pns, as
     compile_syntax_error(comp, (mp_parse_node_t)pns, MP_ERROR_TEXT("can't assign to expression"));
 }
 
-// we need to allow for a caller passing in 1 initial node (node_head) followed by an array of nodes (nodes_tail)
-STATIC void c_assign_tuple(compiler_t *comp, mp_parse_node_t node_head, uint num_tail, mp_parse_node_t *nodes_tail) {
-    uint num_head = (node_head == MP_PARSE_NODE_NULL) ? 0 : 1;
-
+STATIC void c_assign_tuple(compiler_t *comp, uint num_tail, mp_parse_node_t *nodes_tail) {
     // look for star expression
     uint have_star_index = -1;
-    if (num_head != 0 && MP_PARSE_NODE_IS_STRUCT_KIND(node_head, PN_star_expr)) {
-        EMIT_ARG(unpack_ex, 0, num_tail);
-        have_star_index = 0;
-    }
     for (uint i = 0; i < num_tail; i++) {
         if (MP_PARSE_NODE_IS_STRUCT_KIND(nodes_tail[i], PN_star_expr)) {
             if (have_star_index == (uint)-1) {
-                EMIT_ARG(unpack_ex, num_head + i, num_tail - i - 1);
-                have_star_index = num_head + i;
+                EMIT_ARG(unpack_ex, i, num_tail - i - 1);
+                have_star_index = i;
             } else {
                 compile_syntax_error(comp, nodes_tail[i], MP_ERROR_TEXT("multiple *x in assignment"));
                 return;
@@ -474,17 +513,10 @@ STATIC void c_assign_tuple(compiler_t *comp, mp_parse_node_t node_head, uint num
         }
     }
     if (have_star_index == (uint)-1) {
-        EMIT_ARG(unpack_sequence, num_head + num_tail);
-    }
-    if (num_head != 0) {
-        if (0 == have_star_index) {
-            c_assign(comp, ((mp_parse_node_struct_t *)node_head)->nodes[0], ASSIGN_STORE);
-        } else {
-            c_assign(comp, node_head, ASSIGN_STORE);
-        }
+        EMIT_ARG(unpack_sequence, num_tail);
     }
     for (uint i = 0; i < num_tail; i++) {
-        if (num_head + i == have_star_index) {
+        if (i == have_star_index) {
             c_assign(comp, ((mp_parse_node_struct_t *)nodes_tail[i])->nodes[0], ASSIGN_STORE);
         } else {
             c_assign(comp, nodes_tail[i], ASSIGN_STORE);
@@ -526,7 +558,7 @@ STATIC void c_assign(compiler_t *comp, mp_parse_node_t pn, assign_kind_t assign_
                 if (assign_kind != ASSIGN_STORE) {
                     goto cannot_assign;
                 }
-                c_assign_tuple(comp, MP_PARSE_NODE_NULL, MP_PARSE_NODE_STRUCT_NUM_NODES(pns), pns->nodes);
+                c_assign_tuple(comp, MP_PARSE_NODE_STRUCT_NUM_NODES(pns), pns->nodes);
                 break;
 
             case PN_atom_paren:
@@ -551,13 +583,13 @@ STATIC void c_assign(compiler_t *comp, mp_parse_node_t pn, assign_kind_t assign_
                 }
                 if (MP_PARSE_NODE_IS_NULL(pns->nodes[0])) {
                     // empty list, assignment allowed
-                    c_assign_tuple(comp, MP_PARSE_NODE_NULL, 0, NULL);
+                    c_assign_tuple(comp, 0, NULL);
                 } else if (MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_testlist_comp)) {
                     pns = (mp_parse_node_struct_t *)pns->nodes[0];
                     goto testlist_comp;
                 } else {
                     // brackets around 1 item
-                    c_assign_tuple(comp, pns->nodes[0], 0, NULL);
+                    c_assign_tuple(comp, 1, pns->nodes);
                 }
                 break;
 
@@ -568,27 +600,10 @@ STATIC void c_assign(compiler_t *comp, mp_parse_node_t pn, assign_kind_t assign_
 
     testlist_comp:
         // lhs is a sequence
-        if (MP_PARSE_NODE_IS_STRUCT(pns->nodes[1])) {
-            mp_parse_node_struct_t *pns2 = (mp_parse_node_struct_t *)pns->nodes[1];
-            if (MP_PARSE_NODE_STRUCT_KIND(pns2) == PN_testlist_comp_3b) {
-                // sequence of one item, with trailing comma
-                assert(MP_PARSE_NODE_IS_NULL(pns2->nodes[0]));
-                c_assign_tuple(comp, pns->nodes[0], 0, NULL);
-            } else if (MP_PARSE_NODE_STRUCT_KIND(pns2) == PN_testlist_comp_3c) {
-                // sequence of many items
-                uint n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns2);
-                c_assign_tuple(comp, pns->nodes[0], n, pns2->nodes);
-            } else if (MP_PARSE_NODE_STRUCT_KIND(pns2) == PN_comp_for) {
-                goto cannot_assign;
-            } else {
-                // sequence with 2 items
-                goto sequence_with_2_items;
-            }
-        } else {
-            // sequence with 2 items
-        sequence_with_2_items:
-            c_assign_tuple(comp, MP_PARSE_NODE_NULL, 2, pns->nodes);
+        if (MP_PARSE_NODE_TESTLIST_COMP_HAS_COMP_FOR(pns)) {
+            goto cannot_assign;
         }
+        c_assign_tuple(comp, MP_PARSE_NODE_STRUCT_NUM_NODES(pns), pns->nodes);
         return;
     }
     return;
@@ -705,7 +720,7 @@ STATIC void compile_funcdef_lambdef_param(compiler_t *comp, mp_parse_node_t pn)
 
         } else {
             // this parameter has a default value
-            // in CPython, None (and True, False?) as default parameters are loaded with LOAD_NAME; don't understand why
+            // in CPython, None (and True, False?) as default parameters are loaded with LOAD_NAME; don't understand why
 
             if (comp->have_star) {
                 comp->num_dict_params += 1;
@@ -856,7 +871,7 @@ STATIC bool compile_built_in_decorator(compiler_t *comp, size_t name_len, mp_par
         compile_syntax_error(comp, name_nodes[1], MP_ERROR_TEXT("invalid micropython decorator"));
     }
 
-    #if MICROPY_DYNAMIC_COMPILER
+    #if MICROPY_EMIT_NATIVE && MICROPY_DYNAMIC_COMPILER
     if (*emit_options == MP_EMIT_OPT_NATIVE_PYTHON || *emit_options == MP_EMIT_OPT_VIPER) {
         if (emit_native_table[mp_dynamic_compiler.native_arch] == NULL) {
             compile_syntax_error(comp, name_nodes[1], MP_ERROR_TEXT("invalid arch"));
@@ -983,32 +998,11 @@ STATIC void c_del_stmt(compiler_t *comp, mp_parse_node_t pn) {
         } else {
             assert(MP_PARSE_NODE_IS_STRUCT_KIND(pn, PN_testlist_comp));
             mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
-            // TODO perhaps factorise testlist_comp code with other uses of PN_testlist_comp
-
-            if (MP_PARSE_NODE_IS_STRUCT(pns->nodes[1])) {
-                mp_parse_node_struct_t *pns1 = (mp_parse_node_struct_t *)pns->nodes[1];
-                if (MP_PARSE_NODE_STRUCT_KIND(pns1) == PN_testlist_comp_3b) {
-                    // sequence of one item, with trailing comma
-                    assert(MP_PARSE_NODE_IS_NULL(pns1->nodes[0]));
-                    c_del_stmt(comp, pns->nodes[0]);
-                } else if (MP_PARSE_NODE_STRUCT_KIND(pns1) == PN_testlist_comp_3c) {
-                    // sequence of many items
-                    int n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns1);
-                    c_del_stmt(comp, pns->nodes[0]);
-                    for (int i = 0; i < n; i++) {
-                        c_del_stmt(comp, pns1->nodes[i]);
-                    }
-                } else if (MP_PARSE_NODE_STRUCT_KIND(pns1) == PN_comp_for) {
-                    goto cannot_delete;
-                } else {
-                    // sequence with 2 items
-                    goto sequence_with_2_items;
-                }
-            } else {
-                // sequence with 2 items
-            sequence_with_2_items:
-                c_del_stmt(comp, pns->nodes[0]);
-                c_del_stmt(comp, pns->nodes[1]);
+            if (MP_PARSE_NODE_TESTLIST_COMP_HAS_COMP_FOR(pns)) {
+                goto cannot_delete;
+            }
+            for (size_t i = 0; i < MP_PARSE_NODE_STRUCT_NUM_NODES(pns); ++i) {
+                c_del_stmt(comp, pns->nodes[i]);
             }
         }
     } else {
@@ -1121,14 +1115,19 @@ STATIC void do_import_name(compiler_t *comp, mp_parse_node_t pn, qstr *q_base) {
             if (!is_as) {
                 *q_base = MP_PARSE_NODE_LEAF_ARG(pns->nodes[0]);
             }
-            int n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
-            int len = n - 1;
-            for (int i = 0; i < n; i++) {
+            size_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
+            if (n == 0) {
+                // There must be at least one node in this PN_dotted_name.
+                // Let the compiler know this so it doesn't warn, and can generate better code.
+                MP_UNREACHABLE;
+            }
+            size_t len = n - 1;
+            for (size_t i = 0; i < n; i++) {
                 len += qstr_len(MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]));
             }
             char *q_ptr = mp_local_alloc(len);
             char *str_dest = q_ptr;
-            for (int i = 0; i < n; i++) {
+            for (size_t i = 0; i < n; i++) {
                 if (i > 0) {
                     *str_dest++ = '.';
                 }
@@ -1141,7 +1140,7 @@ STATIC void do_import_name(compiler_t *comp, mp_parse_node_t pn, qstr *q_base) {
             mp_local_free(q_ptr);
             EMIT_ARG(import, q_full, MP_EMIT_IMPORT_NAME);
             if (is_as) {
-                for (int i = 1; i < n; i++) {
+                for (size_t i = 1; i < n; i++) {
                     EMIT_ARG(attr, MP_PARSE_NODE_LEAF_ARG(pns->nodes[i]), MP_EMIT_ATTR_LOAD);
                 }
             }
@@ -2383,24 +2382,36 @@ STATIC void compile_trailer_paren_helper(compiler_t *comp, mp_parse_node_t pn_ar
     int n_positional = n_positional_extra;
     uint n_keyword = 0;
     uint star_flags = 0;
-    mp_parse_node_struct_t *star_args_node = NULL, *dblstar_args_node = NULL;
+    mp_uint_t star_args = 0;
     for (size_t i = 0; i < n_args; i++) {
         if (MP_PARSE_NODE_IS_STRUCT(args[i])) {
             mp_parse_node_struct_t *pns_arg = (mp_parse_node_struct_t *)args[i];
             if (MP_PARSE_NODE_STRUCT_KIND(pns_arg) == PN_arglist_star) {
-                if (star_flags & MP_EMIT_STAR_FLAG_SINGLE) {
-                    compile_syntax_error(comp, (mp_parse_node_t)pns_arg, MP_ERROR_TEXT("can't have multiple *x"));
+                if (star_flags & MP_EMIT_STAR_FLAG_DOUBLE) {
+                    compile_syntax_error(comp, (mp_parse_node_t)pns_arg, MP_ERROR_TEXT("* arg after **"));
+                    return;
+                }
+                #if MICROPY_DYNAMIC_COMPILER
+                if (i >= (size_t)mp_dynamic_compiler.small_int_bits - 1)
+                #else
+                if (i >= MP_SMALL_INT_BITS - 1)
+                #endif
+                {
+                    // If there are not enough bits in a small int to fit the flag, then we consider
+                    // it a syntax error. It should be unlikely to have this many args in practice.
+                    compile_syntax_error(comp, (mp_parse_node_t)pns_arg, MP_ERROR_TEXT("too many args"));
                     return;
                 }
                 star_flags |= MP_EMIT_STAR_FLAG_SINGLE;
-                star_args_node = pns_arg;
+                star_args |= (mp_uint_t)1 << i;
+                compile_node(comp, pns_arg->nodes[0]);
+                n_positional++;
             } else if (MP_PARSE_NODE_STRUCT_KIND(pns_arg) == PN_arglist_dbl_star) {
-                if (star_flags & MP_EMIT_STAR_FLAG_DOUBLE) {
-                    compile_syntax_error(comp, (mp_parse_node_t)pns_arg, MP_ERROR_TEXT("can't have multiple **x"));
-                    return;
-                }
                 star_flags |= MP_EMIT_STAR_FLAG_DOUBLE;
-                dblstar_args_node = pns_arg;
+                // double-star args are stored as kw arg with a null key (see load_null below)
+                EMIT(load_null);
+                compile_node(comp, pns_arg->nodes[0]);
+                n_keyword++;
             } else if (MP_PARSE_NODE_STRUCT_KIND(pns_arg) == PN_argument) {
                 #if MICROPY_PY_ASSIGN_EXPR
                 if (MP_PARSE_NODE_IS_STRUCT_KIND(pns_arg->nodes[1], PN_argument_3)) {
@@ -2415,7 +2426,7 @@ STATIC void compile_trailer_paren_helper(compiler_t *comp, mp_parse_node_t pn_ar
                     }
                     EMIT_ARG(load_const_str, MP_PARSE_NODE_LEAF_ARG(pns_arg->nodes[0]));
                     compile_node(comp, pns_arg->nodes[1]);
-                    n_keyword += 1;
+                    n_keyword++;
                 } else {
                     compile_comprehension(comp, pns_arg, SCOPE_GEN_EXPR);
                     n_positional++;
@@ -2425,12 +2436,12 @@ STATIC void compile_trailer_paren_helper(compiler_t *comp, mp_parse_node_t pn_ar
             }
         } else {
         normal_argument:
-            if (star_flags) {
-                compile_syntax_error(comp, args[i], MP_ERROR_TEXT("non-keyword arg after */**"));
+            if (star_flags & MP_EMIT_STAR_FLAG_DOUBLE) {
+                compile_syntax_error(comp, args[i], MP_ERROR_TEXT("positional arg after **"));
                 return;
             }
             if (n_keyword > 0) {
-                compile_syntax_error(comp, args[i], MP_ERROR_TEXT("non-keyword arg after keyword arg"));
+                compile_syntax_error(comp, args[i], MP_ERROR_TEXT("positional arg after keyword arg"));
                 return;
             }
             compile_node(comp, args[i]);
@@ -2438,19 +2449,9 @@ STATIC void compile_trailer_paren_helper(compiler_t *comp, mp_parse_node_t pn_ar
         }
     }
 
-    // compile the star/double-star arguments if we had them
-    // if we had one but not the other then we load "null" as a place holder
     if (star_flags != 0) {
-        if (star_args_node == NULL) {
-            EMIT(load_null);
-        } else {
-            compile_node(comp, star_args_node->nodes[0]);
-        }
-        if (dblstar_args_node == NULL) {
-            EMIT(load_null);
-        } else {
-            compile_node(comp, dblstar_args_node->nodes[0]);
-        }
+        // one extra object that contains the star_args map
+        EMIT_ARG(load_const_small_int, star_args);
     }
 
     // emit the function/method call
@@ -2490,31 +2491,16 @@ STATIC void compile_comprehension(compiler_t *comp, mp_parse_node_struct_t *pns,
 STATIC void compile_atom_paren(compiler_t *comp, mp_parse_node_struct_t *pns) {
     if (MP_PARSE_NODE_IS_NULL(pns->nodes[0])) {
         // an empty tuple
-        c_tuple(comp, MP_PARSE_NODE_NULL, NULL);
+        EMIT_ARG(build, 0, MP_EMIT_BUILD_TUPLE);
     } else {
         assert(MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_testlist_comp));
         pns = (mp_parse_node_struct_t *)pns->nodes[0];
-        assert(!MP_PARSE_NODE_IS_NULL(pns->nodes[1]));
-        if (MP_PARSE_NODE_IS_STRUCT(pns->nodes[1])) {
-            mp_parse_node_struct_t *pns2 = (mp_parse_node_struct_t *)pns->nodes[1];
-            if (MP_PARSE_NODE_STRUCT_KIND(pns2) == PN_testlist_comp_3b) {
-                // tuple of one item, with trailing comma
-                assert(MP_PARSE_NODE_IS_NULL(pns2->nodes[0]));
-                c_tuple(comp, pns->nodes[0], NULL);
-            } else if (MP_PARSE_NODE_STRUCT_KIND(pns2) == PN_testlist_comp_3c) {
-                // tuple of many items
-                c_tuple(comp, pns->nodes[0], pns2);
-            } else if (MP_PARSE_NODE_STRUCT_KIND(pns2) == PN_comp_for) {
-                // generator expression
-                compile_comprehension(comp, pns, SCOPE_GEN_EXPR);
-            } else {
-                // tuple with 2 items
-                goto tuple_with_2_items;
-            }
+        if (MP_PARSE_NODE_TESTLIST_COMP_HAS_COMP_FOR(pns)) {
+            // generator expression
+            compile_comprehension(comp, pns, SCOPE_GEN_EXPR);
         } else {
-            // tuple with 2 items
-        tuple_with_2_items:
-            c_tuple(comp, MP_PARSE_NODE_NULL, pns);
+            // tuple with N items
+            compile_generic_tuple(comp, pns);
         }
     }
 }
@@ -2525,31 +2511,13 @@ STATIC void compile_atom_bracket(compiler_t *comp, mp_parse_node_struct_t *pns)
         EMIT_ARG(build, 0, MP_EMIT_BUILD_LIST);
     } else if (MP_PARSE_NODE_IS_STRUCT_KIND(pns->nodes[0], PN_testlist_comp)) {
         mp_parse_node_struct_t *pns2 = (mp_parse_node_struct_t *)pns->nodes[0];
-        if (MP_PARSE_NODE_IS_STRUCT(pns2->nodes[1])) {
-            mp_parse_node_struct_t *pns3 = (mp_parse_node_struct_t *)pns2->nodes[1];
-            if (MP_PARSE_NODE_STRUCT_KIND(pns3) == PN_testlist_comp_3b) {
-                // list of one item, with trailing comma
-                assert(MP_PARSE_NODE_IS_NULL(pns3->nodes[0]));
-                compile_node(comp, pns2->nodes[0]);
-                EMIT_ARG(build, 1, MP_EMIT_BUILD_LIST);
-            } else if (MP_PARSE_NODE_STRUCT_KIND(pns3) == PN_testlist_comp_3c) {
-                // list of many items
-                compile_node(comp, pns2->nodes[0]);
-                compile_generic_all_nodes(comp, pns3);
-                EMIT_ARG(build, 1 + MP_PARSE_NODE_STRUCT_NUM_NODES(pns3), MP_EMIT_BUILD_LIST);
-            } else if (MP_PARSE_NODE_STRUCT_KIND(pns3) == PN_comp_for) {
-                // list comprehension
-                compile_comprehension(comp, pns2, SCOPE_LIST_COMP);
-            } else {
-                // list with 2 items
-                goto list_with_2_items;
-            }
+        if (MP_PARSE_NODE_TESTLIST_COMP_HAS_COMP_FOR(pns2)) {
+            // list comprehension
+            compile_comprehension(comp, pns2, SCOPE_LIST_COMP);
         } else {
-            // list with 2 items
-        list_with_2_items:
-            compile_node(comp, pns2->nodes[0]);
-            compile_node(comp, pns2->nodes[1]);
-            EMIT_ARG(build, 2, MP_EMIT_BUILD_LIST);
+            // list with N items
+            compile_generic_all_nodes(comp, pns2);
+            EMIT_ARG(build, MP_PARSE_NODE_STRUCT_NUM_NODES(pns2), MP_EMIT_BUILD_LIST);
         }
     } else {
         // list with 1 item
@@ -2779,12 +2747,7 @@ STATIC void compile_atom_expr_await(compiler_t *comp, mp_parse_node_struct_t *pn
 #endif
 
 STATIC mp_obj_t get_const_object(mp_parse_node_struct_t *pns) {
-    #if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D
-    // nodes are 32-bit pointers, but need to extract 64-bit object
-    return (uint64_t)pns->nodes[0] | ((uint64_t)pns->nodes[1] << 32);
-    #else
-    return (mp_obj_t)pns->nodes[0];
-    #endif
+    return mp_parse_node_extract_const_object(pns);
 }
 
 STATIC void compile_const_object(compiler_t *comp, mp_parse_node_struct_t *pns) {
@@ -2809,23 +2772,7 @@ STATIC void compile_node(compiler_t *comp, mp_parse_node_t pn) {
         // pass
     } else if (MP_PARSE_NODE_IS_SMALL_INT(pn)) {
         mp_int_t arg = MP_PARSE_NODE_LEAF_SMALL_INT(pn);
-        #if MICROPY_DYNAMIC_COMPILER
-        mp_uint_t sign_mask = -((mp_uint_t)1 << (mp_dynamic_compiler.small_int_bits - 1));
-        if ((arg & sign_mask) == 0 || (arg & sign_mask) == sign_mask) {
-            // integer fits in target runtime's small-int
-            EMIT_ARG(load_const_small_int, arg);
-        } else {
-            // integer doesn't fit, so create a multi-precision int object
-            // (but only create the actual object on the last pass)
-            if (comp->pass != MP_PASS_EMIT) {
-                EMIT_ARG(load_const_obj, mp_const_none);
-            } else {
-                EMIT_ARG(load_const_obj, mp_obj_new_int_from_ll(arg));
-            }
-        }
-        #else
         EMIT_ARG(load_const_small_int, arg);
-        #endif
     } else if (MP_PARSE_NODE_IS_LEAF(pn)) {
         uintptr_t arg = MP_PARSE_NODE_LEAF_ARG(pn);
         switch (MP_PARSE_NODE_LEAF_KIND(pn)) {
@@ -2835,16 +2782,6 @@ STATIC void compile_node(compiler_t *comp, mp_parse_node_t pn) {
             case MP_PARSE_NODE_STRING:
                 EMIT_ARG(load_const_str, arg);
                 break;
-            case MP_PARSE_NODE_BYTES:
-                // only create and load the actual bytes object on the last pass
-                if (comp->pass != MP_PASS_EMIT) {
-                    EMIT_ARG(load_const_obj, mp_const_none);
-                } else {
-                    size_t len;
-                    const byte *data = qstr_data(arg, &len);
-                    EMIT_ARG(load_const_obj, mp_obj_new_bytes(data, len));
-                }
-                break;
             case MP_PARSE_NODE_TOKEN:
             default:
                 if (arg == MP_TOKEN_NEWLINE) {
@@ -3063,10 +3000,11 @@ STATIC void check_for_doc_string(compiler_t *comp, mp_parse_node_t pn) {
     #endif
 }
 
-STATIC void compile_scope(compiler_t *comp, scope_t *scope, pass_kind_t pass) {
+STATIC bool compile_scope(compiler_t *comp, scope_t *scope, pass_kind_t pass) {
     comp->pass = pass;
     comp->scope_cur = scope;
     comp->next_label = 0;
+    mp_emit_common_start_pass(&comp->emit_common, pass);
     EMIT_ARG(start_pass, pass, scope);
     reserve_labels_for_native(comp, 6); // used by native's start_pass
 
@@ -3220,10 +3158,12 @@ STATIC void compile_scope(compiler_t *comp, scope_t *scope, pass_kind_t pass) {
         EMIT(return_value);
     }
 
-    EMIT(end_pass);
+    bool pass_complete = EMIT(end_pass);
 
     // make sure we match all the exception levels
     assert(comp->cur_except_level == 0);
+
+    return pass_complete;
 }
 
 #if MICROPY_EMIT_INLINE_ASM
@@ -3387,9 +3327,10 @@ STATIC void compile_scope_inline_asm(compiler_t *comp, scope_t *scope, pass_kind
                 f, mp_asm_base_get_code_size((mp_asm_base_t *)comp->emit_inline_asm),
                 NULL,
                 #if MICROPY_PERSISTENT_CODE_SAVE
-                0, 0, 0, 0, NULL,
+                0,
+                0,
                 #endif
-                comp->scope_cur->num_pos_args, 0, type_sig);
+                0, comp->scope_cur->num_pos_args, type_sig);
         }
     }
 
@@ -3497,15 +3438,15 @@ STATIC void scope_compute_things(scope_t *scope) {
 #if !MICROPY_PERSISTENT_CODE_SAVE
 STATIC
 #endif
-mp_raw_code_t *mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_file, bool is_repl) {
+mp_compiled_module_t mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_file, bool is_repl, mp_module_context_t *context) {
     // put compiler state on the stack, it's relatively small
     compiler_t comp_state = {0};
     compiler_t *comp = &comp_state;
 
-    comp->source_file = source_file;
     comp->is_repl = is_repl;
     comp->break_label = INVALID_LABEL;
     comp->continue_label = INVALID_LABEL;
+    mp_emit_common_init(&comp->emit_common, source_file);
 
     // create the module scope
     #if MICROPY_EMIT_NATIVE
@@ -3516,9 +3457,9 @@ mp_raw_code_t *mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_f
     scope_t *module_scope = scope_new_and_link(comp, SCOPE_MODULE, parse_tree->root, emit_opt);
 
     // create standard emitter; it's used at least for MP_PASS_SCOPE
-    emit_t *emit_bc = emit_bc_new();
+    emit_t *emit_bc = emit_bc_new(&comp->emit_common);
 
-    // compile pass 1
+    // compile MP_PASS_SCOPE
     comp->emit = emit_bc;
     #if MICROPY_EMIT_NATIVE
     comp->emit_method_table = &emit_bc_method_table;
@@ -3556,7 +3497,7 @@ mp_raw_code_t *mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_f
     // set max number of labels now that it's calculated
     emit_bc_set_max_num_labels(emit_bc, max_num_labels);
 
-    // compile pass 2 and 3
+    // compile MP_PASS_STACK_SIZE, MP_PASS_CODE_SIZE, MP_PASS_EMIT
     #if MICROPY_EMIT_NATIVE
     emit_t *emit_native = NULL;
     #endif
@@ -3596,7 +3537,7 @@ mp_raw_code_t *mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_f
                 case MP_EMIT_OPT_NATIVE_PYTHON:
                 case MP_EMIT_OPT_VIPER:
                     if (emit_native == NULL) {
-                        emit_native = NATIVE_EMITTER(new)(&comp->compile_error, &comp->next_label, max_num_labels);
+                        emit_native = NATIVE_EMITTER(new)(&comp->emit_common, &comp->compile_error, &comp->next_label, max_num_labels);
                     }
                     comp->emit_method_table = NATIVE_EMITTER_TABLE;
                     comp->emit = emit_native;
@@ -3620,8 +3561,10 @@ mp_raw_code_t *mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_f
             }
 
             // final pass: emit code
+            // the emitter can request multiple of these passes
             if (comp->compile_error == MP_OBJ_NULL) {
-                compile_scope(comp, s, MP_PASS_EMIT);
+                while (!compile_scope(comp, s, MP_PASS_EMIT)) {
+                }
             }
         }
     }
@@ -3631,10 +3574,45 @@ mp_raw_code_t *mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_f
         // number for the start of this scope
         compile_error_set_line(comp, comp->scope_cur->pn);
         // add a traceback to the exception using relevant source info
-        mp_obj_exception_add_traceback(comp->compile_error, comp->source_file,
+        mp_obj_exception_add_traceback(comp->compile_error, source_file,
             comp->compile_error_line, comp->scope_cur->simple_name);
     }
 
+    // construct the global qstr/const table for this module
+    mp_compiled_module_t cm;
+    cm.rc = module_scope->raw_code;
+    cm.context = context;
+    #if MICROPY_PERSISTENT_CODE_SAVE
+    cm.has_native = false;
+    #if MICROPY_EMIT_NATIVE
+    if (emit_native != NULL) {
+        cm.has_native = true;
+    }
+    #endif
+    #if MICROPY_EMIT_INLINE_ASM
+    if (comp->emit_inline_asm != NULL) {
+        cm.has_native = true;
+    }
+    #endif
+    cm.n_qstr = comp->emit_common.qstr_map.used;
+    cm.n_obj = comp->emit_common.const_obj_list.len;
+    #endif
+    if (comp->compile_error == MP_OBJ_NULL) {
+        mp_emit_common_populate_module_context(&comp->emit_common, source_file, context);
+
+        #if MICROPY_DEBUG_PRINTERS
+        // now that the module context is valid, the raw codes can be printed
+        if (mp_verbose_flag >= 2) {
+            for (scope_t *s = comp->scope_head; s != NULL; s = s->next) {
+                mp_raw_code_t *rc = s->raw_code;
+                if (rc->kind == MP_CODE_BYTECODE) {
+                    mp_bytecode_print(&mp_plat_print, rc, &cm.context->constants);
+                }
+            }
+        }
+        #endif
+    }
+
     // free the emitters
 
     emit_bc_free(emit_bc);
@@ -3653,7 +3631,6 @@ mp_raw_code_t *mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_f
     mp_parse_tree_clear(parse_tree);
 
     // free the scopes
-    mp_raw_code_t *outer_raw_code = module_scope->raw_code;
     for (scope_t *s = module_scope; s;) {
         scope_t *next = s->next;
         scope_free(s);
@@ -3662,15 +3639,17 @@ mp_raw_code_t *mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_f
 
     if (comp->compile_error != MP_OBJ_NULL) {
         nlr_raise(comp->compile_error);
-    } else {
-        return outer_raw_code;
     }
+
+    return cm;
 }
 
 mp_obj_t mp_compile(mp_parse_tree_t *parse_tree, qstr source_file, bool is_repl) {
-    mp_raw_code_t *rc = mp_compile_to_raw_code(parse_tree, source_file, is_repl);
+    mp_module_context_t *context = m_new_obj(mp_module_context_t);
+    context->module.globals = mp_globals_get();
+    mp_compiled_module_t cm = mp_compile_to_raw_code(parse_tree, source_file, is_repl, context);
     // return function that executes the outer module
-    return mp_make_function_from_raw_code(rc, MP_OBJ_NULL, MP_OBJ_NULL);
+    return mp_make_function_from_raw_code(cm.rc, cm.context, NULL);
 }
 
 #endif // MICROPY_ENABLE_COMPILER
diff --git a/python/src/py/compile.h b/python/src/py/compile.h
index 1ad1f5e9c..ae87bf2a0 100644
--- a/python/src/py/compile.h
+++ b/python/src/py/compile.h
@@ -32,11 +32,12 @@
 
 // the compiler will raise an exception if an error occurred
 // the compiler will clear the parse tree before it returns
+// mp_globals_get() will be used for the context
 mp_obj_t mp_compile(mp_parse_tree_t *parse_tree, qstr source_file, bool is_repl);
 
 #if MICROPY_PERSISTENT_CODE_SAVE
 // this has the same semantics as mp_compile
-mp_raw_code_t *mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_file, bool is_repl);
+mp_compiled_module_t mp_compile_to_raw_code(mp_parse_tree_t *parse_tree, qstr source_file, bool is_repl, mp_module_context_t *globals);
 #endif
 
 // this is implemented in runtime.c
diff --git a/python/src/py/dynruntime.h b/python/src/py/dynruntime.h
index fdb91ed37..e3200a271 100644
--- a/python/src/py/dynruntime.h
+++ b/python/src/py/dynruntime.h
@@ -30,6 +30,7 @@
 // MicroPython runtime API defined in py/obj.h and py/runtime.h.
 
 #include "py/nativeglue.h"
+#include "py/objfun.h"
 #include "py/objstr.h"
 #include "py/objtype.h"
 
@@ -43,6 +44,7 @@
 #undef mp_const_none
 #undef mp_const_false
 #undef mp_const_true
+#undef mp_const_empty_bytes
 #undef mp_const_empty_tuple
 #undef nlr_raise
 
@@ -80,7 +82,11 @@ static inline void *m_realloc_dyn(void *ptr, size_t new_num_bytes) {
 #define MP_OBJ_NEW_QSTR(x) MP_OBJ_NEW_QSTR_##x
 
 #define mp_type_type                        (*mp_fun_table.type_type)
+#define mp_type_NoneType                    (*mp_obj_get_type(mp_const_none))
+#define mp_type_bool                        (*mp_obj_get_type(mp_const_false))
+#define mp_type_int                         (*(mp_obj_type_t *)(mp_load_global(MP_QSTR_int)))
 #define mp_type_str                         (*mp_fun_table.type_str)
+#define mp_type_bytes                       (*(mp_obj_type_t *)(mp_load_global(MP_QSTR_bytes)))
 #define mp_type_tuple                       (*((mp_obj_base_t *)mp_const_empty_tuple)->type)
 #define mp_type_list                        (*mp_fun_table.type_list)
 #define mp_type_EOFError                    (*(mp_obj_type_t *)(mp_load_global(MP_QSTR_EOFError)))
@@ -99,6 +105,7 @@ static inline void *m_realloc_dyn(void *ptr, size_t new_num_bytes) {
 #define mp_const_none                       ((mp_obj_t)mp_fun_table.const_none)
 #define mp_const_false                      ((mp_obj_t)mp_fun_table.const_false)
 #define mp_const_true                       ((mp_obj_t)mp_fun_table.const_true)
+#define mp_const_empty_bytes                (mp_type_bytes.make_new(NULL, 0, 0, NULL))
 #define mp_const_empty_tuple                (mp_fun_table.new_tuple(0, NULL))
 
 #define mp_obj_new_bool(b)                  ((b) ? (mp_obj_t)mp_fun_table.const_true : (mp_obj_t)mp_fun_table.const_false)
@@ -110,6 +117,7 @@ static inline void *m_realloc_dyn(void *ptr, size_t new_num_bytes) {
 #define mp_obj_new_bytearray_by_ref(n, i)   (mp_fun_table.obj_new_bytearray_by_ref((n), (i)))
 #define mp_obj_new_tuple(n, items)          (mp_fun_table.new_tuple((n), (items)))
 #define mp_obj_new_list(n, items)           (mp_fun_table.new_list((n), (items)))
+#define mp_obj_new_dict(n)                  (mp_fun_table.new_dict((n)))
 
 #define mp_obj_get_type(o)                  (mp_fun_table.obj_get_type((o)))
 #define mp_obj_cast_to_native_base(o, t)    (mp_obj_cast_to_native_base_dyn((o), (t)))
@@ -124,6 +132,9 @@ static inline void *m_realloc_dyn(void *ptr, size_t new_num_bytes) {
 #define mp_obj_subscr(base, index, val)     (mp_fun_table.obj_subscr((base), (index), (val)))
 #define mp_obj_get_array(o, len, items)     (mp_obj_get_array_dyn((o), (len), (items)))
 #define mp_obj_list_append(list, item)      (mp_fun_table.list_append((list), (item)))
+#define mp_obj_dict_store(dict, key, val)   (mp_fun_table.dict_store((dict), (key), (val)))
+
+#define mp_obj_malloc_helper(n, t)          (mp_obj_malloc_helper_dyn(n, t))
 
 static inline mp_obj_t mp_obj_new_str_of_type_dyn(const mp_obj_type_t *type, const byte *data, size_t len) {
     if (type == &mp_type_str) {
@@ -162,6 +173,12 @@ static inline mp_obj_t mp_obj_len_dyn(mp_obj_t o) {
     return mp_fun_table.call_function_n_kw(mp_fun_table.load_name(MP_QSTR_len), 1, &o);
 }
 
+static inline void *mp_obj_malloc_helper_dyn(size_t num_bytes, const mp_obj_type_t *type) {
+    mp_obj_base_t *base = (mp_obj_base_t *)m_malloc(num_bytes);
+    base->type = type;
+    return base;
+}
+
 /******************************************************************************/
 // General runtime functions
 
@@ -177,8 +194,8 @@ static inline mp_obj_t mp_obj_len_dyn(mp_obj_t o) {
 #define mp_unary_op(op, obj)        (mp_fun_table.unary_op((op), (obj)))
 #define mp_binary_op(op, lhs, rhs)  (mp_fun_table.binary_op((op), (lhs), (rhs)))
 
-#define mp_make_function_from_raw_code(rc, def_args, def_kw_args) \
-    (mp_fun_table.make_function_from_raw_code((rc), (def_args), (def_kw_args)))
+#define mp_make_function_from_raw_code(rc, context, def_args) \
+    (mp_fun_table.make_function_from_raw_code((rc), (context), (def_args)))
 
 #define mp_call_function_n_kw(fun, n_args, n_kw, args) \
     (mp_fun_table.call_function_n_kw((fun), (n_args) | ((n_kw) << 8), args))
@@ -187,11 +204,10 @@ static inline mp_obj_t mp_obj_len_dyn(mp_obj_t o) {
     (mp_fun_table.arg_check_num_sig((n_args), (n_kw), MP_OBJ_FUN_MAKE_SIG((n_args_min), (n_args_max), (takes_kw))))
 
 #define MP_DYNRUNTIME_INIT_ENTRY \
-    mp_obj_t old_globals = mp_fun_table.swap_globals(self->globals); \
+    mp_obj_t old_globals = mp_fun_table.swap_globals(self->context->module.globals); \
     mp_raw_code_t rc; \
     rc.kind = MP_CODE_NATIVE_VIPER; \
     rc.scope_flags = 0; \
-    rc.const_table = (void *)self->const_table; \
     (void)rc;
 
 #define MP_DYNRUNTIME_INIT_EXIT \
@@ -199,7 +215,7 @@ static inline mp_obj_t mp_obj_len_dyn(mp_obj_t o) {
     return mp_const_none;
 
 #define MP_DYNRUNTIME_MAKE_FUNCTION(f) \
-    (mp_make_function_from_raw_code((rc.fun_data = (f), &rc), MP_OBJ_NULL, MP_OBJ_NULL))
+    (mp_make_function_from_raw_code((rc.fun_data = (f), &rc), self->context, NULL))
 
 #define mp_import_name(name, fromlist, level) \
     (mp_fun_table.import_name((name), (fromlist), (level)))
diff --git a/python/src/py/dynruntime.mk b/python/src/py/dynruntime.mk
index cb5ab845e..09cbb2dd3 100644
--- a/python/src/py/dynruntime.mk
+++ b/python/src/py/dynruntime.mk
@@ -46,7 +46,6 @@ ifeq ($(ARCH),x86)
 # x86
 CROSS =
 CFLAGS += -m32 -fno-stack-protector
-MPY_CROSS_FLAGS += -mcache-lookup-bc
 MICROPY_FLOAT_IMPL ?= double
 
 else ifeq ($(ARCH),x64)
@@ -54,9 +53,15 @@ else ifeq ($(ARCH),x64)
 # x64
 CROSS =
 CFLAGS += -fno-stack-protector
-MPY_CROSS_FLAGS += -mcache-lookup-bc
 MICROPY_FLOAT_IMPL ?= double
 
+else ifeq ($(ARCH),armv6m)
+
+# thumb
+CROSS = arm-none-eabi-
+CFLAGS += -mthumb -mcpu=cortex-m0
+MICROPY_FLOAT_IMPL ?= none
+
 else ifeq ($(ARCH),armv7m)
 
 # thumb
diff --git a/python/src/py/emit.h b/python/src/py/emit.h
index 13bd3e9b2..608734552 100644
--- a/python/src/py/emit.h
+++ b/python/src/py/emit.h
@@ -43,7 +43,7 @@ typedef enum {
     MP_PASS_SCOPE = 1,      // work out id's and their kind, and number of labels
     MP_PASS_STACK_SIZE = 2, // work out maximum stack size
     MP_PASS_CODE_SIZE = 3,  // work out code size and label offsets
-    MP_PASS_EMIT = 4,       // emit code
+    MP_PASS_EMIT = 4,       // emit code (may be run multiple times if the emitter requests it)
 } pass_kind_t;
 
 #define MP_EMIT_STAR_FLAG_SINGLE (0x01)
@@ -92,6 +92,16 @@ typedef enum {
 
 typedef struct _emit_t emit_t;
 
+typedef struct _mp_emit_common_t {
+    pass_kind_t pass;
+    uint16_t ct_cur_child;
+    mp_raw_code_t **children;
+    #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+    mp_map_t qstr_map;
+    #endif
+    mp_obj_list_t const_obj_list;
+} mp_emit_common_t;
+
 typedef struct _mp_emit_method_table_id_ops_t {
     void (*local)(emit_t *emit, qstr qst, mp_uint_t local_num, int kind);
     void (*global)(emit_t *emit, qstr qst, int kind);
@@ -99,12 +109,12 @@ typedef struct _mp_emit_method_table_id_ops_t {
 
 typedef struct _emit_method_table_t {
     #if MICROPY_DYNAMIC_COMPILER
-    emit_t *(*emit_new)(mp_obj_t * error_slot, uint *label_slot, mp_uint_t max_num_labels);
+    emit_t *(*emit_new)(mp_emit_common_t * emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
     void (*emit_free)(emit_t *emit);
     #endif
 
     void (*start_pass)(emit_t *emit, pass_kind_t pass, scope_t *scope);
-    void (*end_pass)(emit_t *emit);
+    bool (*end_pass)(emit_t *emit);
     bool (*last_emit_was_return_value)(emit_t *emit);
     void (*adjust_stack_size)(emit_t *emit, mp_int_t delta);
     void (*set_source_line)(emit_t *emit, mp_uint_t line);
@@ -161,6 +171,23 @@ typedef struct _emit_method_table_t {
     void (*end_except_handler)(emit_t *emit);
 } emit_method_table_t;
 
+#if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+qstr_short_t mp_emit_common_use_qstr(mp_emit_common_t *emit, qstr qst);
+#else
+static inline qstr_short_t mp_emit_common_use_qstr(mp_emit_common_t *emit, qstr qst) {
+    return qst;
+}
+#endif
+
+size_t mp_emit_common_use_const_obj(mp_emit_common_t *emit, mp_obj_t const_obj);
+
+static inline size_t mp_emit_common_alloc_const_child(mp_emit_common_t *emit, mp_raw_code_t *rc) {
+    if (emit->pass == MP_PASS_EMIT) {
+        emit->children[emit->ct_cur_child] = rc;
+    }
+    return emit->ct_cur_child++;
+}
+
 static inline void mp_emit_common_get_id_for_load(scope_t *scope, qstr qst) {
     scope_find_or_add_id(scope, qst, ID_INFO_KIND_GLOBAL_IMPLICIT);
 }
@@ -180,13 +207,13 @@ extern const mp_emit_method_table_id_ops_t mp_emit_bc_method_table_load_id_ops;
 extern const mp_emit_method_table_id_ops_t mp_emit_bc_method_table_store_id_ops;
 extern const mp_emit_method_table_id_ops_t mp_emit_bc_method_table_delete_id_ops;
 
-emit_t *emit_bc_new(void);
-emit_t *emit_native_x64_new(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
-emit_t *emit_native_x86_new(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
-emit_t *emit_native_thumb_new(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
-emit_t *emit_native_arm_new(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
-emit_t *emit_native_xtensa_new(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
-emit_t *emit_native_xtensawin_new(mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
+emit_t *emit_bc_new(mp_emit_common_t *emit_common);
+emit_t *emit_native_x64_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
+emit_t *emit_native_x86_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
+emit_t *emit_native_thumb_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
+emit_t *emit_native_arm_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
+emit_t *emit_native_xtensa_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
+emit_t *emit_native_xtensawin_new(mp_emit_common_t *emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels);
 
 void emit_bc_set_max_num_labels(emit_t *emit, mp_uint_t max_num_labels);
 
@@ -199,7 +226,7 @@ void emit_native_xtensa_free(emit_t *emit);
 void emit_native_xtensawin_free(emit_t *emit);
 
 void mp_emit_bc_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope);
-void mp_emit_bc_end_pass(emit_t *emit);
+bool mp_emit_bc_end_pass(emit_t *emit);
 bool mp_emit_bc_last_emit_was_return_value(emit_t *emit);
 void mp_emit_bc_adjust_stack_size(emit_t *emit, mp_int_t delta);
 void mp_emit_bc_set_source_line(emit_t *emit, mp_uint_t line);
diff --git a/python/src/py/emitbc.c b/python/src/py/emitbc.c
index d7e8e05f0..2007975c5 100644
--- a/python/src/py/emitbc.c
+++ b/python/src/py/emitbc.c
@@ -28,16 +28,17 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <string.h>
+#include <unistd.h>
 #include <assert.h>
 
 #include "py/mpstate.h"
+#include "py/smallint.h"
 #include "py/emit.h"
 #include "py/bc0.h"
 
 #if MICROPY_ENABLE_COMPILER
 
-#define BYTES_FOR_INT ((MP_BYTES_PER_OBJ_WORD * 8 + 6) / 7)
-#define DUMMY_DATA_SIZE (BYTES_FOR_INT)
+#define DUMMY_DATA_SIZE (MP_ENCODE_UINT_MAX_BYTES)
 
 struct _emit_t {
     // Accessed as mp_obj_t, so must be aligned as such, and we rely on the
@@ -50,66 +51,45 @@ struct _emit_t {
 
     int stack_size;
 
+    mp_emit_common_t *emit_common;
     scope_t *scope;
 
     mp_uint_t last_source_line_offset;
     mp_uint_t last_source_line;
 
-    mp_uint_t max_num_labels;
-    mp_uint_t *label_offsets;
+    size_t max_num_labels;
+    size_t *label_offsets;
 
     size_t code_info_offset;
     size_t code_info_size;
     size_t bytecode_offset;
     size_t bytecode_size;
     byte *code_base; // stores both byte code and code info
+    bool overflow;
 
     size_t n_info;
     size_t n_cell;
-
-    #if MICROPY_PERSISTENT_CODE
-    uint16_t ct_cur_obj;
-    uint16_t ct_num_obj;
-    uint16_t ct_cur_raw_code;
-    #endif
-    mp_uint_t *const_table;
 };
 
-emit_t *emit_bc_new(void) {
+emit_t *emit_bc_new(mp_emit_common_t *emit_common) {
     emit_t *emit = m_new0(emit_t, 1);
+    emit->emit_common = emit_common;
     return emit;
 }
 
 void emit_bc_set_max_num_labels(emit_t *emit, mp_uint_t max_num_labels) {
     emit->max_num_labels = max_num_labels;
-    emit->label_offsets = m_new(mp_uint_t, emit->max_num_labels);
+    emit->label_offsets = m_new(size_t, emit->max_num_labels);
 }
 
 void emit_bc_free(emit_t *emit) {
-    m_del(mp_uint_t, emit->label_offsets, emit->max_num_labels);
+    m_del(size_t, emit->label_offsets, emit->max_num_labels);
     m_del_obj(emit_t, emit);
 }
 
-typedef byte *(*emit_allocator_t)(emit_t *emit, int nbytes);
-
-STATIC void emit_write_uint(emit_t *emit, emit_allocator_t allocator, mp_uint_t val) {
-    // We store each 7 bits in a separate byte, and that's how many bytes needed
-    byte buf[BYTES_FOR_INT];
-    byte *p = buf + sizeof(buf);
-    // We encode in little-ending order, but store in big-endian, to help decoding
-    do {
-        *--p = val & 0x7f;
-        val >>= 7;
-    } while (val != 0);
-    byte *c = allocator(emit, buf + sizeof(buf) - p);
-    while (p != buf + sizeof(buf) - 1) {
-        *c++ = *p++ | 0x80;
-    }
-    *c = *p;
-}
-
 // all functions must go through this one to emit code info
-STATIC byte *emit_get_cur_to_write_code_info(emit_t *emit, int num_bytes_to_write) {
+STATIC uint8_t *emit_get_cur_to_write_code_info(void *emit_in, size_t num_bytes_to_write) {
+    emit_t *emit = emit_in;
     if (emit->pass < MP_PASS_EMIT) {
         emit->code_info_offset += num_bytes_to_write;
         return emit->dummy_data;
@@ -126,14 +106,7 @@ STATIC void emit_write_code_info_byte(emit_t *emit, byte val) {
 }
 
 STATIC void emit_write_code_info_qstr(emit_t *emit, qstr qst) {
-    #if MICROPY_PERSISTENT_CODE
-    assert((qst >> 16) == 0);
-    byte *c = emit_get_cur_to_write_code_info(emit, 2);
-    c[0] = qst;
-    c[1] = qst >> 8;
-    #else
-    emit_write_uint(emit, emit_get_cur_to_write_code_info, qst);
-    #endif
+    mp_encode_uint(emit, emit_get_cur_to_write_code_info, mp_emit_common_use_qstr(emit->emit_common, qst));
 }
 
 #if MICROPY_ENABLE_SOURCE_LINE
@@ -166,7 +139,8 @@ STATIC void emit_write_code_info_bytes_lines(emit_t *emit, mp_uint_t bytes_to_sk
 #endif
 
 // all functions must go through this one to emit byte code
-STATIC byte *emit_get_cur_to_write_bytecode(emit_t *emit, int num_bytes_to_write) {
+STATIC uint8_t *emit_get_cur_to_write_bytecode(void *emit_in, size_t num_bytes_to_write) {
+    emit_t *emit = emit_in;
     if (emit->pass < MP_PASS_EMIT) {
         emit->bytecode_offset += num_bytes_to_write;
         return emit->dummy_data;
@@ -189,12 +163,12 @@ STATIC void emit_write_bytecode_byte(emit_t *emit, int stack_adj, byte b1) {
     c[0] = b1;
 }
 
-// Similar to emit_write_bytecode_uint(), just some extra handling to encode sign
+// Similar to mp_encode_uint(), just some extra handling to encode sign
 STATIC void emit_write_bytecode_byte_int(emit_t *emit, int stack_adj, byte b1, mp_int_t num) {
     emit_write_bytecode_byte(emit, stack_adj, b1);
 
     // We store each 7 bits in a separate byte, and that's how many bytes needed
-    byte buf[BYTES_FOR_INT];
+    byte buf[MP_ENCODE_UINT_MAX_BYTES];
     byte *p = buf + sizeof(buf);
     // We encode in little-ending order, but store in big-endian, to help decoding
     do {
@@ -218,94 +192,81 @@ STATIC void emit_write_bytecode_byte_int(emit_t *emit, int stack_adj, byte b1, m
 
 STATIC void emit_write_bytecode_byte_uint(emit_t *emit, int stack_adj, byte b, mp_uint_t val) {
     emit_write_bytecode_byte(emit, stack_adj, b);
-    emit_write_uint(emit, emit_get_cur_to_write_bytecode, val);
+    mp_encode_uint(emit, emit_get_cur_to_write_bytecode, val);
 }
 
-#if MICROPY_PERSISTENT_CODE
-STATIC void emit_write_bytecode_byte_const(emit_t *emit, int stack_adj, byte b, mp_uint_t n, mp_uint_t c) {
-    if (emit->pass == MP_PASS_EMIT) {
-        emit->const_table[n] = c;
-    }
+STATIC void emit_write_bytecode_byte_const(emit_t *emit, int stack_adj, byte b, mp_uint_t n) {
     emit_write_bytecode_byte_uint(emit, stack_adj, b, n);
 }
-#endif
 
 STATIC void emit_write_bytecode_byte_qstr(emit_t *emit, int stack_adj, byte b, qstr qst) {
-    #if MICROPY_PERSISTENT_CODE
-    assert((qst >> 16) == 0);
-    mp_emit_bc_adjust_stack_size(emit, stack_adj);
-    byte *c = emit_get_cur_to_write_bytecode(emit, 3);
-    c[0] = b;
-    c[1] = qst;
-    c[2] = qst >> 8;
-    #else
-    emit_write_bytecode_byte_uint(emit, stack_adj, b, qst);
-    #endif
+    emit_write_bytecode_byte_uint(emit, stack_adj, b, mp_emit_common_use_qstr(emit->emit_common, qst));
 }
 
 STATIC void emit_write_bytecode_byte_obj(emit_t *emit, int stack_adj, byte b, mp_obj_t obj) {
-    #if MICROPY_PERSISTENT_CODE
-    emit_write_bytecode_byte_const(emit, stack_adj, b,
-        emit->scope->num_pos_args + emit->scope->num_kwonly_args
-        + emit->ct_cur_obj++, (mp_uint_t)obj);
-    #else
-    // aligns the pointer so it is friendly to GC
-    emit_write_bytecode_byte(emit, stack_adj, b);
-    emit->bytecode_offset = (size_t)MP_ALIGN(emit->bytecode_offset, sizeof(mp_obj_t));
-    mp_obj_t *c = (mp_obj_t *)emit_get_cur_to_write_bytecode(emit, sizeof(mp_obj_t));
-    // Verify thar c is already uint-aligned
-    assert(c == MP_ALIGN(c, sizeof(mp_obj_t)));
-    *c = obj;
-    #endif
+    emit_write_bytecode_byte_const(emit, stack_adj, b, mp_emit_common_use_const_obj(emit->emit_common, obj));
 }
 
-STATIC void emit_write_bytecode_byte_raw_code(emit_t *emit, int stack_adj, byte b, mp_raw_code_t *rc) {
-    #if MICROPY_PERSISTENT_CODE
+STATIC void emit_write_bytecode_byte_child(emit_t *emit, int stack_adj, byte b, mp_raw_code_t *rc) {
     emit_write_bytecode_byte_const(emit, stack_adj, b,
-        emit->scope->num_pos_args + emit->scope->num_kwonly_args
-        + emit->ct_num_obj + emit->ct_cur_raw_code++, (mp_uint_t)(uintptr_t)rc);
-    #else
-    // aligns the pointer so it is friendly to GC
-    emit_write_bytecode_byte(emit, stack_adj, b);
-    emit->bytecode_offset = (size_t)MP_ALIGN(emit->bytecode_offset, sizeof(void *));
-    void **c = (void **)emit_get_cur_to_write_bytecode(emit, sizeof(void *));
-    // Verify thar c is already uint-aligned
-    assert(c == MP_ALIGN(c, sizeof(void *)));
-    *c = rc;
-    #endif
+        mp_emit_common_alloc_const_child(emit->emit_common, rc));
     #if MICROPY_PY_SYS_SETTRACE
     rc->line_of_definition = emit->last_source_line;
     #endif
 }
 
-// unsigned labels are relative to ip following this instruction, stored as 16 bits
-STATIC void emit_write_bytecode_byte_unsigned_label(emit_t *emit, int stack_adj, byte b1, mp_uint_t label) {
+// Emit a jump opcode to a destination label.
+// The offset to the label is relative to the ip following this instruction.
+// The offset is encoded as either 1 or 2 bytes, depending on how big it is.
+// The encoding of this jump opcode can change size from one pass to the next,
+// but it must only ever decrease in size on successive passes.
+STATIC void emit_write_bytecode_byte_label(emit_t *emit, int stack_adj, byte b1, mp_uint_t label) {
     mp_emit_bc_adjust_stack_size(emit, stack_adj);
-    mp_uint_t bytecode_offset;
-    if (emit->pass < MP_PASS_EMIT) {
-        bytecode_offset = 0;
-    } else {
-        bytecode_offset = emit->label_offsets[label] - emit->bytecode_offset - 3;
-    }
-    byte *c = emit_get_cur_to_write_bytecode(emit, 3);
-    c[0] = b1;
-    c[1] = bytecode_offset;
-    c[2] = bytecode_offset >> 8;
-}
 
-// signed labels are relative to ip following this instruction, stored as 16 bits, in excess
-STATIC void emit_write_bytecode_byte_signed_label(emit_t *emit, int stack_adj, byte b1, mp_uint_t label) {
-    mp_emit_bc_adjust_stack_size(emit, stack_adj);
-    int bytecode_offset;
-    if (emit->pass < MP_PASS_EMIT) {
-        bytecode_offset = 0;
-    } else {
-        bytecode_offset = emit->label_offsets[label] - emit->bytecode_offset - 3 + 0x8000;
+    // Determine if the jump offset is signed or unsigned, based on the opcode.
+    const bool is_signed = b1 <= MP_BC_POP_JUMP_IF_FALSE;
+
+    // Default to a 2-byte offset (the largest; jump_encoding_size == 1) with an unknown jump offset.
+    unsigned int jump_encoding_size = 1;
+    ssize_t bytecode_offset = 0;
+
+    // Compute the jump size and offset only when code size is known.
+    if (emit->pass >= MP_PASS_CODE_SIZE) {
+        // The -2 accounts for this jump opcode taking 2 bytes (at least).
+        bytecode_offset = emit->label_offsets[label] - emit->bytecode_offset - 2;
+
+        // Check if the bytecode_offset is small enough to use a 1-byte encoding.
+        if ((is_signed && -64 <= bytecode_offset && bytecode_offset <= 63)
+            || (!is_signed && (size_t)bytecode_offset <= 127)) {
+            // Use a 1-byte jump offset.
+            jump_encoding_size = 0;
+        }
+
+        // Adjust the offset depending on the size of the encoding of the offset.
+        bytecode_offset -= jump_encoding_size;
+
+        assert(is_signed || bytecode_offset >= 0);
     }
-    byte *c = emit_get_cur_to_write_bytecode(emit, 3);
+
+    // Emit the opcode.
+    byte *c = emit_get_cur_to_write_bytecode(emit, 2 + jump_encoding_size);
     c[0] = b1;
-    c[1] = bytecode_offset;
-    c[2] = bytecode_offset >> 8;
+    if (jump_encoding_size == 0) {
+        if (is_signed) {
+            bytecode_offset += 0x40;
+        }
+        assert(0 <= bytecode_offset && bytecode_offset <= 0x7f);
+        c[1] = bytecode_offset;
+    } else {
+        if (is_signed) {
+            bytecode_offset += 0x4000;
+        }
+        if (emit->pass == MP_PASS_EMIT && !(0 <= bytecode_offset && bytecode_offset <= 0x7fff)) {
+            emit->overflow = true;
+        }
+        c[1] = 0x80 | (bytecode_offset & 0x7f);
+        c[2] = bytecode_offset >> 7;
+    }
 }
 
 void mp_emit_bc_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) {
@@ -315,14 +276,9 @@ void mp_emit_bc_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) {
     emit->scope = scope;
     emit->last_source_line_offset = 0;
     emit->last_source_line = 1;
-    #ifndef NDEBUG
-    // With debugging enabled labels are checked for unique assignment
-    if (pass < MP_PASS_EMIT && emit->label_offsets != NULL) {
-        memset(emit->label_offsets, -1, emit->max_num_labels * sizeof(mp_uint_t));
-    }
-    #endif
     emit->bytecode_offset = 0;
     emit->code_info_offset = 0;
+    emit->overflow = false;
 
     // Write local state size, exception stack size, scope flags and number of arguments
     {
@@ -343,27 +299,19 @@ void mp_emit_bc_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) {
     }
 
     // Write number of cells and size of the source code info
-    if (pass >= MP_PASS_CODE_SIZE) {
-        MP_BC_PRELUDE_SIZE_ENCODE(emit->n_info, emit->n_cell, emit_write_code_info_byte, emit);
+    if (emit->pass >= MP_PASS_CODE_SIZE) {
+        size_t n_info = emit->n_info;
+        size_t n_cell = emit->n_cell;
+        MP_BC_PRELUDE_SIZE_ENCODE(n_info, n_cell, emit_write_code_info_byte, emit);
     }
 
     emit->n_info = emit->code_info_offset;
 
-    // Write the name and source file of this function.
+    // Write the name of this function.
     emit_write_code_info_qstr(emit, scope->simple_name);
-    emit_write_code_info_qstr(emit, scope->source_file);
-
-    #if MICROPY_PERSISTENT_CODE
-    emit->ct_cur_obj = 0;
-    emit->ct_cur_raw_code = 0;
-    #endif
-
-    if (pass == MP_PASS_EMIT) {
-        // Write argument names (needed to resolve positional args passed as
-        // keywords).  We store them as full word-sized objects for efficient access
-        // in mp_setup_code_state this is the start of the prelude and is guaranteed
-        // to be aligned on a word boundary.
 
+    // Write argument names, needed to resolve positional args passed as keywords.
+    {
         // For a given argument position (indexed by i) we need to find the
         // corresponding id_info which is a parameter, as it has the correct
         // qstr name to use as the argument name.  Note that it's not a simple
@@ -383,21 +331,19 @@ void mp_emit_bc_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) {
                     break;
                 }
             }
-            emit->const_table[i] = (mp_uint_t)MP_OBJ_NEW_QSTR(qst);
+            emit_write_code_info_qstr(emit, qst);
         }
     }
 }
 
-void mp_emit_bc_end_pass(emit_t *emit) {
+bool mp_emit_bc_end_pass(emit_t *emit) {
     if (emit->pass == MP_PASS_SCOPE) {
-        return;
+        return true;
     }
 
     // check stack is back to zero size
     assert(emit->stack_size == 0);
 
-    emit_write_code_info_byte(emit, 0); // end of line number info
-
     // Calculate size of source code info section
     emit->n_info = emit->code_info_offset - emit->n_info;
 
@@ -412,42 +358,43 @@ void mp_emit_bc_end_pass(emit_t *emit) {
         }
     }
 
-    #if MICROPY_PERSISTENT_CODE
-    assert(emit->pass <= MP_PASS_STACK_SIZE || (emit->ct_num_obj == emit->ct_cur_obj));
-    emit->ct_num_obj = emit->ct_cur_obj;
-    #endif
-
     if (emit->pass == MP_PASS_CODE_SIZE) {
-        #if !MICROPY_PERSISTENT_CODE
-        // so bytecode is aligned
-        emit->code_info_offset = (size_t)MP_ALIGN(emit->code_info_offset, sizeof(mp_uint_t));
-        #endif
-
         // calculate size of total code-info + bytecode, in bytes
         emit->code_info_size = emit->code_info_offset;
         emit->bytecode_size = emit->bytecode_offset;
         emit->code_base = m_new0(byte, emit->code_info_size + emit->bytecode_size);
 
-        #if MICROPY_PERSISTENT_CODE
-        emit->const_table = m_new0(mp_uint_t,
-            emit->scope->num_pos_args + emit->scope->num_kwonly_args
-            + emit->ct_cur_obj + emit->ct_cur_raw_code);
-        #else
-        emit->const_table = m_new0(mp_uint_t,
-            emit->scope->num_pos_args + emit->scope->num_kwonly_args);
-        #endif
-
     } else if (emit->pass == MP_PASS_EMIT) {
+        // Code info and/or bytecode can shrink during this pass.
+        assert(emit->code_info_offset <= emit->code_info_size);
+        assert(emit->bytecode_offset <= emit->bytecode_size);
+
+        if (emit->code_info_offset != emit->code_info_size
+            || emit->bytecode_offset != emit->bytecode_size) {
+            // Code info and/or bytecode changed size in this pass, so request the
+            // compiler to do another pass with these updated sizes.
+            emit->code_info_size = emit->code_info_offset;
+            emit->bytecode_size = emit->bytecode_offset;
+            return false;
+        }
+
+        if (emit->overflow) {
+            mp_raise_msg(&mp_type_RuntimeError, MP_ERROR_TEXT("bytecode overflow"));
+        }
+
+        // Bytecode is finalised, assign it to the raw code object.
         mp_emit_glue_assign_bytecode(emit->scope->raw_code, emit->code_base,
             #if MICROPY_PERSISTENT_CODE_SAVE || MICROPY_DEBUG_PRINTERS
             emit->code_info_size + emit->bytecode_size,
             #endif
-            emit->const_table,
+            emit->emit_common->children,
             #if MICROPY_PERSISTENT_CODE_SAVE
-            emit->ct_cur_obj, emit->ct_cur_raw_code,
+            emit->emit_common->ct_cur_child,
             #endif
             emit->scope->scope_flags);
     }
+
+    return true;
 }
 
 bool mp_emit_bc_last_emit_was_return_value(emit_t *emit) {
@@ -490,15 +437,16 @@ void mp_emit_bc_label_assign(emit_t *emit, mp_uint_t l) {
     if (emit->pass == MP_PASS_SCOPE) {
         return;
     }
+
+    // Label offsets can change from one pass to the next, but they must only
+    // decrease (ie code can only shrink).  There will be multiple MP_PASS_EMIT
+    // stages until the labels no longer change, which is when the code size
+    // stays constant after a MP_PASS_EMIT.
     assert(l < emit->max_num_labels);
-    if (emit->pass < MP_PASS_EMIT) {
-        // assign label offset
-        assert(emit->label_offsets[l] == (mp_uint_t)-1);
-        emit->label_offsets[l] = emit->bytecode_offset;
-    } else {
-        // ensure label offset has not changed from MP_PASS_CODE_SIZE to MP_PASS_EMIT
-        assert(emit->label_offsets[l] == emit->bytecode_offset);
-    }
+    assert(emit->pass == MP_PASS_STACK_SIZE || emit->bytecode_offset <= emit->label_offsets[l]);
+
+    // Assign label offset.
+    emit->label_offsets[l] = emit->bytecode_offset;
 }
 
 void mp_emit_bc_import(emit_t *emit, qstr qst, int kind) {
@@ -523,6 +471,7 @@ void mp_emit_bc_load_const_tok(emit_t *emit, mp_token_kind_t tok) {
 }
 
 void mp_emit_bc_load_const_small_int(emit_t *emit, mp_int_t arg) {
+    assert(MP_SMALL_INT_FITS(arg));
     if (-MP_BC_LOAD_CONST_SMALL_INT_MULTI_EXCESS <= arg
         && arg < MP_BC_LOAD_CONST_SMALL_INT_MULTI_NUM - MP_BC_LOAD_CONST_SMALL_INT_MULTI_EXCESS) {
         emit_write_bytecode_byte(emit, 1,
@@ -560,9 +509,6 @@ void mp_emit_bc_load_global(emit_t *emit, qstr qst, int kind) {
     MP_STATIC_ASSERT(MP_BC_LOAD_NAME + MP_EMIT_IDOP_GLOBAL_GLOBAL == MP_BC_LOAD_GLOBAL);
     (void)qst;
     emit_write_bytecode_byte_qstr(emit, 1, MP_BC_LOAD_NAME + kind, qst);
-    if (MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE_DYNAMIC) {
-        emit_write_bytecode_raw_byte(emit, 0);
-    }
 }
 
 void mp_emit_bc_load_method(emit_t *emit, qstr qst, bool is_super) {
@@ -596,9 +542,6 @@ void mp_emit_bc_attr(emit_t *emit, qstr qst, int kind) {
         }
         emit_write_bytecode_byte_qstr(emit, -2, MP_BC_STORE_ATTR, qst);
     }
-    if (MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE_DYNAMIC) {
-        emit_write_bytecode_raw_byte(emit, 0);
-    }
 }
 
 void mp_emit_bc_store_local(emit_t *emit, qstr qst, mp_uint_t local_num, int kind) {
@@ -652,22 +595,22 @@ void mp_emit_bc_rot_three(emit_t *emit) {
 }
 
 void mp_emit_bc_jump(emit_t *emit, mp_uint_t label) {
-    emit_write_bytecode_byte_signed_label(emit, 0, MP_BC_JUMP, label);
+    emit_write_bytecode_byte_label(emit, 0, MP_BC_JUMP, label);
 }
 
 void mp_emit_bc_pop_jump_if(emit_t *emit, bool cond, mp_uint_t label) {
     if (cond) {
-        emit_write_bytecode_byte_signed_label(emit, -1, MP_BC_POP_JUMP_IF_TRUE, label);
+        emit_write_bytecode_byte_label(emit, -1, MP_BC_POP_JUMP_IF_TRUE, label);
     } else {
-        emit_write_bytecode_byte_signed_label(emit, -1, MP_BC_POP_JUMP_IF_FALSE, label);
+        emit_write_bytecode_byte_label(emit, -1, MP_BC_POP_JUMP_IF_FALSE, label);
     }
 }
 
 void mp_emit_bc_jump_if_or_pop(emit_t *emit, bool cond, mp_uint_t label) {
     if (cond) {
-        emit_write_bytecode_byte_signed_label(emit, -1, MP_BC_JUMP_IF_TRUE_OR_POP, label);
+        emit_write_bytecode_byte_label(emit, -1, MP_BC_JUMP_IF_TRUE_OR_POP, label);
     } else {
-        emit_write_bytecode_byte_signed_label(emit, -1, MP_BC_JUMP_IF_FALSE_OR_POP, label);
+        emit_write_bytecode_byte_label(emit, -1, MP_BC_JUMP_IF_FALSE_OR_POP, label);
     }
 }
 
@@ -681,9 +624,9 @@ void mp_emit_bc_unwind_jump(emit_t *emit, mp_uint_t label, mp_uint_t except_dept
                 emit_write_bytecode_raw_byte(emit, MP_BC_POP_TOP);
             }
         }
-        emit_write_bytecode_byte_signed_label(emit, 0, MP_BC_JUMP, label & ~MP_EMIT_BREAK_FROM_FOR);
+        emit_write_bytecode_byte_label(emit, 0, MP_BC_JUMP, label & ~MP_EMIT_BREAK_FROM_FOR);
     } else {
-        emit_write_bytecode_byte_signed_label(emit, 0, MP_BC_UNWIND_JUMP, label & ~MP_EMIT_BREAK_FROM_FOR);
+        emit_write_bytecode_byte_label(emit, 0, MP_BC_UNWIND_JUMP, label & ~MP_EMIT_BREAK_FROM_FOR);
         emit_write_bytecode_raw_byte(emit, ((label & MP_EMIT_BREAK_FROM_FOR) ? 0x80 : 0) | except_depth);
     }
 }
@@ -695,7 +638,7 @@ void mp_emit_bc_setup_block(emit_t *emit, mp_uint_t label, int kind) {
     // The SETUP_WITH opcode pops ctx_mgr from the top of the stack
     // and then pushes 3 entries: __exit__, ctx_mgr, as_value.
     int stack_adj = kind == MP_EMIT_SETUP_BLOCK_WITH ? 2 : 0;
-    emit_write_bytecode_byte_unsigned_label(emit, stack_adj, MP_BC_SETUP_WITH + kind, label);
+    emit_write_bytecode_byte_label(emit, stack_adj, MP_BC_SETUP_WITH + kind, label);
 }
 
 void mp_emit_bc_with_cleanup(emit_t *emit, mp_uint_t label) {
@@ -717,7 +660,7 @@ void mp_emit_bc_get_iter(emit_t *emit, bool use_stack) {
 }
 
 void mp_emit_bc_for_iter(emit_t *emit, mp_uint_t label) {
-    emit_write_bytecode_byte_unsigned_label(emit, 1, MP_BC_FOR_ITER, label);
+    emit_write_bytecode_byte_label(emit, 1, MP_BC_FOR_ITER, label);
 }
 
 void mp_emit_bc_for_iter_end(emit_t *emit) {
@@ -726,7 +669,7 @@ void mp_emit_bc_for_iter_end(emit_t *emit) {
 
 void mp_emit_bc_pop_except_jump(emit_t *emit, mp_uint_t label, bool within_exc_handler) {
     (void)within_exc_handler;
-    emit_write_bytecode_byte_unsigned_label(emit, 0, MP_BC_POP_EXCEPT_JUMP, label);
+    emit_write_bytecode_byte_label(emit, 0, MP_BC_POP_EXCEPT_JUMP, label);
 }
 
 void mp_emit_bc_unary_op(emit_t *emit, mp_unary_op_t op) {
@@ -789,28 +732,30 @@ void mp_emit_bc_unpack_ex(emit_t *emit, mp_uint_t n_left, mp_uint_t n_right) {
 
 void mp_emit_bc_make_function(emit_t *emit, scope_t *scope, mp_uint_t n_pos_defaults, mp_uint_t n_kw_defaults) {
     if (n_pos_defaults == 0 && n_kw_defaults == 0) {
-        emit_write_bytecode_byte_raw_code(emit, 1, MP_BC_MAKE_FUNCTION, scope->raw_code);
+        emit_write_bytecode_byte_child(emit, 1, MP_BC_MAKE_FUNCTION, scope->raw_code);
     } else {
-        emit_write_bytecode_byte_raw_code(emit, -1, MP_BC_MAKE_FUNCTION_DEFARGS, scope->raw_code);
+        emit_write_bytecode_byte_child(emit, -1, MP_BC_MAKE_FUNCTION_DEFARGS, scope->raw_code);
     }
 }
 
 void mp_emit_bc_make_closure(emit_t *emit, scope_t *scope, mp_uint_t n_closed_over, mp_uint_t n_pos_defaults, mp_uint_t n_kw_defaults) {
     if (n_pos_defaults == 0 && n_kw_defaults == 0) {
         int stack_adj = -n_closed_over + 1;
-        emit_write_bytecode_byte_raw_code(emit, stack_adj, MP_BC_MAKE_CLOSURE, scope->raw_code);
+        emit_write_bytecode_byte_child(emit, stack_adj, MP_BC_MAKE_CLOSURE, scope->raw_code);
         emit_write_bytecode_raw_byte(emit, n_closed_over);
     } else {
         assert(n_closed_over <= 255);
         int stack_adj = -2 - (mp_int_t)n_closed_over + 1;
-        emit_write_bytecode_byte_raw_code(emit, stack_adj, MP_BC_MAKE_CLOSURE_DEFARGS, scope->raw_code);
+        emit_write_bytecode_byte_child(emit, stack_adj, MP_BC_MAKE_CLOSURE_DEFARGS, scope->raw_code);
         emit_write_bytecode_raw_byte(emit, n_closed_over);
     }
 }
 
 STATIC void emit_bc_call_function_method_helper(emit_t *emit, int stack_adj, mp_uint_t bytecode_base, mp_uint_t n_positional, mp_uint_t n_keyword, mp_uint_t star_flags) {
     if (star_flags) {
-        stack_adj -= (int)n_positional + 2 * (int)n_keyword + 2;
+        // each positional arg is one object, each kwarg is two objects (the key
+        // and the value), and there is one extra object for the star args bitmap.
+        stack_adj -= (int)n_positional + 2 * (int)n_keyword + 1;
         emit_write_bytecode_byte_uint(emit, stack_adj, bytecode_base + 1, (n_keyword << 8) | n_positional); // TODO make it 2 separate uints?
     } else {
         stack_adj -= (int)n_positional + 2 * (int)n_keyword;
diff --git a/python/src/py/emitcommon.c b/python/src/py/emitcommon.c
index 791bf398a..679ef1d97 100644
--- a/python/src/py/emitcommon.c
+++ b/python/src/py/emitcommon.c
@@ -27,15 +27,76 @@
 #include <assert.h>
 
 #include "py/emit.h"
+#include "py/nativeglue.h"
 
 #if MICROPY_ENABLE_COMPILER
 
+#if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+qstr_short_t mp_emit_common_use_qstr(mp_emit_common_t *emit, qstr qst) {
+    mp_map_elem_t *elem = mp_map_lookup(&emit->qstr_map, MP_OBJ_NEW_QSTR(qst), MP_MAP_LOOKUP_ADD_IF_NOT_FOUND);
+    if (elem->value == MP_OBJ_NULL) {
+        elem->value = MP_OBJ_NEW_SMALL_INT(emit->qstr_map.used - 1);
+    }
+    return MP_OBJ_SMALL_INT_VALUE(elem->value);
+}
+#endif
+
+// Compare two objects for strict equality, including equality of type.  This is
+// different to the semantics of mp_obj_equal which, eg, has (True,) == (1.0,).
+static bool strictly_equal(mp_obj_t a, mp_obj_t b) {
+    if (a == b) {
+        return true;
+    }
+
+    #if MICROPY_EMIT_NATIVE
+    if (a == MP_OBJ_FROM_PTR(&mp_fun_table) || b == MP_OBJ_FROM_PTR(&mp_fun_table)) {
+        return false;
+    }
+    #endif
+
+    const mp_obj_type_t *a_type = mp_obj_get_type(a);
+    const mp_obj_type_t *b_type = mp_obj_get_type(b);
+    if (a_type != b_type) {
+        return false;
+    }
+    if (a_type == &mp_type_tuple) {
+        mp_obj_tuple_t *a_tuple = MP_OBJ_TO_PTR(a);
+        mp_obj_tuple_t *b_tuple = MP_OBJ_TO_PTR(b);
+        if (a_tuple->len != b_tuple->len) {
+            return false;
+        }
+        for (size_t i = 0; i < a_tuple->len; ++i) {
+            if (!strictly_equal(a_tuple->items[i], b_tuple->items[i])) {
+                return false;
+            }
+        }
+        return true;
+    } else {
+        return mp_obj_equal(a, b);
+    }
+}
+
+size_t mp_emit_common_use_const_obj(mp_emit_common_t *emit, mp_obj_t const_obj) {
+    for (size_t i = 0; i < emit->const_obj_list.len; ++i) {
+        if (strictly_equal(emit->const_obj_list.items[i], const_obj)) {
+            return i;
+        }
+    }
+    mp_obj_list_append(MP_OBJ_FROM_PTR(&emit->const_obj_list), const_obj);
+    return emit->const_obj_list.len - 1;
+}
+
 void mp_emit_common_get_id_for_modification(scope_t *scope, qstr qst) {
     // name adding/lookup
     id_info_t *id = scope_find_or_add_id(scope, qst, ID_INFO_KIND_GLOBAL_IMPLICIT);
-    if (SCOPE_IS_FUNC_LIKE(scope->kind) && id->kind == ID_INFO_KIND_GLOBAL_IMPLICIT) {
-        // rebind as a local variable
-        id->kind = ID_INFO_KIND_LOCAL;
+    if (id->kind == ID_INFO_KIND_GLOBAL_IMPLICIT) {
+        if (SCOPE_IS_FUNC_LIKE(scope->kind)) {
+            // rebind as a local variable
+            id->kind = ID_INFO_KIND_LOCAL;
+        } else {
+            // mark this as assigned, to prevent it from being closed over
+            id->kind = ID_INFO_KIND_GLOBAL_IMPLICIT_ASSIGNED;
+        }
     }
 }
 
@@ -46,7 +107,7 @@ void mp_emit_common_id_op(emit_t *emit, const mp_emit_method_table_id_ops_t *emi
     assert(id != NULL);
 
     // call the emit backend with the correct code
-    if (id->kind == ID_INFO_KIND_GLOBAL_IMPLICIT) {
+    if (id->kind == ID_INFO_KIND_GLOBAL_IMPLICIT || id->kind == ID_INFO_KIND_GLOBAL_IMPLICIT_ASSIGNED) {
         emit_method_table->global(emit, qst, MP_EMIT_IDOP_GLOBAL_NAME);
     } else if (id->kind == ID_INFO_KIND_GLOBAL_EXPLICIT) {
         emit_method_table->global(emit, qst, MP_EMIT_IDOP_GLOBAL_GLOBAL);
diff --git a/python/src/py/emitglue.c b/python/src/py/emitglue.c
index 09b48682f..95be7f661 100644
--- a/python/src/py/emitglue.c
+++ b/python/src/py/emitglue.c
@@ -34,6 +34,7 @@
 #include "py/emitglue.h"
 #include "py/runtime0.h"
 #include "py/bc.h"
+#include "py/objfun.h"
 #include "py/profile.h"
 
 #if MICROPY_DEBUG_VERBOSE // print debugging info
@@ -63,20 +64,22 @@ void mp_emit_glue_assign_bytecode(mp_raw_code_t *rc, const byte *code,
     #if MICROPY_PERSISTENT_CODE_SAVE || MICROPY_DEBUG_PRINTERS
     size_t len,
     #endif
-    const mp_uint_t *const_table,
+    mp_raw_code_t **children,
     #if MICROPY_PERSISTENT_CODE_SAVE
-    uint16_t n_obj, uint16_t n_raw_code,
+    size_t n_children,
     #endif
     mp_uint_t scope_flags) {
 
     rc->kind = MP_CODE_BYTECODE;
     rc->scope_flags = scope_flags;
     rc->fun_data = code;
-    rc->const_table = const_table;
-    #if MICROPY_PERSISTENT_CODE_SAVE
+    #if MICROPY_PERSISTENT_CODE_SAVE || MICROPY_DEBUG_PRINTERS
     rc->fun_data_len = len;
-    rc->n_obj = n_obj;
-    rc->n_raw_code = n_raw_code;
+    #endif
+    rc->children = children;
+
+    #if MICROPY_PERSISTENT_CODE_SAVE
+    rc->n_children = n_children;
     #endif
 
     #if MICROPY_PY_SYS_SETTRACE
@@ -85,26 +88,21 @@ void mp_emit_glue_assign_bytecode(mp_raw_code_t *rc, const byte *code,
     #endif
 
     #ifdef DEBUG_PRINT
-    #if !MICROPY_DEBUG_PRINTERS
+    #if !(MICROPY_PERSISTENT_CODE_SAVE || MICROPY_DEBUG_PRINTERS)
     const size_t len = 0;
     #endif
     DEBUG_printf("assign byte code: code=%p len=" UINT_FMT " flags=%x\n", code, len, (uint)scope_flags);
     #endif
-    #if MICROPY_DEBUG_PRINTERS
-    if (mp_verbose_flag >= 2) {
-        mp_bytecode_print(&mp_plat_print, rc, code, len, const_table);
-    }
-    #endif
 }
 
 #if MICROPY_EMIT_MACHINE_CODE
-void mp_emit_glue_assign_native(mp_raw_code_t *rc, mp_raw_code_kind_t kind, void *fun_data, mp_uint_t fun_len, const mp_uint_t *const_table,
+void mp_emit_glue_assign_native(mp_raw_code_t *rc, mp_raw_code_kind_t kind, void *fun_data, mp_uint_t fun_len,
+    mp_raw_code_t **children,
     #if MICROPY_PERSISTENT_CODE_SAVE
+    size_t n_children,
     uint16_t prelude_offset,
-    uint16_t n_obj, uint16_t n_raw_code,
-    uint16_t n_qstr, mp_qstr_link_entry_t *qstr_link,
     #endif
-    mp_uint_t n_pos_args, mp_uint_t scope_flags, mp_uint_t type_sig) {
+    mp_uint_t scope_flags, mp_uint_t n_pos_args, mp_uint_t type_sig) {
 
     assert(kind == MP_CODE_NATIVE_PY || kind == MP_CODE_NATIVE_VIPER || kind == MP_CODE_NATIVE_ASM);
 
@@ -135,20 +133,22 @@ void mp_emit_glue_assign_native(mp_raw_code_t *rc, mp_raw_code_kind_t kind, void
 
     rc->kind = kind;
     rc->scope_flags = scope_flags;
-    rc->n_pos_args = n_pos_args;
     rc->fun_data = fun_data;
-    rc->const_table = const_table;
-    rc->type_sig = type_sig;
+
+    #if MICROPY_PERSISTENT_CODE_SAVE || MICROPY_DEBUG_PRINTERS
+    rc->fun_data_len = fun_len;
+    #endif
+    rc->children = children;
 
     #if MICROPY_PERSISTENT_CODE_SAVE
-    rc->fun_data_len = fun_len;
+    rc->n_children = n_children;
     rc->prelude_offset = prelude_offset;
-    rc->n_obj = n_obj;
-    rc->n_raw_code = n_raw_code;
-    rc->n_qstr = n_qstr;
-    rc->qstr_link = qstr_link;
     #endif
 
+    // These two entries are only needed for MP_CODE_NATIVE_ASM.
+    rc->n_pos_args = n_pos_args;
+    rc->type_sig = type_sig;
+
     #ifdef DEBUG_PRINT
     DEBUG_printf("assign native: kind=%d fun=%p len=" UINT_FMT " n_pos_args=" UINT_FMT " flags=%x\n", kind, fun_data, fun_len, n_pos_args, (uint)scope_flags);
     for (mp_uint_t i = 0; i < fun_len; i++) {
@@ -170,15 +170,15 @@ void mp_emit_glue_assign_native(mp_raw_code_t *rc, mp_raw_code_kind_t kind, void
 }
 #endif
 
-mp_obj_t mp_make_function_from_raw_code(const mp_raw_code_t *rc, mp_obj_t def_args, mp_obj_t def_kw_args) {
+mp_obj_t mp_make_function_from_raw_code(const mp_raw_code_t *rc, const mp_module_context_t *context, const mp_obj_t *def_args) {
     DEBUG_OP_printf("make_function_from_raw_code %p\n", rc);
     assert(rc != NULL);
 
     // def_args must be MP_OBJ_NULL or a tuple
-    assert(def_args == MP_OBJ_NULL || mp_obj_is_type(def_args, &mp_type_tuple));
+    assert(def_args == NULL || def_args[0] == MP_OBJ_NULL || mp_obj_is_type(def_args[0], &mp_type_tuple));
 
     // def_kw_args must be MP_OBJ_NULL or a dict
-    assert(def_kw_args == MP_OBJ_NULL || mp_obj_is_type(def_kw_args, &mp_type_dict));
+    assert(def_args == NULL || def_args[1] == MP_OBJ_NULL || mp_obj_is_type(def_args[1], &mp_type_dict));
 
     // make the function, depending on the raw code kind
     mp_obj_t fun;
@@ -186,7 +186,7 @@ mp_obj_t mp_make_function_from_raw_code(const mp_raw_code_t *rc, mp_obj_t def_ar
         #if MICROPY_EMIT_NATIVE
         case MP_CODE_NATIVE_PY:
         case MP_CODE_NATIVE_VIPER:
-            fun = mp_obj_new_fun_native(def_args, def_kw_args, rc->fun_data, rc->const_table);
+            fun = mp_obj_new_fun_native(def_args, rc->fun_data, context, rc->children);
             // Check for a generator function, and if so change the type of the object
             if ((rc->scope_flags & MP_SCOPE_FLAG_GENERATOR) != 0) {
                 ((mp_obj_base_t *)MP_OBJ_TO_PTR(fun))->type = &mp_type_native_gen_wrap;
@@ -201,7 +201,7 @@ mp_obj_t mp_make_function_from_raw_code(const mp_raw_code_t *rc, mp_obj_t def_ar
         default:
             // rc->kind should always be set and BYTECODE is the only remaining case
             assert(rc->kind == MP_CODE_BYTECODE);
-            fun = mp_obj_new_fun_bc(def_args, def_kw_args, rc->fun_data, rc->const_table);
+            fun = mp_obj_new_fun_bc(def_args, rc->fun_data, context, rc->children);
             // check for generator functions and if so change the type of the object
             if ((rc->scope_flags & MP_SCOPE_FLAG_GENERATOR) != 0) {
                 ((mp_obj_base_t *)MP_OBJ_TO_PTR(fun))->type = &mp_type_gen_wrap;
@@ -218,16 +218,16 @@ mp_obj_t mp_make_function_from_raw_code(const mp_raw_code_t *rc, mp_obj_t def_ar
     return fun;
 }
 
-mp_obj_t mp_make_closure_from_raw_code(const mp_raw_code_t *rc, mp_uint_t n_closed_over, const mp_obj_t *args) {
+mp_obj_t mp_make_closure_from_raw_code(const mp_raw_code_t *rc, const mp_module_context_t *context, mp_uint_t n_closed_over, const mp_obj_t *args) {
     DEBUG_OP_printf("make_closure_from_raw_code %p " UINT_FMT " %p\n", rc, n_closed_over, args);
     // make function object
     mp_obj_t ffun;
     if (n_closed_over & 0x100) {
         // default positional and keyword args given
-        ffun = mp_make_function_from_raw_code(rc, args[0], args[1]);
+        ffun = mp_make_function_from_raw_code(rc, context, args);
     } else {
         // default positional and keyword args not given
-        ffun = mp_make_function_from_raw_code(rc, MP_OBJ_NULL, MP_OBJ_NULL);
+        ffun = mp_make_function_from_raw_code(rc, context, NULL);
     }
     // wrap function in closure object
     return mp_obj_new_closure(ffun, n_closed_over & 0xff, args + ((n_closed_over >> 7) & 2));
diff --git a/python/src/py/emitglue.h b/python/src/py/emitglue.h
index a5411dc2e..4ddf74011 100644
--- a/python/src/py/emitglue.h
+++ b/python/src/py/emitglue.h
@@ -49,21 +49,20 @@ typedef enum {
     MP_CODE_NATIVE_ASM,
 } mp_raw_code_kind_t;
 
-typedef struct _mp_qstr_link_entry_t {
-    uint16_t off;
-    uint16_t qst;
-} mp_qstr_link_entry_t;
-
+// compiled bytecode: instance in RAM, referenced by outer scope, usually freed after first (and only) use
+// mpy file: instance in RAM, created when .mpy file is loaded (same comments as above)
+// frozen: instance in ROM
 typedef struct _mp_raw_code_t {
     mp_uint_t kind : 3; // of type mp_raw_code_kind_t
     mp_uint_t scope_flags : 7;
     mp_uint_t n_pos_args : 11;
     const void *fun_data;
-    const mp_uint_t *const_table;
+    #if MICROPY_PERSISTENT_CODE_SAVE || MICROPY_DEBUG_PRINTERS
+    size_t fun_data_len; // so mp_raw_code_save and mp_bytecode_print work
+    #endif
+    struct _mp_raw_code_t **children;
     #if MICROPY_PERSISTENT_CODE_SAVE
-    size_t fun_data_len;
-    uint16_t n_obj;
-    uint16_t n_raw_code;
+    size_t n_children;
     #if MICROPY_PY_SYS_SETTRACE
     mp_bytecode_prelude_t prelude;
     // line_of_definition is a Python source line where the raw_code was
@@ -74,8 +73,6 @@ typedef struct _mp_raw_code_t {
     #endif
     #if MICROPY_EMIT_MACHINE_CODE
     uint16_t prelude_offset;
-    uint16_t n_qstr;
-    mp_qstr_link_entry_t *qstr_link;
     #endif
     #endif
     #if MICROPY_EMIT_MACHINE_CODE
@@ -89,22 +86,21 @@ void mp_emit_glue_assign_bytecode(mp_raw_code_t *rc, const byte *code,
     #if MICROPY_PERSISTENT_CODE_SAVE || MICROPY_DEBUG_PRINTERS
     size_t len,
     #endif
-    const mp_uint_t *const_table,
+    mp_raw_code_t **children,
     #if MICROPY_PERSISTENT_CODE_SAVE
-    uint16_t n_obj, uint16_t n_raw_code,
+    size_t n_children,
     #endif
     mp_uint_t scope_flags);
 
 void mp_emit_glue_assign_native(mp_raw_code_t *rc, mp_raw_code_kind_t kind, void *fun_data, mp_uint_t fun_len,
-    const mp_uint_t *const_table,
+    mp_raw_code_t **children,
     #if MICROPY_PERSISTENT_CODE_SAVE
+    size_t n_children,
     uint16_t prelude_offset,
-    uint16_t n_obj, uint16_t n_raw_code,
-    uint16_t n_qstr, mp_qstr_link_entry_t *qstr_link,
     #endif
-    mp_uint_t n_pos_args, mp_uint_t scope_flags, mp_uint_t type_sig);
+    mp_uint_t scope_flags, mp_uint_t n_pos_args, mp_uint_t type_sig);
 
-mp_obj_t mp_make_function_from_raw_code(const mp_raw_code_t *rc, mp_obj_t def_args, mp_obj_t def_kw_args);
-mp_obj_t mp_make_closure_from_raw_code(const mp_raw_code_t *rc, mp_uint_t n_closed_over, const mp_obj_t *args);
+mp_obj_t mp_make_function_from_raw_code(const mp_raw_code_t *rc, const mp_module_context_t *context, const mp_obj_t *def_args);
+mp_obj_t mp_make_closure_from_raw_code(const mp_raw_code_t *rc, const mp_module_context_t *context, mp_uint_t n_closed_over, const mp_obj_t *args);
 
 #endif // MICROPY_INCLUDED_PY_EMITGLUE_H
diff --git a/python/src/py/emitinlinethumb.c b/python/src/py/emitinlinethumb.c
index 1a35e25ad..29487f104 100644
--- a/python/src/py/emitinlinethumb.c
+++ b/python/src/py/emitinlinethumb.c
@@ -59,6 +59,21 @@ struct _emit_inline_asm_t {
     qstr *label_lookup;
 };
 
+#if MICROPY_DYNAMIC_COMPILER
+
+static inline bool emit_inline_thumb_allow_float(emit_inline_asm_t *emit) { // emit is not read in this variant
+    return MP_NATIVE_ARCH_ARMV7EMSP <= mp_dynamic_compiler.native_arch // true only when the selected native_arch lies in
+           && mp_dynamic_compiler.native_arch <= MP_NATIVE_ARCH_ARMV7EMDP; // the inclusive range ARMV7EMSP..ARMV7EMDP
+}
+
+#else
+
+static inline bool emit_inline_thumb_allow_float(emit_inline_asm_t *emit) { // emit is not read in this variant
+    return MICROPY_EMIT_INLINE_THUMB_FLOAT; // without MICROPY_DYNAMIC_COMPILER the answer is this build-time setting
+}
+
+#endif
+
 STATIC void emit_inline_thumb_error_msg(emit_inline_asm_t *emit, mp_rom_error_text_t msg) {
     *emit->error_slot = mp_obj_new_exception_msg(&mp_type_SyntaxError, msg);
 }
@@ -216,7 +231,6 @@ STATIC mp_uint_t get_arg_special_reg(emit_inline_asm_t *emit, const char *op, mp
     return 0;
 }
 
-#if MICROPY_EMIT_INLINE_THUMB_FLOAT
 STATIC mp_uint_t get_arg_vfpreg(emit_inline_asm_t *emit, const char *op, mp_parse_node_t pn) {
     const char *reg_str = get_arg_str(pn);
     if (reg_str[0] == 's' && reg_str[1] != '\0') {
@@ -243,7 +257,6 @@ malformed:
             MP_ERROR_TEXT("'%s' expects an FPU register"), op));
     return 0;
 }
-#endif
 
 STATIC mp_uint_t get_arg_reglist(emit_inline_asm_t *emit, const char *op, mp_parse_node_t pn) {
     // a register list looks like {r0, r1, r2} and is parsed as a Python set
@@ -409,10 +422,10 @@ STATIC const format_9_10_op_t format_9_10_op_table[] = {
 };
 #undef X
 
-#if MICROPY_EMIT_INLINE_THUMB_FLOAT
 // actual opcodes are: 0xee00 | op.hi_nibble, 0x0a00 | op.lo_nibble
-typedef struct _format_vfp_op_t { byte op;
-                                  char name[3];
+typedef struct _format_vfp_op_t {
+    byte op;
+    char name[3];
 } format_vfp_op_t;
 STATIC const format_vfp_op_t format_vfp_op_table[] = {
     { 0x30, "add" },
@@ -420,10 +433,9 @@ STATIC const format_vfp_op_t format_vfp_op_table[] = {
     { 0x20, "mul" },
     { 0x80, "div" },
 };
-#endif
 
 // shorthand alias for whether we allow ARMv7-M instructions
-#define ARMV7M MICROPY_EMIT_INLINE_THUMB_ARMV7M
+#define ARMV7M asm_thumb_allow_armv7m(&emit->as)
 
 STATIC void emit_inline_thumb_op(emit_inline_asm_t *emit, qstr op, mp_uint_t n_args, mp_parse_node_t *pn_args) {
     // TODO perhaps make two tables:
@@ -439,8 +451,7 @@ STATIC void emit_inline_thumb_op(emit_inline_asm_t *emit, qstr op, mp_uint_t n_a
     size_t op_len;
     const char *op_str = (const char *)qstr_data(op, &op_len);
 
-    #if MICROPY_EMIT_INLINE_THUMB_FLOAT
-    if (op_str[0] == 'v') {
+    if (emit_inline_thumb_allow_float(emit) && op_str[0] == 'v') {
         // floating point operations
         if (n_args == 2) {
             mp_uint_t op_code = 0x0ac0, op_code_hi;
@@ -535,7 +546,6 @@ STATIC void emit_inline_thumb_op(emit_inline_asm_t *emit, qstr op, mp_uint_t n_a
         }
         return;
     }
-    #endif
 
     if (n_args == 0) {
         if (op == MP_QSTR_nop) {
@@ -621,8 +631,13 @@ STATIC void emit_inline_thumb_op(emit_inline_asm_t *emit, qstr op, mp_uint_t n_a
             asm_thumb_op16(&emit->as, ASM_THUMB_OP_CPSIE_I);
         } else if (op == MP_QSTR_push) {
             mp_uint_t reglist = get_arg_reglist(emit, op_str, pn_args[0]);
-            if ((reglist & 0xff00) == 0) {
-                asm_thumb_op16(&emit->as, 0xb400 | reglist);
+            if ((reglist & 0xbf00) == 0) {
+                if ((reglist & (1 << 14)) == 0) {
+                    asm_thumb_op16(&emit->as, 0xb400 | reglist);
+                } else {
+                    // 16-bit encoding for pushing low registers and LR
+                    asm_thumb_op16(&emit->as, 0xb500 | (reglist & 0xff));
+                }
             } else {
                 if (!ARMV7M) {
                     goto unknown_op;
@@ -631,8 +646,13 @@ STATIC void emit_inline_thumb_op(emit_inline_asm_t *emit, qstr op, mp_uint_t n_a
             }
         } else if (op == MP_QSTR_pop) {
             mp_uint_t reglist = get_arg_reglist(emit, op_str, pn_args[0]);
-            if ((reglist & 0xff00) == 0) {
-                asm_thumb_op16(&emit->as, 0xbc00 | reglist);
+            if ((reglist & 0x7f00) == 0) {
+                if ((reglist & (1 << 15)) == 0) {
+                    asm_thumb_op16(&emit->as, 0xbc00 | reglist);
+                } else {
+                    // 16-bit encoding for popping low registers and PC, i.e., returning
+                    asm_thumb_op16(&emit->as, 0xbd00 | (reglist & 0xff));
+                }
             } else {
                 if (!ARMV7M) {
                     goto unknown_op;
@@ -705,24 +725,23 @@ STATIC void emit_inline_thumb_op(emit_inline_asm_t *emit, qstr op, mp_uint_t n_a
             } else if (op == MP_QSTR_sub) {
                 op_code = ASM_THUMB_FORMAT_3_SUB;
                 goto op_format_3;
-            #if ARMV7M
-            } else if (op == MP_QSTR_movw) {
+            } else if (ARMV7M && op == MP_QSTR_movw) {
                 op_code = ASM_THUMB_OP_MOVW;
                 mp_uint_t reg_dest;
             op_movw_movt:
                 reg_dest = get_arg_reg(emit, op_str, pn_args[0], 15);
                 int i_src = get_arg_i(emit, op_str, pn_args[1], 0xffff);
                 asm_thumb_mov_reg_i16(&emit->as, op_code, reg_dest, i_src);
-            } else if (op == MP_QSTR_movt) {
+            } else if (ARMV7M && op == MP_QSTR_movt) {
                 op_code = ASM_THUMB_OP_MOVT;
                 goto op_movw_movt;
-            } else if (op == MP_QSTR_movwt) {
+            } else if (ARMV7M && op == MP_QSTR_movwt) {
                 // this is a convenience instruction
                 mp_uint_t reg_dest = get_arg_reg(emit, op_str, pn_args[0], 15);
                 uint32_t i_src = get_arg_i(emit, op_str, pn_args[1], 0xffffffff);
                 asm_thumb_mov_reg_i16(&emit->as, ASM_THUMB_OP_MOVW, reg_dest, i_src & 0xffff);
                 asm_thumb_mov_reg_i16(&emit->as, ASM_THUMB_OP_MOVT, reg_dest, (i_src >> 16) & 0xffff);
-            } else if (op == MP_QSTR_ldrex) {
+            } else if (ARMV7M && op == MP_QSTR_ldrex) {
                 mp_uint_t r_dest = get_arg_reg(emit, op_str, pn_args[0], 15);
                 mp_parse_node_t pn_base, pn_offset;
                 if (get_arg_addr(emit, op_str, pn_args[1], &pn_base, &pn_offset)) {
@@ -730,7 +749,6 @@ STATIC void emit_inline_thumb_op(emit_inline_asm_t *emit, qstr op, mp_uint_t n_a
                     mp_uint_t i8 = get_arg_i(emit, op_str, pn_offset, 0xff) >> 2;
                     asm_thumb_op32(&emit->as, 0xe850 | r_base, 0x0f00 | (r_dest << 12) | i8);
                 }
-            #endif
             } else {
                 // search table for ldr/str instructions
                 for (mp_uint_t i = 0; i < MP_ARRAY_SIZE(format_9_10_op_table); i++) {
diff --git a/python/src/py/emitnarm.c b/python/src/py/emitnarm.c
index 8297ad619..59075b607 100644
--- a/python/src/py/emitnarm.c
+++ b/python/src/py/emitnarm.c
@@ -10,8 +10,6 @@
 
 // Word indices of REG_LOCAL_x in nlr_buf_t
 #define NLR_BUF_IDX_LOCAL_1 (3) // r4
-#define NLR_BUF_IDX_LOCAL_2 (4) // r5
-#define NLR_BUF_IDX_LOCAL_3 (5) // r6
 
 #define N_ARM (1)
 #define EXPORT_FUN(name) emit_native_arm_##name
diff --git a/python/src/py/emitnative.c b/python/src/py/emitnative.c
index f63b6d289..6683ea420 100644
--- a/python/src/py/emitnative.c
+++ b/python/src/py/emitnative.c
@@ -48,6 +48,7 @@
 
 #include "py/emit.h"
 #include "py/nativeglue.h"
+#include "py/objfun.h"
 #include "py/objstr.h"
 
 #if MICROPY_DEBUG_VERBOSE // print debugging info
@@ -62,20 +63,26 @@
 
 // C stack layout for native functions:
 //  0:                          nlr_buf_t [optional]
-//  emit->code_state_start:     mp_code_state_t
+//                              return_value [optional word]
+//                              exc_handler_unwind [optional word]
+//  emit->code_state_start:     mp_code_state_native_t
 //  emit->stack_start:          Python object stack             | emit->n_state
 //                              locals (reversed, L0 at end)    |
 //
 // C stack layout for native generator functions:
 //  0=emit->stack_start:        nlr_buf_t
+//                              return_value
+//                              exc_handler_unwind [optional word]
 //
 //  Then REG_GENERATOR_STATE points to:
-//  0=emit->code_state_start:   mp_code_state_t
+//  0=emit->code_state_start:   mp_code_state_native_t
 //  emit->stack_start:          Python object stack             | emit->n_state
 //                              locals (reversed, L0 at end)    |
 //
 // C stack layout for viper functions:
 //  0:                          nlr_buf_t [optional]
+//                              return_value [optional word]
+//                              exc_handler_unwind [optional word]
 //  emit->code_state_start:     fun_obj, old_globals [optional]
 //  emit->stack_start:          Python object stack             | emit->n_state
 //                              locals (reversed, L0 at end)    |
@@ -87,14 +94,18 @@
 #else
 #define SIZEOF_NLR_BUF (sizeof(nlr_buf_t) / sizeof(uintptr_t))
 #endif
-#define SIZEOF_CODE_STATE (sizeof(mp_code_state_t) / sizeof(uintptr_t))
-#define OFFSETOF_CODE_STATE_STATE (offsetof(mp_code_state_t, state) / sizeof(uintptr_t))
-#define OFFSETOF_CODE_STATE_FUN_BC (offsetof(mp_code_state_t, fun_bc) / sizeof(uintptr_t))
-#define OFFSETOF_CODE_STATE_IP (offsetof(mp_code_state_t, ip) / sizeof(uintptr_t))
-#define OFFSETOF_CODE_STATE_SP (offsetof(mp_code_state_t, sp) / sizeof(uintptr_t))
-#define OFFSETOF_OBJ_FUN_BC_GLOBALS (offsetof(mp_obj_fun_bc_t, globals) / sizeof(uintptr_t))
+#define SIZEOF_CODE_STATE (sizeof(mp_code_state_native_t) / sizeof(uintptr_t))
+#define OFFSETOF_CODE_STATE_STATE (offsetof(mp_code_state_native_t, state) / sizeof(uintptr_t))
+#define OFFSETOF_CODE_STATE_FUN_BC (offsetof(mp_code_state_native_t, fun_bc) / sizeof(uintptr_t))
+#define OFFSETOF_CODE_STATE_IP (offsetof(mp_code_state_native_t, ip) / sizeof(uintptr_t))
+#define OFFSETOF_CODE_STATE_SP (offsetof(mp_code_state_native_t, sp) / sizeof(uintptr_t))
+#define OFFSETOF_CODE_STATE_N_STATE (offsetof(mp_code_state_native_t, n_state) / sizeof(uintptr_t))
+#define OFFSETOF_OBJ_FUN_BC_CONTEXT (offsetof(mp_obj_fun_bc_t, context) / sizeof(uintptr_t))
+#define OFFSETOF_OBJ_FUN_BC_CHILD_TABLE (offsetof(mp_obj_fun_bc_t, child_table) / sizeof(uintptr_t))
 #define OFFSETOF_OBJ_FUN_BC_BYTECODE (offsetof(mp_obj_fun_bc_t, bytecode) / sizeof(uintptr_t))
-#define OFFSETOF_OBJ_FUN_BC_CONST_TABLE (offsetof(mp_obj_fun_bc_t, const_table) / sizeof(uintptr_t))
+#define OFFSETOF_MODULE_CONTEXT_QSTR_TABLE (offsetof(mp_module_context_t, constants.qstr_table) / sizeof(uintptr_t))
+#define OFFSETOF_MODULE_CONTEXT_OBJ_TABLE (offsetof(mp_module_context_t, constants.obj_table) / sizeof(uintptr_t))
+#define OFFSETOF_MODULE_CONTEXT_GLOBALS (offsetof(mp_module_context_t, module.globals) / sizeof(uintptr_t))
 
 // If not already defined, set parent args to same as child call registers
 #ifndef REG_PARENT_RET
@@ -116,6 +127,9 @@
 #define NEED_GLOBAL_EXC_HANDLER(emit) ((emit)->scope->exc_stack_size > 0 \
     || ((emit)->scope->scope_flags & (MP_SCOPE_FLAG_GENERATOR | MP_SCOPE_FLAG_REFGLOBALS)))
 
+// Whether a slot is needed to store LOCAL_IDX_EXC_HANDLER_UNWIND
+#define NEED_EXC_HANDLER_UNWIND(emit) ((emit)->scope->exc_stack_size > 0)
+
 // Whether registers can be used to store locals (only true if there are no
 // exception handlers, because otherwise an nlr_jump will restore registers to
 // their state at the start of the function and updates to locals will be lost)
@@ -124,14 +138,41 @@
 // Indices within the local C stack for various variables
 #define LOCAL_IDX_EXC_VAL(emit) (NLR_BUF_IDX_RET_VAL)
 #define LOCAL_IDX_EXC_HANDLER_PC(emit) (NLR_BUF_IDX_LOCAL_1)
-#define LOCAL_IDX_EXC_HANDLER_UNWIND(emit) (NLR_BUF_IDX_LOCAL_2)
-#define LOCAL_IDX_RET_VAL(emit) (NLR_BUF_IDX_LOCAL_3)
+#define LOCAL_IDX_EXC_HANDLER_UNWIND(emit) (SIZEOF_NLR_BUF + 1) // this needs a dedicated variable outside nlr_buf_t
+#define LOCAL_IDX_RET_VAL(emit) (SIZEOF_NLR_BUF) // needed when NEED_GLOBAL_EXC_HANDLER is true
 #define LOCAL_IDX_FUN_OBJ(emit) ((emit)->code_state_start + OFFSETOF_CODE_STATE_FUN_BC)
 #define LOCAL_IDX_OLD_GLOBALS(emit) ((emit)->code_state_start + OFFSETOF_CODE_STATE_IP)
 #define LOCAL_IDX_GEN_PC(emit) ((emit)->code_state_start + OFFSETOF_CODE_STATE_IP)
 #define LOCAL_IDX_LOCAL_VAR(emit, local_num) ((emit)->stack_start + (emit)->n_state - 1 - (local_num))
 
+#if MICROPY_PERSISTENT_CODE_SAVE
+
+// When building with the ability to save native code to .mpy files:
+//  - Qstrs are indirect via qstr_table, and REG_LOCAL_3 always points to qstr_table.
+//  - In a generator no registers are used to store locals, and REG_LOCAL_2 points to the generator state.
+//  - At most 2 registers hold local variables (see CAN_USE_REGS_FOR_LOCALS for when this is possible).
+
+#define REG_GENERATOR_STATE (REG_LOCAL_2)
+#define REG_QSTR_TABLE (REG_LOCAL_3)
+#define MAX_REGS_FOR_LOCAL_VARS (2)
+
+STATIC const uint8_t reg_local_table[MAX_REGS_FOR_LOCAL_VARS] = {REG_LOCAL_1, REG_LOCAL_2};
+
+#else
+
+// When building without the ability to save native code to .mpy files:
+//  - Qstr values are written directly into the machine code.
+//  - In a generator no registers are used to store locals, and REG_LOCAL_3 points to the generator state.
+//  - At most 3 registers hold local variables (see CAN_USE_REGS_FOR_LOCALS for when this is possible).
+
 #define REG_GENERATOR_STATE (REG_LOCAL_3)
+#define MAX_REGS_FOR_LOCAL_VARS (3)
+
+STATIC const uint8_t reg_local_table[MAX_REGS_FOR_LOCAL_VARS] = {REG_LOCAL_1, REG_LOCAL_2, REG_LOCAL_3};
+
+#endif
+
+#define REG_LOCAL_LAST (reg_local_table[MAX_REGS_FOR_LOCAL_VARS - 1])
 
 #define EMIT_NATIVE_VIPER_TYPE_ERROR(emit, ...) do { \
         *emit->error_slot = mp_obj_new_exception_msg_varg(&mp_type_ViperTypeError, __VA_ARGS__); \
@@ -205,6 +246,7 @@ typedef struct _exc_stack_entry_t {
 } exc_stack_entry_t;
 
 struct _emit_t {
+    mp_emit_common_t *emit_common;
     mp_obj_t *error_slot;
     uint *label_slot;
     uint exit_label;
@@ -225,23 +267,15 @@ struct _emit_t {
     exc_stack_entry_t *exc_stack;
 
     int prelude_offset;
+    int prelude_ptr_index;
     int start_offset;
     int n_state;
     uint16_t code_state_start;
     uint16_t stack_start;
     int stack_size;
+    uint16_t n_info;
     uint16_t n_cell;
 
-    uint16_t const_table_cur_obj;
-    uint16_t const_table_num_obj;
-    uint16_t const_table_cur_raw_code;
-    mp_uint_t *const_table;
-
-    #if MICROPY_PERSISTENT_CODE_SAVE
-    uint16_t qstr_link_cur;
-    mp_qstr_link_entry_t *qstr_link;
-    #endif
-
     bool last_emit_was_return_value;
 
     scope_t *scope;
@@ -249,14 +283,14 @@ struct _emit_t {
     ASM_T *as;
 };
 
-STATIC const uint8_t reg_local_table[REG_LOCAL_NUM] = {REG_LOCAL_1, REG_LOCAL_2, REG_LOCAL_3};
-
+STATIC void emit_load_reg_with_object(emit_t *emit, int reg, mp_obj_t obj);
 STATIC void emit_native_global_exc_entry(emit_t *emit);
 STATIC void emit_native_global_exc_exit(emit_t *emit);
 STATIC void emit_native_load_const_obj(emit_t *emit, mp_obj_t obj);
 
-emit_t *EXPORT_FUN(new)(mp_obj_t * error_slot, uint *label_slot, mp_uint_t max_num_labels) {
+emit_t *EXPORT_FUN(new)(mp_emit_common_t * emit_common, mp_obj_t *error_slot, uint *label_slot, mp_uint_t max_num_labels) {
     emit_t *emit = m_new0(emit_t, 1);
+    emit->emit_common = emit_common;
     emit->error_slot = error_slot;
     emit->label_slot = label_slot;
     emit->stack_info_alloc = 8;
@@ -310,12 +344,7 @@ STATIC void emit_native_mov_reg_state_addr(emit_t *emit, int reg_dest, int local
 
 STATIC void emit_native_mov_reg_qstr(emit_t *emit, int arg_reg, qstr qst) {
     #if MICROPY_PERSISTENT_CODE_SAVE
-    size_t loc = ASM_MOV_REG_IMM_FIX_U16(emit->as, arg_reg, qst);
-    size_t link_idx = emit->qstr_link_cur++;
-    if (emit->pass == MP_PASS_EMIT) {
-        emit->qstr_link[link_idx].off = loc << 2 | 1;
-        emit->qstr_link[link_idx].qst = qst;
-    }
+    ASM_LOAD16_REG_REG_OFFSET(emit->as, arg_reg, REG_QSTR_TABLE, mp_emit_common_use_qstr(emit->emit_common, qst));
     #else
     ASM_MOV_REG_IMM(emit->as, arg_reg, qst);
     #endif
@@ -323,12 +352,7 @@ STATIC void emit_native_mov_reg_qstr(emit_t *emit, int arg_reg, qstr qst) {
 
 STATIC void emit_native_mov_reg_qstr_obj(emit_t *emit, int reg_dest, qstr qst) {
     #if MICROPY_PERSISTENT_CODE_SAVE
-    size_t loc = ASM_MOV_REG_IMM_FIX_WORD(emit->as, reg_dest, (mp_uint_t)MP_OBJ_NEW_QSTR(qst));
-    size_t link_idx = emit->qstr_link_cur++;
-    if (emit->pass == MP_PASS_EMIT) {
-        emit->qstr_link[link_idx].off = loc << 2 | 2;
-        emit->qstr_link[link_idx].qst = qst;
-    }
+    emit_load_reg_with_object(emit, reg_dest, MP_OBJ_NEW_QSTR(qst));
     #else
     ASM_MOV_REG_IMM(emit->as, reg_dest, (mp_uint_t)MP_OBJ_NEW_QSTR(qst));
     #endif
@@ -340,33 +364,12 @@ STATIC void emit_native_mov_reg_qstr_obj(emit_t *emit, int reg_dest, qstr qst) {
         emit_native_mov_state_reg((emit), (local_num), (reg_temp)); \
     } while (false)
 
-#define emit_native_mov_state_imm_fix_u16_via(emit, local_num, imm, reg_temp) \
-    do { \
-        ASM_MOV_REG_IMM_FIX_U16((emit)->as, (reg_temp), (imm)); \
-        emit_native_mov_state_reg((emit), (local_num), (reg_temp)); \
-    } while (false)
-
-#define emit_native_mov_state_imm_fix_word_via(emit, local_num, imm, reg_temp) \
-    do { \
-        ASM_MOV_REG_IMM_FIX_WORD((emit)->as, (reg_temp), (imm)); \
-        emit_native_mov_state_reg((emit), (local_num), (reg_temp)); \
-    } while (false)
-
 STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scope) {
     DEBUG_printf("start_pass(pass=%u, scope=%p)\n", pass, scope);
 
     emit->pass = pass;
     emit->do_viper_types = scope->emit_options == MP_EMIT_OPT_VIPER;
     emit->stack_size = 0;
-    #if N_PRELUDE_AS_BYTES_OBJ
-    emit->const_table_cur_obj = emit->do_viper_types ? 0 : 1; // reserve first obj for prelude bytes obj
-    #else
-    emit->const_table_cur_obj = 0;
-    #endif
-    emit->const_table_cur_raw_code = 0;
-    #if MICROPY_PERSISTENT_CODE_SAVE
-    emit->qstr_link_cur = 0;
-    #endif
     emit->last_emit_was_return_value = false;
     emit->scope = scope;
 
@@ -414,12 +417,18 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
 
     // generate code for entry to function
 
-    // Work out start of code state (mp_code_state_t or reduced version for viper)
+    // Work out start of code state (mp_code_state_native_t or reduced version for viper)
     emit->code_state_start = 0;
     if (NEED_GLOBAL_EXC_HANDLER(emit)) {
-        emit->code_state_start = SIZEOF_NLR_BUF;
+        emit->code_state_start = SIZEOF_NLR_BUF; // for nlr_buf_t
+        emit->code_state_start += 1;  // for return_value
+        if (NEED_EXC_HANDLER_UNWIND(emit)) {
+            emit->code_state_start += 1;
+        }
     }
 
+    size_t fun_table_off = mp_emit_common_use_const_obj(emit->emit_common, MP_OBJ_FROM_PTR(&mp_fun_table));
+
     if (emit->do_viper_types) {
         // Work out size of state (locals plus stack)
         // n_state counts all stack and locals, even those in registers
@@ -427,11 +436,11 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
         int num_locals_in_regs = 0;
         if (CAN_USE_REGS_FOR_LOCALS(emit)) {
             num_locals_in_regs = scope->num_locals;
-            if (num_locals_in_regs > REG_LOCAL_NUM) {
-                num_locals_in_regs = REG_LOCAL_NUM;
+            if (num_locals_in_regs > MAX_REGS_FOR_LOCAL_VARS) {
+                num_locals_in_regs = MAX_REGS_FOR_LOCAL_VARS;
             }
-            // Need a spot for REG_LOCAL_3 if 4 or more args (see below)
-            if (scope->num_pos_args >= 4) {
+            // Need a spot for REG_LOCAL_LAST (see below)
+            if (scope->num_pos_args >= MAX_REGS_FOR_LOCAL_VARS + 1) {
                 --num_locals_in_regs;
             }
         }
@@ -455,23 +464,27 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
         #endif
 
         // Load REG_FUN_TABLE with a pointer to mp_fun_table, found in the const_table
-        ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_CONST_TABLE);
-        ASM_LOAD_REG_REG_OFFSET(emit->as, REG_FUN_TABLE, REG_LOCAL_3, 0);
+        ASM_LOAD_REG_REG_OFFSET(emit->as, REG_FUN_TABLE, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_CONTEXT);
+        #if MICROPY_PERSISTENT_CODE_SAVE
+        ASM_LOAD_REG_REG_OFFSET(emit->as, REG_QSTR_TABLE, REG_FUN_TABLE, OFFSETOF_MODULE_CONTEXT_QSTR_TABLE);
+        #endif
+        ASM_LOAD_REG_REG_OFFSET(emit->as, REG_FUN_TABLE, REG_FUN_TABLE, OFFSETOF_MODULE_CONTEXT_OBJ_TABLE);
+        ASM_LOAD_REG_REG_OFFSET(emit->as, REG_FUN_TABLE, REG_FUN_TABLE, fun_table_off);
 
         // Store function object (passed as first arg) to stack if needed
         if (NEED_FUN_OBJ(emit)) {
             ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_FUN_OBJ(emit), REG_PARENT_ARG_1);
         }
 
-        // Put n_args in REG_ARG_1, n_kw in REG_ARG_2, args array in REG_LOCAL_3
+        // Put n_args in REG_ARG_1, n_kw in REG_ARG_2, args array in REG_LOCAL_LAST
         #if N_X86
         asm_x86_mov_arg_to_r32(emit->as, 1, REG_ARG_1);
         asm_x86_mov_arg_to_r32(emit->as, 2, REG_ARG_2);
-        asm_x86_mov_arg_to_r32(emit->as, 3, REG_LOCAL_3);
+        asm_x86_mov_arg_to_r32(emit->as, 3, REG_LOCAL_LAST);
         #else
         ASM_MOV_REG_REG(emit->as, REG_ARG_1, REG_PARENT_ARG_2);
         ASM_MOV_REG_REG(emit->as, REG_ARG_2, REG_PARENT_ARG_3);
-        ASM_MOV_REG_REG(emit->as, REG_LOCAL_3, REG_PARENT_ARG_4);
+        ASM_MOV_REG_REG(emit->as, REG_LOCAL_LAST, REG_PARENT_ARG_4);
         #endif
 
         // Check number of args matches this function, and call mp_arg_check_num_sig if not
@@ -486,21 +499,21 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
         // Store arguments into locals (reg or stack), converting to native if needed
         for (int i = 0; i < emit->scope->num_pos_args; i++) {
             int r = REG_ARG_1;
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_ARG_1, REG_LOCAL_3, i);
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_ARG_1, REG_LOCAL_LAST, i);
             if (emit->local_vtype[i] != VTYPE_PYOBJ) {
                 emit_call_with_imm_arg(emit, MP_F_CONVERT_OBJ_TO_NATIVE, emit->local_vtype[i], REG_ARG_2);
                 r = REG_RET;
             }
-            // REG_LOCAL_3 points to the args array so be sure not to overwrite it if it's still needed
-            if (i < REG_LOCAL_NUM && CAN_USE_REGS_FOR_LOCALS(emit) && (i != 2 || emit->scope->num_pos_args == 3)) {
+            // REG_LOCAL_LAST points to the args array so be sure not to overwrite it if it's still needed
+            if (i < MAX_REGS_FOR_LOCAL_VARS && CAN_USE_REGS_FOR_LOCALS(emit) && (i != MAX_REGS_FOR_LOCAL_VARS - 1 || emit->scope->num_pos_args == MAX_REGS_FOR_LOCAL_VARS)) {
                 ASM_MOV_REG_REG(emit->as, reg_local_table[i], r);
             } else {
                 emit_native_mov_state_reg(emit, LOCAL_IDX_LOCAL_VAR(emit, i), r);
             }
         }
-        // Get 3rd local from the stack back into REG_LOCAL_3 if this reg couldn't be written to above
-        if (emit->scope->num_pos_args >= 4 && CAN_USE_REGS_FOR_LOCALS(emit)) {
-            ASM_MOV_REG_LOCAL(emit->as, REG_LOCAL_3, LOCAL_IDX_LOCAL_VAR(emit, 2));
+        // Get local from the stack back into REG_LOCAL_LAST if this reg couldn't be written to above
+        if (emit->scope->num_pos_args >= MAX_REGS_FOR_LOCAL_VARS + 1 && CAN_USE_REGS_FOR_LOCALS(emit)) {
+            ASM_MOV_REG_LOCAL(emit->as, REG_LOCAL_LAST, LOCAL_IDX_LOCAL_VAR(emit, MAX_REGS_FOR_LOCAL_VARS - 1));
         }
 
         emit_native_global_exc_entry(emit);
@@ -510,16 +523,13 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
         emit->n_state = scope->num_locals + scope->stack_size;
 
         if (emit->scope->scope_flags & MP_SCOPE_FLAG_GENERATOR) {
+            mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->prelude_ptr_index);
+            mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->start_offset);
+            ASM_ENTRY(emit->as, emit->code_state_start);
+
+            // Reset the state size for the state pointed to by REG_GENERATOR_STATE
             emit->code_state_start = 0;
             emit->stack_start = SIZEOF_CODE_STATE;
-            #if N_PRELUDE_AS_BYTES_OBJ
-            // Load index of prelude bytes object in const_table
-            mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)(emit->scope->num_pos_args + emit->scope->num_kwonly_args + 1));
-            #else
-            mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->prelude_offset);
-            #endif
-            mp_asm_base_data(&emit->as->base, ASM_WORD_SIZE, (uintptr_t)emit->start_offset);
-            ASM_ENTRY(emit->as, SIZEOF_NLR_BUF);
 
             // Put address of code_state into REG_GENERATOR_STATE
             #if N_X86
@@ -536,8 +546,12 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
 
             // Load REG_FUN_TABLE with a pointer to mp_fun_table, found in the const_table
             ASM_LOAD_REG_REG_OFFSET(emit->as, REG_TEMP0, REG_GENERATOR_STATE, LOCAL_IDX_FUN_OBJ(emit));
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_TEMP0, REG_TEMP0, OFFSETOF_OBJ_FUN_BC_CONST_TABLE);
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_FUN_TABLE, REG_TEMP0, emit->scope->num_pos_args + emit->scope->num_kwonly_args);
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_TEMP0, REG_TEMP0, OFFSETOF_OBJ_FUN_BC_CONTEXT);
+            #if MICROPY_PERSISTENT_CODE_SAVE
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_QSTR_TABLE, REG_TEMP0, OFFSETOF_MODULE_CONTEXT_QSTR_TABLE);
+            #endif
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_TEMP0, REG_TEMP0, OFFSETOF_MODULE_CONTEXT_OBJ_TABLE);
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_FUN_TABLE, REG_TEMP0, fun_table_off);
         } else {
             // The locals and stack start after the code_state structure
             emit->stack_start = emit->code_state_start + SIZEOF_CODE_STATE;
@@ -555,38 +569,27 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
             #endif
 
             // Load REG_FUN_TABLE with a pointer to mp_fun_table, found in the const_table
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_CONST_TABLE);
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_FUN_TABLE, REG_LOCAL_3, emit->scope->num_pos_args + emit->scope->num_kwonly_args);
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_FUN_TABLE, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_CONTEXT);
+            #if MICROPY_PERSISTENT_CODE_SAVE
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_QSTR_TABLE, REG_FUN_TABLE, OFFSETOF_MODULE_CONTEXT_QSTR_TABLE);
+            #endif
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_FUN_TABLE, REG_FUN_TABLE, OFFSETOF_MODULE_CONTEXT_OBJ_TABLE);
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_FUN_TABLE, REG_FUN_TABLE, fun_table_off);
 
             // Set code_state.fun_bc
             ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_FUN_OBJ(emit), REG_PARENT_ARG_1);
 
-            // Set code_state.ip (offset from start of this function to prelude info)
-            int code_state_ip_local = emit->code_state_start + OFFSETOF_CODE_STATE_IP;
-            #if N_PRELUDE_AS_BYTES_OBJ
-            // Prelude is a bytes object in const_table; store ip = prelude->data - fun_bc->bytecode
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_LOCAL_3, emit->scope->num_pos_args + emit->scope->num_kwonly_args + 1);
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_3, REG_LOCAL_3, offsetof(mp_obj_str_t, data) / sizeof(uintptr_t));
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_PARENT_ARG_1, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_BYTECODE);
-            ASM_SUB_REG_REG(emit->as, REG_LOCAL_3, REG_PARENT_ARG_1);
-            emit_native_mov_state_reg(emit, code_state_ip_local, REG_LOCAL_3);
-            #else
-            if (emit->pass == MP_PASS_CODE_SIZE) {
-                // Commit to the encoding size based on the value of prelude_offset in this pass.
-                // By using 32768 as the cut-off it is highly unlikely that prelude_offset will
-                // grow beyond 65535 by the end of this pass, and so require the larger encoding.
-                emit->prelude_offset_uses_u16_encoding = emit->prelude_offset < 32768;
+            // Set code_state.ip, a pointer to the beginning of the prelude.  This pointer is found
+            // either directly in mp_obj_fun_bc_t.child_table (if there are no children), or in
+            // mp_obj_fun_bc_t.child_table[num_children] (if num_children > 0).
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_PARENT_ARG_1, REG_PARENT_ARG_1, OFFSETOF_OBJ_FUN_BC_CHILD_TABLE);
+            if (emit->prelude_ptr_index != 0) {
+                ASM_LOAD_REG_REG_OFFSET(emit->as, REG_PARENT_ARG_1, REG_PARENT_ARG_1, emit->prelude_ptr_index);
             }
-            if (emit->prelude_offset_uses_u16_encoding) {
-                assert(emit->prelude_offset <= 65535);
-                emit_native_mov_state_imm_fix_u16_via(emit, code_state_ip_local, emit->prelude_offset, REG_PARENT_ARG_1);
-            } else {
-                emit_native_mov_state_imm_fix_word_via(emit, code_state_ip_local, emit->prelude_offset, REG_PARENT_ARG_1);
-            }
-            #endif
+            emit_native_mov_state_reg(emit, emit->code_state_start + OFFSETOF_CODE_STATE_IP, REG_PARENT_ARG_1);
 
             // Set code_state.n_state (only works on little endian targets due to n_state being uint16_t)
-            emit_native_mov_state_imm_via(emit, emit->code_state_start + offsetof(mp_code_state_t, n_state) / sizeof(uintptr_t), emit->n_state, REG_ARG_1);
+            emit_native_mov_state_imm_via(emit, emit->code_state_start + OFFSETOF_CODE_STATE_N_STATE, emit->n_state, REG_ARG_1);
 
             // Put address of code_state into first arg
             ASM_MOV_REG_LOCAL_ADDR(emit->as, REG_ARG_1, emit->code_state_start);
@@ -616,7 +619,7 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
 
         // cache some locals in registers, but only if no exception handlers
         if (CAN_USE_REGS_FOR_LOCALS(emit)) {
-            for (int i = 0; i < REG_LOCAL_NUM && i < scope->num_locals; ++i) {
+            for (int i = 0; i < MAX_REGS_FOR_LOCAL_VARS && i < scope->num_locals; ++i) {
                 ASM_MOV_REG_LOCAL(emit->as, reg_local_table[i], LOCAL_IDX_LOCAL_VAR(emit, i));
             }
         }
@@ -628,55 +631,47 @@ STATIC void emit_native_start_pass(emit_t *emit, pass_kind_t pass, scope_t *scop
                 emit->local_vtype[id->local_num] = VTYPE_PYOBJ;
             }
         }
-
-        if (pass == MP_PASS_EMIT) {
-            // write argument names as qstr objects
-            // see comment in corresponding part of emitbc.c about the logic here
-            for (int i = 0; i < scope->num_pos_args + scope->num_kwonly_args; i++) {
-                qstr qst = MP_QSTR__star_;
-                for (int j = 0; j < scope->id_info_len; ++j) {
-                    id_info_t *id = &scope->id_info[j];
-                    if ((id->flags & ID_FLAG_IS_PARAM) && id->local_num == i) {
-                        qst = id->qst;
-                        break;
-                    }
-                }
-                emit->const_table[i] = (mp_uint_t)MP_OBJ_NEW_QSTR(qst);
-            }
-        }
     }
-
 }
 
 static inline void emit_native_write_code_info_byte(emit_t *emit, byte val) {
     mp_asm_base_data(&emit->as->base, 1, val);
 }
 
-STATIC void emit_native_end_pass(emit_t *emit) {
+static inline void emit_native_write_code_info_qstr(emit_t *emit, qstr qst) {
+    mp_encode_uint(&emit->as->base, mp_asm_base_get_cur_to_write_bytes, mp_emit_common_use_qstr(emit->emit_common, qst));
+}
+
+STATIC bool emit_native_end_pass(emit_t *emit) {
     emit_native_global_exc_exit(emit);
 
     if (!emit->do_viper_types) {
         emit->prelude_offset = mp_asm_base_get_code_pos(&emit->as->base);
+        emit->prelude_ptr_index = emit->emit_common->ct_cur_child;
 
         size_t n_state = emit->n_state;
         size_t n_exc_stack = 0; // exc-stack not needed for native code
         MP_BC_PRELUDE_SIG_ENCODE(n_state, n_exc_stack, emit->scope, emit_native_write_code_info_byte, emit);
 
-        #if MICROPY_PERSISTENT_CODE
-        size_t n_info = 4;
-        #else
-        size_t n_info = 1;
-        #endif
-        MP_BC_PRELUDE_SIZE_ENCODE(n_info, emit->n_cell, emit_native_write_code_info_byte, emit);
+        size_t n_info = emit->n_info;
+        size_t n_cell = emit->n_cell;
+        MP_BC_PRELUDE_SIZE_ENCODE(n_info, n_cell, emit_native_write_code_info_byte, emit);
 
-        #if MICROPY_PERSISTENT_CODE
-        mp_asm_base_data(&emit->as->base, 1, emit->scope->simple_name);
-        mp_asm_base_data(&emit->as->base, 1, emit->scope->simple_name >> 8);
-        mp_asm_base_data(&emit->as->base, 1, emit->scope->source_file);
-        mp_asm_base_data(&emit->as->base, 1, emit->scope->source_file >> 8);
-        #else
-        mp_asm_base_data(&emit->as->base, 1, 1);
-        #endif
+        // bytecode prelude: source info (function and argument qstrs)
+        size_t info_start = mp_asm_base_get_code_pos(&emit->as->base);
+        emit_native_write_code_info_qstr(emit, emit->scope->simple_name);
+        for (int i = 0; i < emit->scope->num_pos_args + emit->scope->num_kwonly_args; i++) {
+            qstr qst = MP_QSTR__star_;
+            for (int j = 0; j < emit->scope->id_info_len; ++j) {
+                id_info_t *id = &emit->scope->id_info[j];
+                if ((id->flags & ID_FLAG_IS_PARAM) && id->local_num == i) {
+                    qst = id->qst;
+                    break;
+                }
+            }
+            emit_native_write_code_info_qstr(emit, qst);
+        }
+        emit->n_info = mp_asm_base_get_code_pos(&emit->as->base) - info_start;
 
         // bytecode prelude: initialise closed over variables
         size_t cell_start = mp_asm_base_get_code_pos(&emit->as->base);
@@ -689,15 +684,6 @@ STATIC void emit_native_end_pass(emit_t *emit) {
         }
         emit->n_cell = mp_asm_base_get_code_pos(&emit->as->base) - cell_start;
 
-        #if N_PRELUDE_AS_BYTES_OBJ
-        // Prelude bytes object is after qstr arg names and mp_fun_table
-        size_t table_off = emit->scope->num_pos_args + emit->scope->num_kwonly_args + 1;
-        if (emit->pass == MP_PASS_EMIT) {
-            void *buf = emit->as->base.code_base + emit->prelude_offset;
-            size_t n = emit->as->base.code_offset - emit->prelude_offset;
-            emit->const_table[table_off] = (uintptr_t)mp_obj_new_bytes(buf, n);
-        }
-        #endif
     }
 
     ASM_END_PASS(emit->as);
@@ -706,46 +692,45 @@ STATIC void emit_native_end_pass(emit_t *emit) {
     assert(emit->stack_size == 0);
     assert(emit->exc_stack_size == 0);
 
-    // Deal with const table accounting
-    assert(emit->pass <= MP_PASS_STACK_SIZE || (emit->const_table_num_obj == emit->const_table_cur_obj));
-    emit->const_table_num_obj = emit->const_table_cur_obj;
-    if (emit->pass == MP_PASS_CODE_SIZE) {
-        size_t const_table_alloc = 1 + emit->const_table_num_obj + emit->const_table_cur_raw_code;
-        size_t nqstr = 0;
-        if (!emit->do_viper_types) {
-            // Add room for qstr names of arguments
-            nqstr = emit->scope->num_pos_args + emit->scope->num_kwonly_args;
-            const_table_alloc += nqstr;
-        }
-        emit->const_table = m_new(mp_uint_t, const_table_alloc);
-        #if !MICROPY_DYNAMIC_COMPILER
-        // Store mp_fun_table pointer just after qstrs
-        // (but in dynamic-compiler mode eliminate dependency on mp_fun_table)
-        emit->const_table[nqstr] = (mp_uint_t)(uintptr_t)&mp_fun_table;
-        #endif
-
-        #if MICROPY_PERSISTENT_CODE_SAVE
-        size_t qstr_link_alloc = emit->qstr_link_cur;
-        if (qstr_link_alloc > 0) {
-            emit->qstr_link = m_new(mp_qstr_link_entry_t, qstr_link_alloc);
-        }
-        #endif
-    }
-
     if (emit->pass == MP_PASS_EMIT) {
         void *f = mp_asm_base_get_code(&emit->as->base);
         mp_uint_t f_len = mp_asm_base_get_code_size(&emit->as->base);
 
+        mp_raw_code_t **children = emit->emit_common->children;
+        if (!emit->do_viper_types) {
+            #if MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE
+            // Executable code cannot be accessed byte-wise on this architecture, so copy
+            // the prelude to a separate memory region that is byte-wise readable.
+            void *buf = emit->as->base.code_base + emit->prelude_offset;
+            size_t n = emit->as->base.code_offset - emit->prelude_offset;
+            const uint8_t *prelude_ptr = memcpy(m_new(uint8_t, n), buf, n);
+            #else
+            // Point to the prelude directly, at the end of the machine code data.
+            const uint8_t *prelude_ptr = (const uint8_t *)f + emit->prelude_offset;
+            #endif
+
+            // Store the pointer to the prelude using the child_table.
+            assert(emit->prelude_ptr_index == emit->emit_common->ct_cur_child);
+            if (emit->prelude_ptr_index == 0) {
+                children = (void *)prelude_ptr;
+            } else {
+                children = m_renew(mp_raw_code_t *, children, emit->prelude_ptr_index, emit->prelude_ptr_index + 1);
+                children[emit->prelude_ptr_index] = (void *)prelude_ptr;
+            }
+        }
+
         mp_emit_glue_assign_native(emit->scope->raw_code,
             emit->do_viper_types ? MP_CODE_NATIVE_VIPER : MP_CODE_NATIVE_PY,
-            f, f_len, emit->const_table,
+            f, f_len,
+            children,
             #if MICROPY_PERSISTENT_CODE_SAVE
+            emit->emit_common->ct_cur_child,
             emit->prelude_offset,
-            emit->const_table_cur_obj, emit->const_table_cur_raw_code,
-            emit->qstr_link_cur, emit->qstr_link,
             #endif
-            emit->scope->num_pos_args, emit->scope->scope_flags, 0);
+            emit->scope->scope_flags, 0, 0);
     }
+
+    return true;
 }
 
 STATIC bool emit_native_last_emit_was_return_value(emit_t *emit) {
@@ -874,7 +859,7 @@ STATIC vtype_kind_t load_reg_stack_imm(emit_t *emit, int reg_dest, const stack_i
     }
 }
 
-// Copies all unsettled registers and immediate that are Python values into the
+// Copies all unsettled registers and immediates that are Python values into the
 // concrete Python stack.  This ensures the concrete Python stack holds valid
 // values for the current stack_size.
 // This function may clobber REG_TEMP1.
@@ -1070,7 +1055,7 @@ STATIC void emit_get_stack_pointer_to_reg_for_pop(emit_t *emit, mp_uint_t reg_de
         }
     }
 
-    // Adjust the stack for a pop of n_pop items, and load the stack pointer into reg_dest.
+    // Adjust the stack for a pop of n_pop items, and load the stack pointer into reg_dest.
     adjust_stack(emit, -n_pop);
     emit_native_mov_reg_state_addr(emit, reg_dest, emit->stack_start + emit->stack_size);
 }
@@ -1137,29 +1122,20 @@ STATIC exc_stack_entry_t *emit_native_pop_exc_stack(emit_t *emit) {
     return e;
 }
 
-STATIC void emit_load_reg_with_ptr(emit_t *emit, int reg, mp_uint_t ptr, size_t table_off) {
-    if (!emit->do_viper_types) {
-        // Skip qstr names of arguments
-        table_off += emit->scope->num_pos_args + emit->scope->num_kwonly_args;
-    }
-    if (emit->pass == MP_PASS_EMIT) {
-        emit->const_table[table_off] = ptr;
-    }
+STATIC void emit_load_reg_with_object(emit_t *emit, int reg, mp_obj_t obj) {
+    emit->scope->scope_flags |= MP_SCOPE_FLAG_HASCONSTS;
+    size_t table_off = mp_emit_common_use_const_obj(emit->emit_common, obj);
     emit_native_mov_reg_state(emit, REG_TEMP0, LOCAL_IDX_FUN_OBJ(emit));
-    ASM_LOAD_REG_REG_OFFSET(emit->as, REG_TEMP0, REG_TEMP0, OFFSETOF_OBJ_FUN_BC_CONST_TABLE);
+    ASM_LOAD_REG_REG_OFFSET(emit->as, REG_TEMP0, REG_TEMP0, OFFSETOF_OBJ_FUN_BC_CONTEXT);
+    ASM_LOAD_REG_REG_OFFSET(emit->as, REG_TEMP0, REG_TEMP0, OFFSETOF_MODULE_CONTEXT_OBJ_TABLE);
     ASM_LOAD_REG_REG_OFFSET(emit->as, reg, REG_TEMP0, table_off);
 }
 
-STATIC void emit_load_reg_with_object(emit_t *emit, int reg, mp_obj_t obj) {
-    // First entry is for mp_fun_table
-    size_t table_off = 1 + emit->const_table_cur_obj++;
-    emit_load_reg_with_ptr(emit, reg, (mp_uint_t)obj, table_off);
-}
-
-STATIC void emit_load_reg_with_raw_code(emit_t *emit, int reg, mp_raw_code_t *rc) {
-    // First entry is for mp_fun_table, then constant objects
-    size_t table_off = 1 + emit->const_table_num_obj + emit->const_table_cur_raw_code++;
-    emit_load_reg_with_ptr(emit, reg, (mp_uint_t)rc, table_off);
+STATIC void emit_load_reg_with_child(emit_t *emit, int reg, mp_raw_code_t *rc) {
+    size_t table_off = mp_emit_common_alloc_const_child(emit->emit_common, rc);
+    emit_native_mov_reg_state(emit, REG_TEMP0, LOCAL_IDX_FUN_OBJ(emit));
+    ASM_LOAD_REG_REG_OFFSET(emit->as, REG_TEMP0, REG_TEMP0, OFFSETOF_OBJ_FUN_BC_CHILD_TABLE);
+    ASM_LOAD_REG_REG_OFFSET(emit->as, reg, REG_TEMP0, table_off);
 }
 
 STATIC void emit_native_label_assign(emit_t *emit, mp_uint_t l) {
@@ -1203,7 +1179,8 @@ STATIC void emit_native_global_exc_entry(emit_t *emit) {
         if (!(emit->scope->scope_flags & MP_SCOPE_FLAG_GENERATOR)) {
             // Set new globals
             emit_native_mov_reg_state(emit, REG_ARG_1, LOCAL_IDX_FUN_OBJ(emit));
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_ARG_1, REG_ARG_1, OFFSETOF_OBJ_FUN_BC_GLOBALS);
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_ARG_1, REG_ARG_1, OFFSETOF_OBJ_FUN_BC_CONTEXT);
+            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_ARG_1, REG_ARG_1, OFFSETOF_MODULE_CONTEXT_GLOBALS);
             emit_call(emit, MP_F_NATIVE_SWAP_GLOBALS);
 
             // Save old globals (or NULL if globals didn't change)
@@ -1234,14 +1211,12 @@ STATIC void emit_native_global_exc_entry(emit_t *emit) {
 
             // Wrap everything in an nlr context
             emit_native_label_assign(emit, nlr_label);
-            ASM_MOV_REG_LOCAL(emit->as, REG_LOCAL_2, LOCAL_IDX_EXC_HANDLER_UNWIND(emit));
             ASM_MOV_REG_LOCAL_ADDR(emit->as, REG_ARG_1, 0);
             emit_call(emit, MP_F_NLR_PUSH);
             #if N_NLR_SETJMP
             ASM_MOV_REG_LOCAL_ADDR(emit->as, REG_ARG_1, 2);
             emit_call(emit, MP_F_SETJMP);
             #endif
-            ASM_MOV_LOCAL_REG(emit->as, LOCAL_IDX_EXC_HANDLER_UNWIND(emit), REG_LOCAL_2);
             ASM_JUMP_IF_REG_NONZERO(emit->as, REG_RET, global_except_label, true);
 
             // Clear PC of current code block, and jump there to resume execution
@@ -1251,12 +1226,6 @@ STATIC void emit_native_global_exc_entry(emit_t *emit) {
 
             // Global exception handler: check for valid exception handler
             emit_native_label_assign(emit, global_except_label);
-            #if N_NLR_SETJMP
-            // Reload REG_FUN_TABLE, since it may be clobbered by longjmp
-            emit_native_mov_reg_state(emit, REG_LOCAL_1, LOCAL_IDX_FUN_OBJ(emit));
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_LOCAL_1, REG_LOCAL_1, offsetof(mp_obj_fun_bc_t, const_table) / sizeof(uintptr_t));
-            ASM_LOAD_REG_REG_OFFSET(emit->as, REG_FUN_TABLE, REG_LOCAL_1, emit->scope->num_pos_args + emit->scope->num_kwonly_args);
-            #endif
             ASM_MOV_REG_LOCAL(emit->as, REG_LOCAL_1, LOCAL_IDX_EXC_HANDLER_PC(emit));
             ASM_JUMP_IF_REG_NONZERO(emit->as, REG_LOCAL_1, nlr_label, false);
         }
@@ -1385,11 +1354,7 @@ STATIC void emit_native_import(emit_t *emit, qstr qst, int kind) {
 STATIC void emit_native_load_const_tok(emit_t *emit, mp_token_kind_t tok) {
     DEBUG_printf("load_const_tok(tok=%u)\n", tok);
     if (tok == MP_TOKEN_ELLIPSIS) {
-        #if MICROPY_PERSISTENT_CODE_SAVE
         emit_native_load_const_obj(emit, MP_OBJ_FROM_PTR(&mp_const_ellipsis_obj));
-        #else
-        emit_post_push_imm(emit, VTYPE_PYOBJ, (mp_uint_t)MP_OBJ_FROM_PTR(&mp_const_ellipsis_obj));
-        #endif
     } else {
         emit_native_pre(emit);
         if (tok == MP_TOKEN_KW_NONE) {
@@ -1424,7 +1389,6 @@ STATIC void emit_native_load_const_str(emit_t *emit, qstr qst) {
 }
 
 STATIC void emit_native_load_const_obj(emit_t *emit, mp_obj_t obj) {
-    emit->scope->scope_flags |= MP_SCOPE_FLAG_HASCONSTS;
     emit_native_pre(emit);
     need_reg_single(emit, REG_RET, 0);
     emit_load_reg_with_object(emit, REG_RET, obj);
@@ -1443,7 +1407,7 @@ STATIC void emit_native_load_fast(emit_t *emit, qstr qst, mp_uint_t local_num) {
         EMIT_NATIVE_VIPER_TYPE_ERROR(emit, MP_ERROR_TEXT("local '%q' used before type known"), qst);
     }
     emit_native_pre(emit);
-    if (local_num < REG_LOCAL_NUM && CAN_USE_REGS_FOR_LOCALS(emit)) {
+    if (local_num < MAX_REGS_FOR_LOCAL_VARS && CAN_USE_REGS_FOR_LOCALS(emit)) {
         emit_post_push_reg(emit, vtype, reg_local_table[local_num]);
     } else {
         need_reg_single(emit, REG_TEMP0, 0);
@@ -1560,6 +1524,7 @@ STATIC void emit_native_load_subscr(emit_t *emit) {
             int reg_base = REG_ARG_1;
             int reg_index = REG_ARG_2;
             emit_pre_pop_reg_flexible(emit, &vtype_base, &reg_base, reg_index, reg_index);
+            need_reg_single(emit, REG_RET, 0);
             switch (vtype_base) {
                 case VTYPE_PTR8: {
                     // pointer to 8-bit memory
@@ -1623,6 +1588,7 @@ STATIC void emit_native_load_subscr(emit_t *emit) {
             int reg_index = REG_ARG_2;
             emit_pre_pop_reg_flexible(emit, &vtype_index, &reg_index, REG_ARG_1, REG_ARG_1);
             emit_pre_pop_reg(emit, &vtype_base, REG_ARG_1);
+            need_reg_single(emit, REG_RET, 0);
             if (vtype_index != VTYPE_INT && vtype_index != VTYPE_UINT) {
                 EMIT_NATIVE_VIPER_TYPE_ERROR(emit,
                     MP_ERROR_TEXT("can't load with '%q' index"), vtype_to_qstr(vtype_index));
@@ -1662,7 +1628,7 @@ STATIC void emit_native_load_subscr(emit_t *emit) {
 
 STATIC void emit_native_store_fast(emit_t *emit, qstr qst, mp_uint_t local_num) {
     vtype_kind_t vtype;
-    if (local_num < REG_LOCAL_NUM && CAN_USE_REGS_FOR_LOCALS(emit)) {
+    if (local_num < MAX_REGS_FOR_LOCAL_VARS && CAN_USE_REGS_FOR_LOCALS(emit)) {
         emit_pre_pop_reg(emit, &vtype, reg_local_table[local_num]);
     } else {
         emit_pre_pop_reg(emit, &vtype, REG_TEMP0);
@@ -2455,48 +2421,48 @@ STATIC void emit_native_binary_op(emit_t *emit, mp_binary_op_t op) {
             asm_x86_setcc_r8(emit->as, ops[op_idx], REG_RET);
             #elif N_THUMB
             asm_thumb_cmp_rlo_rlo(emit->as, REG_ARG_2, reg_rhs);
-            #if MICROPY_EMIT_THUMB_ARMV7M
-            static uint16_t ops[6 + 6] = {
-                // unsigned
-                ASM_THUMB_OP_ITE_CC,
-                ASM_THUMB_OP_ITE_HI,
-                ASM_THUMB_OP_ITE_EQ,
-                ASM_THUMB_OP_ITE_LS,
-                ASM_THUMB_OP_ITE_CS,
-                ASM_THUMB_OP_ITE_NE,
-                // signed
-                ASM_THUMB_OP_ITE_LT,
-                ASM_THUMB_OP_ITE_GT,
-                ASM_THUMB_OP_ITE_EQ,
-                ASM_THUMB_OP_ITE_LE,
-                ASM_THUMB_OP_ITE_GE,
-                ASM_THUMB_OP_ITE_NE,
-            };
-            asm_thumb_op16(emit->as, ops[op_idx]);
-            asm_thumb_mov_rlo_i8(emit->as, REG_RET, 1);
-            asm_thumb_mov_rlo_i8(emit->as, REG_RET, 0);
-            #else
-            static uint16_t ops[6 + 6] = {
-                // unsigned
-                ASM_THUMB_CC_CC,
-                ASM_THUMB_CC_HI,
-                ASM_THUMB_CC_EQ,
-                ASM_THUMB_CC_LS,
-                ASM_THUMB_CC_CS,
-                ASM_THUMB_CC_NE,
-                // signed
-                ASM_THUMB_CC_LT,
-                ASM_THUMB_CC_GT,
-                ASM_THUMB_CC_EQ,
-                ASM_THUMB_CC_LE,
-                ASM_THUMB_CC_GE,
-                ASM_THUMB_CC_NE,
-            };
-            asm_thumb_bcc_rel9(emit->as, ops[op_idx], 6);
-            asm_thumb_mov_rlo_i8(emit->as, REG_RET, 0);
-            asm_thumb_b_rel12(emit->as, 4);
-            asm_thumb_mov_rlo_i8(emit->as, REG_RET, 1);
-            #endif
+            if (asm_thumb_allow_armv7m(emit->as)) {
+                static uint16_t ops[6 + 6] = {
+                    // unsigned
+                    ASM_THUMB_OP_ITE_CC,
+                    ASM_THUMB_OP_ITE_HI,
+                    ASM_THUMB_OP_ITE_EQ,
+                    ASM_THUMB_OP_ITE_LS,
+                    ASM_THUMB_OP_ITE_CS,
+                    ASM_THUMB_OP_ITE_NE,
+                    // signed
+                    ASM_THUMB_OP_ITE_LT,
+                    ASM_THUMB_OP_ITE_GT,
+                    ASM_THUMB_OP_ITE_EQ,
+                    ASM_THUMB_OP_ITE_LE,
+                    ASM_THUMB_OP_ITE_GE,
+                    ASM_THUMB_OP_ITE_NE,
+                };
+                asm_thumb_op16(emit->as, ops[op_idx]);
+                asm_thumb_mov_rlo_i8(emit->as, REG_RET, 1);
+                asm_thumb_mov_rlo_i8(emit->as, REG_RET, 0);
+            } else {
+                static uint16_t ops[6 + 6] = {
+                    // unsigned
+                    ASM_THUMB_CC_CC,
+                    ASM_THUMB_CC_HI,
+                    ASM_THUMB_CC_EQ,
+                    ASM_THUMB_CC_LS,
+                    ASM_THUMB_CC_CS,
+                    ASM_THUMB_CC_NE,
+                    // signed
+                    ASM_THUMB_CC_LT,
+                    ASM_THUMB_CC_GT,
+                    ASM_THUMB_CC_EQ,
+                    ASM_THUMB_CC_LE,
+                    ASM_THUMB_CC_GE,
+                    ASM_THUMB_CC_NE,
+                };
+                asm_thumb_bcc_rel9(emit->as, ops[op_idx], 6);
+                asm_thumb_mov_rlo_i8(emit->as, REG_RET, 0);
+                asm_thumb_b_rel12(emit->as, 4);
+                asm_thumb_mov_rlo_i8(emit->as, REG_RET, 1);
+            }
             #elif N_ARM
             asm_arm_cmp_reg_reg(emit->as, REG_ARG_2, reg_rhs);
             static uint ccs[6 + 6] = {
@@ -2680,33 +2646,46 @@ STATIC void emit_native_unpack_ex(emit_t *emit, mp_uint_t n_left, mp_uint_t n_ri
 STATIC void emit_native_make_function(emit_t *emit, scope_t *scope, mp_uint_t n_pos_defaults, mp_uint_t n_kw_defaults) {
     // call runtime, with type info for args, or don't support dict/default params, or only support Python objects for them
     emit_native_pre(emit);
+    emit_native_mov_reg_state(emit, REG_ARG_2, LOCAL_IDX_FUN_OBJ(emit));
+    ASM_LOAD_REG_REG_OFFSET(emit->as, REG_ARG_2, REG_ARG_2, OFFSETOF_OBJ_FUN_BC_CONTEXT);
     if (n_pos_defaults == 0 && n_kw_defaults == 0) {
         need_reg_all(emit);
-        ASM_MOV_REG_IMM(emit->as, REG_ARG_2, (mp_uint_t)MP_OBJ_NULL);
-        ASM_MOV_REG_IMM(emit->as, REG_ARG_3, (mp_uint_t)MP_OBJ_NULL);
+        ASM_MOV_REG_IMM(emit->as, REG_ARG_3, 0);
     } else {
-        vtype_kind_t vtype_def_tuple, vtype_def_dict;
-        emit_pre_pop_reg_reg(emit, &vtype_def_dict, REG_ARG_3, &vtype_def_tuple, REG_ARG_2);
-        assert(vtype_def_tuple == VTYPE_PYOBJ);
-        assert(vtype_def_dict == VTYPE_PYOBJ);
+        emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, 2);
         need_reg_all(emit);
     }
-    emit_load_reg_with_raw_code(emit, REG_ARG_1, scope->raw_code);
+    emit_load_reg_with_child(emit, REG_ARG_1, scope->raw_code);
     ASM_CALL_IND(emit->as, MP_F_MAKE_FUNCTION_FROM_RAW_CODE);
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
 }
 
 STATIC void emit_native_make_closure(emit_t *emit, scope_t *scope, mp_uint_t n_closed_over, mp_uint_t n_pos_defaults, mp_uint_t n_kw_defaults) {
+    // make function
     emit_native_pre(emit);
+    emit_native_mov_reg_state(emit, REG_ARG_2, LOCAL_IDX_FUN_OBJ(emit));
+    ASM_LOAD_REG_REG_OFFSET(emit->as, REG_ARG_2, REG_ARG_2, OFFSETOF_OBJ_FUN_BC_CONTEXT);
     if (n_pos_defaults == 0 && n_kw_defaults == 0) {
-        emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_closed_over);
-        ASM_MOV_REG_IMM(emit->as, REG_ARG_2, n_closed_over);
+        need_reg_all(emit);
+        ASM_MOV_REG_IMM(emit->as, REG_ARG_3, 0);
     } else {
-        emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_closed_over + 2);
-        ASM_MOV_REG_IMM(emit->as, REG_ARG_2, 0x100 | n_closed_over);
+        emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, 2 + n_closed_over);
+        adjust_stack(emit, 2 + n_closed_over);
+        need_reg_all(emit);
     }
-    emit_load_reg_with_raw_code(emit, REG_ARG_1, scope->raw_code);
-    ASM_CALL_IND(emit->as, MP_F_MAKE_CLOSURE_FROM_RAW_CODE);
+    emit_load_reg_with_child(emit, REG_ARG_1, scope->raw_code);
+    ASM_CALL_IND(emit->as, MP_F_MAKE_FUNCTION_FROM_RAW_CODE);
+
+    // make closure
+    #if REG_ARG_1 != REG_RET
+    ASM_MOV_REG_REG(emit->as, REG_ARG_1, REG_RET);
+    #endif
+    ASM_MOV_REG_IMM(emit->as, REG_ARG_2, n_closed_over);
+    emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_closed_over);
+    if (n_pos_defaults != 0 || n_kw_defaults != 0) {
+        adjust_stack(emit, -2);
+    }
+    ASM_CALL_IND(emit->as, MP_F_NEW_CLOSURE);
     emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
 }
 
@@ -2751,7 +2730,7 @@ STATIC void emit_native_call_function(emit_t *emit, mp_uint_t n_positional, mp_u
     } else {
         assert(vtype_fun == VTYPE_PYOBJ);
         if (star_flags) {
-            emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_positional + 2 * n_keyword + 3); // pointer to args
+            emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_positional + 2 * n_keyword + 2); // pointer to args
             emit_call_with_2_imm_args(emit, MP_F_CALL_METHOD_N_KW_VAR, 0, REG_ARG_1, n_positional | (n_keyword << 8), REG_ARG_2);
             emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
         } else {
@@ -2767,7 +2746,7 @@ STATIC void emit_native_call_function(emit_t *emit, mp_uint_t n_positional, mp_u
 
 STATIC void emit_native_call_method(emit_t *emit, mp_uint_t n_positional, mp_uint_t n_keyword, mp_uint_t star_flags) {
     if (star_flags) {
-        emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_positional + 2 * n_keyword + 4); // pointer to args
+        emit_get_stack_pointer_to_reg_for_pop(emit, REG_ARG_3, n_positional + 2 * n_keyword + 3); // pointer to args
         emit_call_with_2_imm_args(emit, MP_F_CALL_METHOD_N_KW_VAR, 1, REG_ARG_1, n_positional | (n_keyword << 8), REG_ARG_2);
         emit_post_push_reg(emit, VTYPE_PYOBJ, REG_RET);
     } else {
diff --git a/python/src/py/emitnthumb.c b/python/src/py/emitnthumb.c
index 1c33e7a68..844a73ffa 100644
--- a/python/src/py/emitnthumb.c
+++ b/python/src/py/emitnthumb.c
@@ -10,8 +10,6 @@
 
 // Word indices of REG_LOCAL_x in nlr_buf_t
 #define NLR_BUF_IDX_LOCAL_1 (3) // r4
-#define NLR_BUF_IDX_LOCAL_2 (4) // r5
-#define NLR_BUF_IDX_LOCAL_3 (5) // r6
 
 #define N_THUMB (1)
 #define EXPORT_FUN(name) emit_native_thumb_##name
diff --git a/python/src/py/emitnx64.c b/python/src/py/emitnx64.c
index 4abb3ecad..1b32286d2 100644
--- a/python/src/py/emitnx64.c
+++ b/python/src/py/emitnx64.c
@@ -10,8 +10,6 @@
 
 // Word indices of REG_LOCAL_x in nlr_buf_t
 #define NLR_BUF_IDX_LOCAL_1 (5) // rbx
-#define NLR_BUF_IDX_LOCAL_2 (6) // r12
-#define NLR_BUF_IDX_LOCAL_3 (7) // r13
 
 #define N_X64 (1)
 #define EXPORT_FUN(name) emit_native_x64_##name
diff --git a/python/src/py/emitnx86.c b/python/src/py/emitnx86.c
index f0553f068..a9050c65d 100644
--- a/python/src/py/emitnx86.c
+++ b/python/src/py/emitnx86.c
@@ -11,8 +11,6 @@
 
 // Word indices of REG_LOCAL_x in nlr_buf_t
 #define NLR_BUF_IDX_LOCAL_1 (5) // ebx
-#define NLR_BUF_IDX_LOCAL_2 (7) // esi
-#define NLR_BUF_IDX_LOCAL_3 (6) // edi
 
 // x86 needs a table to know how many args a given function has
 STATIC byte mp_f_n_args[MP_F_NUMBER_OF] = {
@@ -56,7 +54,7 @@ STATIC byte mp_f_n_args[MP_F_NUMBER_OF] = {
     [MP_F_UNPACK_EX] = 3,
     [MP_F_DELETE_NAME] = 1,
     [MP_F_DELETE_GLOBAL] = 1,
-    [MP_F_MAKE_CLOSURE_FROM_RAW_CODE] = 3,
+    [MP_F_NEW_CLOSURE] = 3,
     [MP_F_ARG_CHECK_NUM_SIG] = 3,
     [MP_F_SETUP_CODE_STATE] = 4,
     [MP_F_SMALL_INT_FLOOR_DIVIDE] = 2,
diff --git a/python/src/py/emitnxtensa.c b/python/src/py/emitnxtensa.c
index 34089e90d..c89b02902 100644
--- a/python/src/py/emitnxtensa.c
+++ b/python/src/py/emitnxtensa.c
@@ -10,8 +10,6 @@
 
 // Word indices of REG_LOCAL_x in nlr_buf_t
 #define NLR_BUF_IDX_LOCAL_1 (8) // a12
-#define NLR_BUF_IDX_LOCAL_2 (9) // a13
-#define NLR_BUF_IDX_LOCAL_3 (10) // a14
 
 #define N_XTENSA (1)
 #define EXPORT_FUN(name) emit_native_xtensa_##name
diff --git a/python/src/py/emitnxtensawin.c b/python/src/py/emitnxtensawin.c
index 38d5db13e..f6eeff845 100644
--- a/python/src/py/emitnxtensawin.c
+++ b/python/src/py/emitnxtensawin.c
@@ -11,11 +11,8 @@
 
 // Word indices of REG_LOCAL_x in nlr_buf_t
 #define NLR_BUF_IDX_LOCAL_1 (2 + 4) // a4
-#define NLR_BUF_IDX_LOCAL_2 (2 + 5) // a5
-#define NLR_BUF_IDX_LOCAL_3 (2 + 6) // a6
 
 #define N_NLR_SETJMP (1)
-#define N_PRELUDE_AS_BYTES_OBJ (1)
 #define N_XTENSAWIN (1)
 #define EXPORT_FUN(name) emit_native_xtensawin_##name
 #include "py/emitnative.c"
diff --git a/python/src/py/formatfloat.c b/python/src/py/formatfloat.c
index 6f4eee822..9d28b2317 100644
--- a/python/src/py/formatfloat.c
+++ b/python/src/py/formatfloat.c
@@ -318,7 +318,7 @@ int mp_format_float(FPTYPE f, char *buf, size_t buf_size, char fmt, int prec, ch
 
     // We now have num.f as a floating point number between >= 1 and < 10
     // (or equal to zero), and e contains the absolute value of the power of
-    // 10 exponent. and (dec + 1) == the number of digits before the decimal.
+    // 10 exponent. and (dec + 1) == the number of digits before the decimal.
 
     // For e, prec is # digits after the decimal
     // For f, prec is # digits after the decimal
diff --git a/python/src/py/frozenmod.c b/python/src/py/frozenmod.c
index a250c0215..61c2f20aa 100644
--- a/python/src/py/frozenmod.c
+++ b/python/src/py/frozenmod.c
@@ -5,6 +5,7 @@
  *
  * Copyright (c) 2015 Paul Sokolovsky
  * Copyright (c) 2016 Damien P. George
+ * Copyright (c) 2021 Jim Mussared
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -31,6 +32,13 @@
 #include "py/lexer.h"
 #include "py/frozenmod.h"
 
+#if MICROPY_MODULE_FROZEN
+
+// Null-separated frozen file names. All string-type entries are listed first,
+// followed by mpy-type entries. Use mp_frozen_str_sizes to determine how
+// many string entries there are.
+extern const char mp_frozen_names[];
+
 #if MICROPY_MODULE_FROZEN_STR
 
 #ifndef MICROPY_MODULE_FROZEN_LEXER
@@ -39,118 +47,89 @@
 mp_lexer_t *MICROPY_MODULE_FROZEN_LEXER(qstr src_name, const char *str, mp_uint_t len, mp_uint_t free_len);
 #endif
 
-extern const char mp_frozen_str_names[];
+// Size in bytes of each string entry, followed by a zero (terminator).
 extern const uint32_t mp_frozen_str_sizes[];
+// Null-separated string content.
 extern const char mp_frozen_str_content[];
-
-// On input, *len contains size of name, on output - size of content
-const char *mp_find_frozen_str(const char *str, size_t *len) {
-    const char *name = mp_frozen_str_names;
-
-    size_t offset = 0;
-    for (int i = 0; *name != 0; i++) {
-        size_t l = strlen(name);
-        if (l == *len && !memcmp(str, name, l)) {
-            *len = mp_frozen_str_sizes[i];
-            return mp_frozen_str_content + offset;
-        }
-        name += l + 1;
-        offset += mp_frozen_str_sizes[i] + 1;
-    }
-    return NULL;
-}
-
-STATIC mp_lexer_t *mp_lexer_frozen_str(const char *str, size_t len) {
-    size_t name_len = len;
-    const char *content = mp_find_frozen_str(str, &len);
-
-    if (content == NULL) {
-        return NULL;
-    }
-
-    qstr source = qstr_from_strn(str, name_len);
-    mp_lexer_t *lex = MICROPY_MODULE_FROZEN_LEXER(source, content, len, 0);
-    return lex;
-}
-
-#endif
+#endif // MICROPY_MODULE_FROZEN_STR
 
 #if MICROPY_MODULE_FROZEN_MPY
 
 #include "py/emitglue.h"
 
-extern const char mp_frozen_mpy_names[];
-extern const mp_raw_code_t *const mp_frozen_mpy_content[];
+extern const mp_frozen_module_t *const mp_frozen_mpy_content[];
 
-STATIC const mp_raw_code_t *mp_find_frozen_mpy(const char *str, size_t len) {
-    const char *name = mp_frozen_mpy_names;
-    for (size_t i = 0; *name != 0; i++) {
-        size_t l = strlen(name);
-        if (l == len && !memcmp(str, name, l)) {
-            return mp_frozen_mpy_content[i];
-        }
-        name += l + 1;
-    }
-    return NULL;
-}
+#endif // MICROPY_MODULE_FROZEN_MPY
 
-#endif
-
-#if MICROPY_MODULE_FROZEN
-
-STATIC mp_import_stat_t mp_frozen_stat_helper(const char *name, const char *str) {
+// Search for "str" as a frozen entry, returning the stat result
+// (no-exist/file/dir), as well as the type (none/str/mpy) and data.
+// frozen_type can be NULL if its value isn't needed (and then data is assumed to be NULL).
+mp_import_stat_t mp_find_frozen_module(const char *str, int *frozen_type, void **data) {
     size_t len = strlen(str);
+    const char *name = mp_frozen_names;
+
+    if (frozen_type != NULL) {
+        *frozen_type = MP_FROZEN_NONE;
+    }
+
+    // Count the entries in mp_frozen_str_sizes to find how many str entries there are.
+    size_t num_str = 0;
+    #if MICROPY_MODULE_FROZEN_STR && MICROPY_MODULE_FROZEN_MPY
+    for (const uint32_t *s = mp_frozen_str_sizes; *s != 0; ++s) {
+        ++num_str;
+    }
+    #endif
+
+    for (size_t i = 0; *name != 0; i++) {
+        size_t entry_len = strlen(name);
+        if (entry_len >= len && memcmp(str, name, len) == 0) {
+            // Query is a prefix of the current entry.
+            if (entry_len == len) {
+                // Exact match --> file.
+
+                if (frozen_type != NULL) {
+                    #if MICROPY_MODULE_FROZEN_STR
+                    if (i < num_str) {
+                        *frozen_type = MP_FROZEN_STR;
+                        // Use the size table to figure out where this index starts.
+                        size_t offset = 0;
+                        for (size_t j = 0; j < i; ++j) {
+                            offset += mp_frozen_str_sizes[j] + 1;
+                        }
+                        size_t content_len = mp_frozen_str_sizes[i];
+                        const char *content = &mp_frozen_str_content[offset];
+
+                        // Note: str & len have been updated by find_frozen_entry to strip
+                        // the ".frozen/" prefix (to avoid this being a distinct qstr to
+                        // the original path QSTR in frozen_content.c).
+                        qstr source = qstr_from_strn(str, len);
+                        mp_lexer_t *lex = MICROPY_MODULE_FROZEN_LEXER(source, content, content_len, 0);
+                        *data = lex;
+                    }
+                    #endif
+
+                    #if MICROPY_MODULE_FROZEN_MPY
+                    if (i >= num_str) {
+                        *frozen_type = MP_FROZEN_MPY;
+                        // Load the corresponding mp_frozen_module_t entry, offsetting
+                        // the index by the number of string entries that precede it.
+                        *data = (void *)mp_frozen_mpy_content[i - num_str];
+                    }
+                    #endif
+                }
 
-    for (int i = 0; *name != 0; i++) {
-        size_t l = strlen(name);
-        if (l >= len && !memcmp(str, name, len)) {
-            if (name[len] == 0) {
                 return MP_IMPORT_STAT_FILE;
             } else if (name[len] == '/') {
+                // Matches up to directory separator, this is a valid
+                // directory path.
                 return MP_IMPORT_STAT_DIR;
             }
         }
-        name += l + 1;
+        // Skip null separator.
+        name += entry_len + 1;
     }
-    return MP_IMPORT_STAT_NO_EXIST;
-}
-
-mp_import_stat_t mp_frozen_stat(const char *str) {
-    mp_import_stat_t stat;
-
-    #if MICROPY_MODULE_FROZEN_STR
-    stat = mp_frozen_stat_helper(mp_frozen_str_names, str);
-    if (stat != MP_IMPORT_STAT_NO_EXIST) {
-        return stat;
-    }
-    #endif
-
-    #if MICROPY_MODULE_FROZEN_MPY
-    stat = mp_frozen_stat_helper(mp_frozen_mpy_names, str);
-    if (stat != MP_IMPORT_STAT_NO_EXIST) {
-        return stat;
-    }
-    #endif
 
     return MP_IMPORT_STAT_NO_EXIST;
 }
 
-int mp_find_frozen_module(const char *str, size_t len, void **data) {
-    #if MICROPY_MODULE_FROZEN_STR
-    mp_lexer_t *lex = mp_lexer_frozen_str(str, len);
-    if (lex != NULL) {
-        *data = lex;
-        return MP_FROZEN_STR;
-    }
-    #endif
-    #if MICROPY_MODULE_FROZEN_MPY
-    const mp_raw_code_t *rc = mp_find_frozen_mpy(str, len);
-    if (rc != NULL) {
-        *data = (void *)rc;
-        return MP_FROZEN_MPY;
-    }
-    #endif
-    return MP_FROZEN_NONE;
-}
-
-#endif
+#endif // MICROPY_MODULE_FROZEN
diff --git a/python/src/py/frozenmod.h b/python/src/py/frozenmod.h
index 8a477d028..cff6c8616 100644
--- a/python/src/py/frozenmod.h
+++ b/python/src/py/frozenmod.h
@@ -27,7 +27,7 @@
 #ifndef MICROPY_INCLUDED_PY_FROZENMOD_H
 #define MICROPY_INCLUDED_PY_FROZENMOD_H
 
-#include "py/lexer.h"
+#include "py/builtin.h"
 
 enum {
     MP_FROZEN_NONE,
@@ -35,8 +35,6 @@ enum {
     MP_FROZEN_MPY,
 };
 
-int mp_find_frozen_module(const char *str, size_t len, void **data);
-const char *mp_find_frozen_str(const char *str, size_t *len);
-mp_import_stat_t mp_frozen_stat(const char *str);
+mp_import_stat_t mp_find_frozen_module(const char *str, int *frozen_type, void **data);
 
 #endif // MICROPY_INCLUDED_PY_FROZENMOD_H
diff --git a/python/src/py/gc.c b/python/src/py/gc.c
index 1256d3524..0c1f3961d 100644
--- a/python/src/py/gc.c
+++ b/python/src/py/gc.c
@@ -213,6 +213,7 @@ STATIC void gc_mark_subtree(size_t block) {
     // Start with the block passed in the argument.
     size_t sp = 0;
     for (;;) {
+        MICROPY_GC_HOOK_LOOP
         // work out number of consecutive blocks in the chain starting with this one
         size_t n_blocks = 0;
         do {
@@ -222,6 +223,7 @@ STATIC void gc_mark_subtree(size_t block) {
         // check this block's children
         void **ptrs = (void **)PTR_FROM_BLOCK(block);
         for (size_t i = n_blocks * BYTES_PER_BLOCK / sizeof(void *); i > 0; i--, ptrs++) {
+            MICROPY_GC_HOOK_LOOP
             void *ptr = *ptrs;
             if (VERIFY_PTR(ptr)) {
                 // Mark and push this pointer
@@ -255,6 +257,7 @@ STATIC void gc_deal_with_stack_overflow(void) {
 
         // scan entire memory looking for blocks which have been marked but not their children
         for (size_t block = 0; block < MP_STATE_MEM(gc_alloc_table_byte_len) * BLOCKS_PER_ATB; block++) {
+            MICROPY_GC_HOOK_LOOP
             // trace (again) if mark bit set
             if (ATB_GET_KIND(block) == AT_MARK) {
                 gc_mark_subtree(block);
@@ -270,6 +273,7 @@ STATIC void gc_sweep(void) {
     // free unmarked heads and their tails
     int free_tail = 0;
     for (size_t block = 0; block < MP_STATE_MEM(gc_alloc_table_byte_len) * BLOCKS_PER_ATB; block++) {
+        MICROPY_GC_HOOK_LOOP
         switch (ATB_GET_KIND(block)) {
             case AT_HEAD:
                 #if MICROPY_ENABLE_FINALISER
@@ -354,6 +358,7 @@ static void *gc_get_ptr(void **ptrs, int i) {
 
 void gc_collect_root(void **ptrs, size_t len) {
     for (size_t i = 0; i < len; i++) {
+        MICROPY_GC_HOOK_LOOP
         void *ptr = gc_get_ptr(ptrs, i);
         if (VERIFY_PTR(ptr)) {
             size_t block = BLOCK_FROM_PTR(ptr);
@@ -512,7 +517,7 @@ found:
     // Set last free ATB index to block after last block we found, for start of
     // next scan.  To reduce fragmentation, we only do this if we were looking
     // for a single free block, which guarantees that there are no free blocks
-    // before this one.  Also, whenever we free or shrink a block we must check
+    // before this one.  Also, whenever we free or shrink a block we must check
     // if this index needs adjusting (see gc_realloc and gc_free).
     if (n_free == 1) {
         MP_STATE_MEM(gc_last_free_atb_index) = (i + 1) / BLOCKS_PER_ATB;
@@ -915,13 +920,13 @@ void gc_dump_alloc_table(void) {
                     // This code prints "Q" for qstr-pool data, and "q" for qstr-str
                     // data.  It can be useful to see how qstrs are being allocated,
                     // but is disabled by default because it is very slow.
-                    for (qstr_pool_t *pool = MP_STATE_VM(last_pool); c == 'h' && pool != NULL; pool = pool->prev) {
-                        if ((qstr_pool_t *)ptr == pool) {
+                    for (const qstr_pool_t *pool = MP_STATE_VM(last_pool); c == 'h' && pool != NULL; pool = pool->prev) {
+                        if ((const qstr_pool_t *)ptr == pool) {
                             c = 'Q';
                             break;
                         }
-                        for (const byte **q = pool->qstrs, **q_top = pool->qstrs + pool->len; q < q_top; q++) {
-                            if ((const byte *)ptr == *q) {
+                        for (const char *const *q = pool->qstrs, *const *q_top = pool->qstrs + pool->len; q < q_top; q++) {
+                            if ((const char *)ptr == *q) {
                                 c = 'q';
                                 break;
                             }
diff --git a/python/src/py/lexer.c b/python/src/py/lexer.c
index e1858d8ee..39e9662f6 100644
--- a/python/src/py/lexer.c
+++ b/python/src/py/lexer.c
@@ -363,9 +363,16 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
                     // (MicroPython limitation) note: this is completely unaware of
                     // Python syntax and will not handle any expression containing '}' or ':'.
                     // e.g. f'{"}"}' or f'{foo({})}'.
-                    while (!is_end(lex) && !is_char_or(lex, ':', '}')) {
+                    unsigned int nested_bracket_level = 0;
+                    while (!is_end(lex) && (nested_bracket_level != 0 || !is_char_or(lex, ':', '}'))) {
+                        unichar c = CUR_CHAR(lex);
+                        if (c == '[' || c == '{') {
+                            nested_bracket_level += 1;
+                        } else if (c == ']' || c == '}') {
+                            nested_bracket_level -= 1;
+                        }
                         // like the default case at the end of this function, stay 8-bit clean
-                        vstr_add_byte(&lex->fstring_args, CUR_CHAR(lex));
+                        vstr_add_byte(&lex->fstring_args, c);
                         next_char(lex);
                     }
                     if (lex->fstring_args.buf[lex->fstring_args.len - 1] == '=') {
@@ -466,25 +473,23 @@ STATIC void parse_string_literal(mp_lexer_t *lex, bool is_raw, bool is_fstring)
                     }
                 }
                 if (c != MP_LEXER_EOF) {
-                    if (MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) {
-                        if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
-                            vstr_add_char(&lex->vstr, c);
-                        } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
-                            vstr_add_byte(&lex->vstr, c);
-                        } else {
-                            // unicode character out of range
-                            // this raises a generic SyntaxError; could provide more info
-                            lex->tok_kind = MP_TOKEN_INVALID;
-                        }
-                    } else {
-                        // without unicode everything is just added as an 8-bit byte
-                        if (c < 0x100) {
-                            vstr_add_byte(&lex->vstr, c);
-                        } else {
-                            // 8-bit character out of range
-                            // this raises a generic SyntaxError; could provide more info
-                            lex->tok_kind = MP_TOKEN_INVALID;
-                        }
+                    #if MICROPY_PY_BUILTINS_STR_UNICODE
+                    if (c < 0x110000 && lex->tok_kind == MP_TOKEN_STRING) {
+                        // Valid unicode character in a str object.
+                        vstr_add_char(&lex->vstr, c);
+                    } else if (c < 0x100 && lex->tok_kind == MP_TOKEN_BYTES) {
+                        // Valid byte in a bytes object.
+                        vstr_add_byte(&lex->vstr, c);
+                    }
+                    #else
+                    if (c < 0x100) {
+                        // Without unicode everything is just added as an 8-bit byte.
+                        vstr_add_byte(&lex->vstr, c);
+                    }
+                    #endif
+                    else {
+                        // Character out of range; this raises a generic SyntaxError.
+                        lex->tok_kind = MP_TOKEN_INVALID;
                     }
                 }
             } else {
@@ -594,7 +599,7 @@ void mp_lexer_to_next(mp_lexer_t *lex) {
         // a string or bytes literal
 
         // Python requires adjacent string/bytes literals to be automatically
-        // concatenated.  We do it here in the tokenizer to make efficient use of RAM,
+        // concatenated.  We do it here in the tokeniser to make efficient use of RAM,
         // because then the lexer's vstr can be used to accumulate the string literal,
         // in contrast to creating a parse tree of strings and then joining them later
         // in the compiler.  It's also more compact in code size to do it here.
diff --git a/python/src/py/lexer.h b/python/src/py/lexer.h
index 4b0c097af..8295dec0f 100644
--- a/python/src/py/lexer.h
+++ b/python/src/py/lexer.h
@@ -32,7 +32,7 @@
 #include "py/qstr.h"
 #include "py/reader.h"
 
-/* lexer.h -- simple tokenizer for MicroPython
+/* lexer.h -- simple tokeniser for MicroPython
  *
  * Uses (byte) length instead of null termination.
  * Tokens are the same - UTF-8 with (byte) length.
@@ -189,24 +189,15 @@ typedef struct _mp_lexer_t {
 mp_lexer_t *mp_lexer_new(qstr src_name, mp_reader_t reader);
 mp_lexer_t *mp_lexer_new_from_str_len(qstr src_name, const char *str, size_t len, size_t free_len);
 
-void mp_lexer_free(mp_lexer_t *lex);
-void mp_lexer_to_next(mp_lexer_t *lex);
-
-/******************************************************************/
-// platform specific import function; must be implemented for a specific port
-// TODO tidy up, rename, or put elsewhere
-
-typedef enum {
-    MP_IMPORT_STAT_NO_EXIST,
-    MP_IMPORT_STAT_DIR,
-    MP_IMPORT_STAT_FILE,
-} mp_import_stat_t;
-
-mp_import_stat_t mp_import_stat(const char *path);
+// If MICROPY_READER_POSIX or MICROPY_READER_VFS aren't enabled then
+// this function must be implemented by the port.
 mp_lexer_t *mp_lexer_new_from_file(const char *filename);
 
 #if MICROPY_HELPER_LEXER_UNIX
 mp_lexer_t *mp_lexer_new_from_fd(qstr filename, int fd, bool close_fd);
 #endif
 
+void mp_lexer_free(mp_lexer_t *lex);
+void mp_lexer_to_next(mp_lexer_t *lex);
+
 #endif // MICROPY_INCLUDED_PY_LEXER_H
diff --git a/python/src/py/makecompresseddata.py b/python/src/py/makecompresseddata.py
index 1bce3e8e8..9603de871 100644
--- a/python/src/py/makecompresseddata.py
+++ b/python/src/py/makecompresseddata.py
@@ -24,7 +24,7 @@ def check_non_ascii(msg):
 
 
 # Replace <char><space> with <char | 0x80>.
-# Trivial scheme to demo/test.
+# Trivial scheme to demo/test.
 def space_compression(error_strings):
     for line in error_strings:
         check_non_ascii(line)
diff --git a/python/src/py/makemoduledefs.py b/python/src/py/makemoduledefs.py
index 612f3d29a..9061cd890 100644
--- a/python/src/py/makemoduledefs.py
+++ b/python/src/py/makemoduledefs.py
@@ -1,88 +1,69 @@
-#!/usr/bin/env python
-
-# This pre-processor parses provided objects' c files for
-# MP_REGISTER_MODULE(module_name, obj_module, enabled_define)
-# These are used to generate a header with the required entries for
-# "mp_rom_map_elem_t mp_builtin_module_table[]" in py/objmodule.c
+"""
+This pre-processor parses a single file containing a list of
+MP_REGISTER_MODULE(module_name, obj_module)
+These are used to generate a header with the required entries for
+"mp_rom_map_elem_t mp_builtin_module_table[]" in py/objmodule.c
+"""
 
 from __future__ import print_function
 
+import sys
 import re
 import io
-import os
 import argparse
 
 
-pattern = re.compile(r"[\n;]\s*MP_REGISTER_MODULE\((.*?),\s*(.*?),\s*(.*?)\);", flags=re.DOTALL)
+pattern = re.compile(r"\s*MP_REGISTER_MODULE\((.*?),\s*(.*?)\);", flags=re.DOTALL)
 
 
-def find_c_file(obj_file, vpath):
-    """Search vpaths for the c file that matches the provided object_file.
+def find_module_registrations(filename):
+    """Find any MP_REGISTER_MODULE definitions in the provided file.
 
-    :param str obj_file: object file to find the matching c file for
-    :param List[str] vpath: List of base paths, similar to gcc vpath
-    :return: str path to c file or None
-    """
-    c_file = None
-    relative_c_file = os.path.splitext(obj_file)[0] + ".c"
-    relative_c_file = relative_c_file.lstrip("/\\")
-    for p in vpath:
-        possible_c_file = os.path.join(p, relative_c_file)
-        if os.path.exists(possible_c_file):
-            c_file = possible_c_file
-            break
-
-    return c_file
-
-
-def find_module_registrations(c_file):
-    """Find any MP_REGISTER_MODULE definitions in the provided c file.
-
-    :param str c_file: path to c file to check
-    :return: List[(module_name, obj_module, enabled_define)]
+    :param str filename: path to file to check
+    :return: List[(module_name, obj_module)]
     """
     global pattern
 
-    if c_file is None:
-        # No c file to match the object file, skip
-        return set()
-
-    with io.open(c_file, encoding="utf-8") as c_file_obj:
+    with io.open(filename, encoding="utf-8") as c_file_obj:
         return set(re.findall(pattern, c_file_obj.read()))
 
 
 def generate_module_table_header(modules):
     """Generate header with module table entries for builtin modules.
 
-    :param List[(module_name, obj_module, enabled_define)] modules: module defs
+    :param List[(module_name, obj_module)] modules: module defs
     :return: None
     """
 
     # Print header file for all external modules.
-    mod_defs = []
+    mod_defs = set()
     print("// Automatically generated by makemoduledefs.py.\n")
-    for module_name, obj_module, enabled_define in modules:
+    for module_name, obj_module in modules:
         mod_def = "MODULE_DEF_{}".format(module_name.upper())
-        mod_defs.append(mod_def)
+        mod_defs.add(mod_def)
+        if "," in obj_module:
+            print(
+                "ERROR: Call to MP_REGISTER_MODULE({}, {}) should be MP_REGISTER_MODULE({}, {})\n".format(
+                    module_name, obj_module, module_name, obj_module.split(",")[0]
+                ),
+                file=sys.stderr,
+            )
+            sys.exit(1)
         print(
             (
-                "#if ({enabled_define})\n"
-                "    extern const struct _mp_obj_module_t {obj_module};\n"
-                "    #define {mod_def} {{ MP_ROM_QSTR({module_name}), MP_ROM_PTR(&{obj_module}) }},\n"
-                "#else\n"
-                "    #define {mod_def}\n"
-                "#endif\n"
+                "extern const struct _mp_obj_module_t {obj_module};\n"
+                "#undef {mod_def}\n"
+                "#define {mod_def} {{ MP_ROM_QSTR({module_name}), MP_ROM_PTR(&{obj_module}) }},\n"
             ).format(
                 module_name=module_name,
                 obj_module=obj_module,
-                enabled_define=enabled_define,
                 mod_def=mod_def,
             )
         )
 
     print("\n#define MICROPY_REGISTERED_MODULES \\")
 
-    for mod_def in mod_defs:
+    for mod_def in sorted(mod_defs):
         print("    {mod_def} \\".format(mod_def=mod_def))
 
     print("// MICROPY_REGISTERED_MODULES")
@@ -90,19 +71,10 @@ def generate_module_table_header(modules):
 
 def main():
     parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--vpath", default=".", help="comma separated list of folders to search for c files in"
-    )
-    parser.add_argument("files", nargs="*", help="list of c files to search")
+    parser.add_argument("file", nargs=1, help="file with MP_REGISTER_MODULE definitions")
     args = parser.parse_args()
 
-    vpath = [p.strip() for p in args.vpath.split(",")]
-
-    modules = set()
-    for obj_file in args.files:
-        c_file = find_c_file(obj_file, vpath)
-        modules |= find_module_registrations(c_file)
-
+    modules = find_module_registrations(args.file[0])
     generate_module_table_header(sorted(modules))
 
 
diff --git a/python/src/py/makeqstrdata.py b/python/src/py/makeqstrdata.py
index 403c40688..e332ab94e 100644
--- a/python/src/py/makeqstrdata.py
+++ b/python/src/py/makeqstrdata.py
@@ -317,26 +317,24 @@ def parse_input_headers(infiles):
     return qcfgs, qstrs
 
 
+def escape_bytes(qstr, qbytes):
+    if all(32 <= ord(c) <= 126 and c != "\\" and c != '"' for c in qstr):
+        # qstr is all printable ASCII so render it as-is (for easier debugging)
+        return qstr
+    else:
+        # qstr contains non-printable codes so render entire thing as hex pairs
+        return "".join(("\\x%02x" % b) for b in qbytes)
+
+
 def make_bytes(cfg_bytes_len, cfg_bytes_hash, qstr):
     qbytes = bytes_cons(qstr, "utf8")
     qlen = len(qbytes)
     qhash = compute_hash(qbytes, cfg_bytes_hash)
-    if all(32 <= ord(c) <= 126 and c != "\\" and c != '"' for c in qstr):
-        # qstr is all printable ASCII so render it as-is (for easier debugging)
-        qdata = qstr
-    else:
-        # qstr contains non-printable codes so render entire thing as hex pairs
-        qdata = "".join(("\\x%02x" % b) for b in qbytes)
     if qlen >= (1 << (8 * cfg_bytes_len)):
         print("qstr is too long:", qstr)
         assert False
-    qlen_str = ("\\x%02x" * cfg_bytes_len) % tuple(
-        ((qlen >> (8 * i)) & 0xFF) for i in range(cfg_bytes_len)
-    )
-    qhash_str = ("\\x%02x" * cfg_bytes_hash) % tuple(
-        ((qhash >> (8 * i)) & 0xFF) for i in range(cfg_bytes_hash)
-    )
-    return '(const byte*)"%s%s" "%s"' % (qhash_str, qlen_str, qdata)
+    qdata = escape_bytes(qstr, qbytes)
+    return '%d, %d, "%s"' % (qhash, qlen, qdata)
 
 
 def print_qstr_data(qcfgs, qstrs):
@@ -349,10 +347,7 @@ def print_qstr_data(qcfgs, qstrs):
     print("")
 
     # add NULL qstr with no hash or data
-    print(
-        'QDEF(MP_QSTRnull, (const byte*)"%s%s" "")'
-        % ("\\x00" * cfg_bytes_hash, "\\x00" * cfg_bytes_len)
-    )
+    print('QDEF(MP_QSTRnull, 0, 0, "")')
 
     # go through each qstr and print it out
     for order, ident, qstr in sorted(qstrs.values(), key=lambda x: x[0]):
diff --git a/python/src/py/makeqstrdefs.py b/python/src/py/makeqstrdefs.py
index 187a9aeea..4c416a874 100644
--- a/python/src/py/makeqstrdefs.py
+++ b/python/src/py/makeqstrdefs.py
@@ -21,6 +21,17 @@ _MODE_QSTR = "qstr"
 # Extract MP_COMPRESSED_ROM_TEXT("") macros.  (Which come from MP_ERROR_TEXT)
 _MODE_COMPRESS = "compress"
 
+# Extract MP_REGISTER_MODULE(...) macros.
+_MODE_MODULE = "module"
+
+
+def is_c_source(fname):
+    return os.path.splitext(fname)[1] in [".c"]
+
+
+def is_cxx_source(fname):
+    return os.path.splitext(fname)[1] in [".cc", ".cp", ".cxx", ".cpp", ".CPP", ".c++", ".C"]
+
 
 def preprocess():
     if any(src in args.dependencies for src in args.changed_sources):
@@ -32,9 +43,9 @@ def preprocess():
     csources = []
     cxxsources = []
     for source in sources:
-        if source.endswith(".cpp"):
+        if is_cxx_source(source):
             cxxsources.append(source)
-        elif source.endswith(".c"):
+        elif is_c_source(source):
             csources.append(source)
     try:
         os.makedirs(os.path.dirname(args.output[0]))
@@ -77,6 +88,8 @@ def process_file(f):
         re_match = re.compile(r"MP_QSTR_[_a-zA-Z0-9]+")
     elif args.mode == _MODE_COMPRESS:
         re_match = re.compile(r'MP_COMPRESSED_ROM_TEXT\("([^"]*)"\)')
+    elif args.mode == _MODE_MODULE:
+        re_match = re.compile(r"MP_REGISTER_MODULE\(.*?,\s*.*?\);")
     output = []
     last_fname = None
     for line in f:
@@ -87,7 +100,7 @@ def process_file(f):
             m = re_line.match(line)
             assert m is not None
             fname = m.group(1)
-            if os.path.splitext(fname)[1] not in [".c", ".cpp"]:
+            if not is_c_source(fname) and not is_cxx_source(fname):
                 continue
             if fname != last_fname:
                 write_out(last_fname, output)
@@ -98,7 +111,7 @@ def process_file(f):
             if args.mode == _MODE_QSTR:
                 name = match.replace("MP_QSTR_", "")
                 output.append("Q(" + name + ")")
-            elif args.mode == _MODE_COMPRESS:
+            elif args.mode in (_MODE_COMPRESS, _MODE_MODULE):
                 output.append(match)
 
     if last_fname:
@@ -133,6 +146,8 @@ def cat_together():
     mode_full = "QSTR"
     if args.mode == _MODE_COMPRESS:
         mode_full = "Compressed data"
+    elif args.mode == _MODE_MODULE:
+        mode_full = "Module registrations"
     if old_hash != new_hash:
         print(mode_full, "updated")
         try:
@@ -193,7 +208,7 @@ if __name__ == "__main__":
     args.output_dir = sys.argv[4]
     args.output_file = None if len(sys.argv) == 5 else sys.argv[5]  # Unused for command=split
 
-    if args.mode not in (_MODE_QSTR, _MODE_COMPRESS):
+    if args.mode not in (_MODE_QSTR, _MODE_COMPRESS, _MODE_MODULE):
         print("error: mode %s unrecognised" % sys.argv[2])
         sys.exit(2)
 
diff --git a/python/src/py/malloc.c b/python/src/py/malloc.c
index c775d5b15..efdff7539 100644
--- a/python/src/py/malloc.c
+++ b/python/src/py/malloc.c
@@ -207,6 +207,99 @@ void m_free(void *ptr)
     #endif
 }
 
+#if MICROPY_TRACKED_ALLOC
+
+#define MICROPY_TRACKED_ALLOC_STORE_SIZE (!MICROPY_ENABLE_GC)
+
+typedef struct _m_tracked_node_t {
+    struct _m_tracked_node_t *prev;
+    struct _m_tracked_node_t *next;
+    #if MICROPY_TRACKED_ALLOC_STORE_SIZE
+    uintptr_t size;
+    #endif
+    uint8_t data[];
+} m_tracked_node_t;
+
+#if MICROPY_DEBUG_VERBOSE
+STATIC size_t m_tracked_count_links(size_t *nb) {
+    m_tracked_node_t *node = MP_STATE_VM(m_tracked_head);
+    size_t n = 0;
+    *nb = 0;
+    while (node != NULL) {
+        ++n;
+        #if MICROPY_TRACKED_ALLOC_STORE_SIZE
+        *nb += node->size;
+        #else
+        *nb += gc_nbytes(node);
+        #endif
+        node = node->next;
+    }
+    return n;
+}
+#endif
+
+void *m_tracked_calloc(size_t nmemb, size_t size) {
+    m_tracked_node_t *node = m_malloc_maybe(sizeof(m_tracked_node_t) + nmemb * size);
+    if (node == NULL) {
+        return NULL;
+    }
+    #if MICROPY_DEBUG_VERBOSE
+    size_t nb;
+    size_t n = m_tracked_count_links(&nb);
+    DEBUG_printf("m_tracked_calloc(%u, %u) -> (%u;%u) %p\n", (int)nmemb, (int)size, (int)n, (int)nb, node);
+    #endif
+    if (MP_STATE_VM(m_tracked_head) != NULL) {
+        MP_STATE_VM(m_tracked_head)->prev = node;
+    }
+    node->prev = NULL;
+    node->next = MP_STATE_VM(m_tracked_head);
+    MP_STATE_VM(m_tracked_head) = node;
+    #if MICROPY_TRACKED_ALLOC_STORE_SIZE
+    node->size = nmemb * size;
+    #endif
+    #if !MICROPY_GC_CONSERVATIVE_CLEAR
+    memset(&node->data[0], 0, nmemb * size);
+    #endif
+    return &node->data[0];
+}
+
+void m_tracked_free(void *ptr_in) {
+    if (ptr_in == NULL) {
+        return;
+    }
+    m_tracked_node_t *node = (m_tracked_node_t *)((uint8_t *)ptr_in - sizeof(m_tracked_node_t));
+    #if MICROPY_DEBUG_VERBOSE
+    size_t data_bytes;
+    #if MICROPY_TRACKED_ALLOC_STORE_SIZE
+    data_bytes = node->size;
+    #else
+    data_bytes = gc_nbytes(node);
+    #endif
+    size_t nb;
+    size_t n = m_tracked_count_links(&nb);
+    DEBUG_printf("m_tracked_free(%p, [%p, %p], nbytes=%u, links=%u;%u)\n", node, node->prev, node->next, (int)data_bytes, (int)n, (int)nb);
+    #endif
+    if (node->next != NULL) {
+        node->next->prev = node->prev;
+    }
+    if (node->prev != NULL) {
+        node->prev->next = node->next;
+    } else {
+        MP_STATE_VM(m_tracked_head) = node->next;
+    }
+    m_free(node
+        #if MICROPY_MALLOC_USES_ALLOCATED_SIZE
+        #if MICROPY_TRACKED_ALLOC_STORE_SIZE
+        , node->size
+        #else
+        , gc_nbytes(node)
+        #endif
+        #endif
+        );
+}
+
+#endif // MICROPY_TRACKED_ALLOC
+
 #if MICROPY_MEM_STATS
 size_t m_get_total_bytes_allocated(void) {
     return MP_STATE_MEM(total_bytes_allocated);
diff --git a/python/src/py/map.c b/python/src/py/map.c
index 54f4b0204..b194250cb 100644
--- a/python/src/py/map.c
+++ b/python/src/py/map.c
@@ -40,6 +40,27 @@
 #define DEBUG_printf(...) (void)0
 #endif
 
+#if MICROPY_OPT_MAP_LOOKUP_CACHE
+// MP_STATE_VM(map_lookup_cache) provides a cache of index to the last known
+// position of that index in any map. On a cache hit, this allows
+// short-circuiting the full linear search in the case of an ordered map
+// (i.e. all builtin modules and objects' locals dicts), and computation of
+// the hash (and potentially some linear probing) in the case of a regular
+// map. Note the same cache is shared across all maps.
+
+// Gets the index into the cache for this index. Shift down by two to remove
+// mp_obj_t tag bits.
+#define MAP_CACHE_OFFSET(index) ((((uintptr_t)(index)) >> 2) % MICROPY_OPT_MAP_LOOKUP_CACHE_SIZE)
+// Gets the map cache entry for the corresponding index.
+#define MAP_CACHE_ENTRY(index) (MP_STATE_VM(map_lookup_cache)[MAP_CACHE_OFFSET(index)])
+// Retrieve the mp_obj_t at the location suggested by the cache.
+#define MAP_CACHE_GET(map, index) (&(map)->table[MAP_CACHE_ENTRY(index) % (map)->alloc])
+// Update the cache for this index.
+#define MAP_CACHE_SET(index, pos) MAP_CACHE_ENTRY(index) = (pos) & 0xff;
+#else
+#define MAP_CACHE_SET(index, pos)
+#endif
+
 // This table of sizes is used to control the growth of hash tables.
 // The first set of sizes are chosen so the allocation fits exactly in a
 // 4-word GC block, and it's not so important for these small values to be
@@ -132,10 +153,22 @@ STATIC void mp_map_rehash(mp_map_t *map) {
 //  - returns slot, with key non-null and value=MP_OBJ_NULL if it was added
 // MP_MAP_LOOKUP_REMOVE_IF_FOUND behaviour:
 //  - returns NULL if not found, else the slot if was found in with key null and value non-null
-mp_map_elem_t *mp_map_lookup(mp_map_t *map, mp_obj_t index, mp_map_lookup_kind_t lookup_kind) {
+mp_map_elem_t *MICROPY_WRAP_MP_MAP_LOOKUP(mp_map_lookup)(mp_map_t * map, mp_obj_t index, mp_map_lookup_kind_t lookup_kind) {
     // If the map is a fixed array then we must only be called for a lookup
     assert(!map->is_fixed || lookup_kind == MP_MAP_LOOKUP);
 
+    #if MICROPY_OPT_MAP_LOOKUP_CACHE
+    // Try the cache for lookup or add-if-not-found.
+    if (lookup_kind != MP_MAP_LOOKUP_REMOVE_IF_FOUND && map->alloc) {
+        mp_map_elem_t *slot = MAP_CACHE_GET(map, index);
+        // Note: Just comparing key for value equality will have false negatives, but
+        // these will be handled by the regular path below.
+        if (slot->key == index) {
+            return slot;
+        }
+    }
+    #endif
+
     // Work out if we can compare just pointers
     bool compare_only_ptrs = map->all_keys_are_qstrs;
     if (compare_only_ptrs) {
@@ -172,6 +205,7 @@ mp_map_elem_t *mp_map_lookup(mp_map_t *map, mp_obj_t index, mp_map_lookup_kind_t
                     elem->value = value;
                 }
                 #endif
+                MAP_CACHE_SET(index, elem - map->table);
                 return elem;
             }
         }
@@ -254,6 +288,7 @@ mp_map_elem_t *mp_map_lookup(mp_map_t *map, mp_obj_t index, mp_map_lookup_kind_t
                 }
                 // keep slot->value so that caller can access it if needed
             }
+            MAP_CACHE_SET(index, pos);
             return slot;
         }
 
diff --git a/python/src/py/misc.h b/python/src/py/misc.h
index e1d27dc7b..d94afd0b0 100644
--- a/python/src/py/misc.h
+++ b/python/src/py/misc.h
@@ -103,6 +103,13 @@ void m_free(void *ptr);
 #endif
 NORETURN void m_malloc_fail(size_t num_bytes);
 
+#if MICROPY_TRACKED_ALLOC
+// These alloc/free functions track the pointers in a linked list so the GC does not reclaim
+// them.  They can be used by code that requires traditional C malloc/free semantics.
+void *m_tracked_calloc(size_t nmemb, size_t size);
+void m_tracked_free(void *ptr_in);
+#endif
+
 #if MICROPY_MEM_STATS
 size_t m_get_total_bytes_allocated(void);
 size_t m_get_current_bytes_allocated(void);
diff --git a/python/src/py/mkrules.cmake b/python/src/py/mkrules.cmake
index 9d0801793..d0dc01962 100644
--- a/python/src/py/mkrules.cmake
+++ b/python/src/py/mkrules.cmake
@@ -2,13 +2,24 @@
 
 set(MICROPY_GENHDR_DIR "${CMAKE_BINARY_DIR}/genhdr")
 set(MICROPY_MPVERSION "${MICROPY_GENHDR_DIR}/mpversion.h")
-set(MICROPY_MODULEDEFS "${MICROPY_GENHDR_DIR}/moduledefs.h")
 set(MICROPY_QSTRDEFS_PY "${MICROPY_PY_DIR}/qstrdefs.h")
 set(MICROPY_QSTRDEFS_LAST "${MICROPY_GENHDR_DIR}/qstr.i.last")
 set(MICROPY_QSTRDEFS_SPLIT "${MICROPY_GENHDR_DIR}/qstr.split")
 set(MICROPY_QSTRDEFS_COLLECTED "${MICROPY_GENHDR_DIR}/qstrdefs.collected.h")
 set(MICROPY_QSTRDEFS_PREPROCESSED "${MICROPY_GENHDR_DIR}/qstrdefs.preprocessed.h")
 set(MICROPY_QSTRDEFS_GENERATED "${MICROPY_GENHDR_DIR}/qstrdefs.generated.h")
+set(MICROPY_MODULEDEFS_SPLIT "${MICROPY_GENHDR_DIR}/moduledefs.split")
+set(MICROPY_MODULEDEFS_COLLECTED "${MICROPY_GENHDR_DIR}/moduledefs.collected")
+set(MICROPY_MODULEDEFS "${MICROPY_GENHDR_DIR}/moduledefs.h")
+
+# Need to do this before extracting MICROPY_CPP_DEF below. Rest of frozen
+# manifest handling is at the end of this file.
+if(MICROPY_FROZEN_MANIFEST)
+    target_compile_definitions(${MICROPY_TARGET} PUBLIC
+        MICROPY_QSTR_EXTRA_POOL=mp_qstr_frozen_const_pool
+        MICROPY_MODULE_FROZEN_MPY=\(1\)
+    )
+endif()
 
 # Provide defaults for preprocessor flags if not already defined
 if(NOT MICROPY_CPP_FLAGS)
@@ -34,6 +45,7 @@ find_package(Python3 REQUIRED COMPONENTS Interpreter)
 target_sources(${MICROPY_TARGET} PRIVATE
     ${MICROPY_MPVERSION}
     ${MICROPY_QSTRDEFS_GENERATED}
+    ${MICROPY_MODULEDEFS}
 )
 
 # Command to force the build of another command
@@ -53,15 +65,6 @@ add_custom_command(
     DEPENDS MICROPY_FORCE_BUILD
 )
 
-# Generate moduledefs.h
-
-add_custom_command(
-    OUTPUT ${MICROPY_MODULEDEFS}
-    COMMAND ${Python3_EXECUTABLE} ${MICROPY_PY_DIR}/makemoduledefs.py --vpath="/" ${MICROPY_SOURCE_QSTR} > ${MICROPY_MODULEDEFS}
-    DEPENDS ${MICROPY_MPVERSION}
-        ${MICROPY_SOURCE_QSTR}
-)
-
 # Generate qstrs
 
 # If any of the dependencies in this rule change then the C-preprocessor step must be run.
@@ -70,7 +73,7 @@ add_custom_command(
 add_custom_command(
     OUTPUT ${MICROPY_QSTRDEFS_LAST}
     COMMAND ${Python3_EXECUTABLE} ${MICROPY_PY_DIR}/makeqstrdefs.py pp ${CMAKE_C_COMPILER} -E output ${MICROPY_GENHDR_DIR}/qstr.i.last cflags ${MICROPY_CPP_FLAGS} -DNO_QSTR cxxflags ${MICROPY_CPP_FLAGS} -DNO_QSTR sources ${MICROPY_SOURCE_QSTR}
-    DEPENDS ${MICROPY_MODULEDEFS}
+    DEPENDS ${MICROPY_MPVERSION}
         ${MICROPY_SOURCE_QSTR}
     VERBATIM
     COMMAND_EXPAND_LISTS
@@ -111,6 +114,31 @@ add_custom_command(
     COMMAND_EXPAND_LISTS
 )
 
+# Generate moduledefs.h
+
+add_custom_command(
+    OUTPUT ${MICROPY_MODULEDEFS_SPLIT}
+    COMMAND ${Python3_EXECUTABLE} ${MICROPY_PY_DIR}/makeqstrdefs.py split module ${MICROPY_GENHDR_DIR}/qstr.i.last ${MICROPY_GENHDR_DIR}/module _
+    COMMAND touch ${MICROPY_MODULEDEFS_SPLIT}
+    DEPENDS ${MICROPY_QSTRDEFS_LAST}
+    VERBATIM
+    COMMAND_EXPAND_LISTS
+)
+
+add_custom_command(
+    OUTPUT ${MICROPY_MODULEDEFS_COLLECTED}
+    COMMAND ${Python3_EXECUTABLE} ${MICROPY_PY_DIR}/makeqstrdefs.py cat module _ ${MICROPY_GENHDR_DIR}/module ${MICROPY_MODULEDEFS_COLLECTED}
+    DEPENDS ${MICROPY_MODULEDEFS_SPLIT}
+    VERBATIM
+    COMMAND_EXPAND_LISTS
+)
+
+add_custom_command(
+    OUTPUT ${MICROPY_MODULEDEFS}
+    COMMAND ${Python3_EXECUTABLE} ${MICROPY_PY_DIR}/makemoduledefs.py ${MICROPY_MODULEDEFS_COLLECTED} > ${MICROPY_MODULEDEFS}
+    DEPENDS ${MICROPY_MODULEDEFS_COLLECTED}
+)
+
 # Build frozen code if enabled
 
 if(MICROPY_FROZEN_MANIFEST)
@@ -120,10 +148,7 @@ if(MICROPY_FROZEN_MANIFEST)
         ${MICROPY_FROZEN_CONTENT}
     )
 
-    target_compile_definitions(${MICROPY_TARGET} PUBLIC
-        MICROPY_QSTR_EXTRA_POOL=mp_qstr_frozen_const_pool
-        MICROPY_MODULE_FROZEN_MPY=\(1\)
-    )
+    # Note: target_compile_definitions already added earlier.
 
     if(NOT MICROPY_LIB_DIR)
         set(MICROPY_LIB_DIR ${MICROPY_DIR}/../micropython-lib)
@@ -153,3 +178,10 @@ if(MICROPY_FROZEN_MANIFEST)
         VERBATIM
     )
 endif()
+
+# Update submodules
+if(ECHO_SUBMODULES)
+    # If cmake is run with ECHO_SUBMODULES defined on the command line, process the port / board
+    # settings then print the final GIT_SUBMODULES variable as a fatal error and exit.
+    message(FATAL_ERROR "GIT_SUBMODULES=${GIT_SUBMODULES}")
+endif()
diff --git a/python/src/py/mkrules.mk b/python/src/py/mkrules.mk
index bde96c7b4..fa1aad881 100644
--- a/python/src/py/mkrules.mk
+++ b/python/src/py/mkrules.mk
@@ -7,6 +7,9 @@ endif
 # Extra deps that need to happen before object compilation.
 OBJ_EXTRA_ORDER_DEPS =
 
+# Generate moduledefs.h.
+OBJ_EXTRA_ORDER_DEPS += $(HEADER_BUILD)/moduledefs.h
+
 ifeq ($(MICROPY_ROM_TEXT_COMPRESSION),1)
 # If compression is enabled, trigger the build of compressed.data.h...
 OBJ_EXTRA_ORDER_DEPS += $(HEADER_BUILD)/compressed.data.h
@@ -16,7 +19,7 @@ endif
 
 # QSTR generation uses the same CFLAGS, with these modifications.
 QSTR_GEN_FLAGS = -DNO_QSTR
-# Note: := to force evaluation immediately.
+# Note: := to force evaluation immediately.
 QSTR_GEN_CFLAGS := $(CFLAGS)
 QSTR_GEN_CFLAGS += $(QSTR_GEN_FLAGS)
 QSTR_GEN_CXXFLAGS := $(CXXFLAGS)
@@ -28,7 +31,7 @@ QSTR_GEN_CXXFLAGS += $(QSTR_GEN_FLAGS)
 # tree.
 #
 # So for example, py/map.c would have an object file name py/map.o
-# The object files will go into the build directory and maintain the same
+# The object files will go into the build directory and maintain the same
 # directory structure as the source tree. So the final dependency will look
 # like this:
 #
@@ -100,7 +103,7 @@ $(OBJ): | $(HEADER_BUILD)/qstrdefs.generated.h $(HEADER_BUILD)/mpversion.h $(OBJ
 # - else, if list of newer prerequisites ($?) is not empty, then process just these ($?)
 # - else, process all source files ($^) [this covers "make -B" which can set $? to empty]
 # See more information about this process in docs/develop/qstr.rst.
-$(HEADER_BUILD)/qstr.i.last: $(SRC_QSTR) $(QSTR_GLOBAL_DEPENDENCIES) $(HEADER_BUILD)/moduledefs.h | $(QSTR_GLOBAL_REQUIREMENTS)
+$(HEADER_BUILD)/qstr.i.last: $(SRC_QSTR) $(QSTR_GLOBAL_DEPENDENCIES) | $(QSTR_GLOBAL_REQUIREMENTS)
 	$(ECHO) "GEN $@"
 	$(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py pp $(CPP) output $(HEADER_BUILD)/qstr.i.last cflags $(QSTR_GEN_CFLAGS) cxxflags $(QSTR_GEN_CXXFLAGS) sources $^ dependencies $(QSTR_GLOBAL_DEPENDENCIES) changed_sources $?
 
@@ -113,6 +116,16 @@ $(QSTR_DEFS_COLLECTED): $(HEADER_BUILD)/qstr.split
 	$(ECHO) "GEN $@"
 	$(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py cat qstr _ $(HEADER_BUILD)/qstr $@
 
+# Module definitions via MP_REGISTER_MODULE.
+$(HEADER_BUILD)/moduledefs.split: $(HEADER_BUILD)/qstr.i.last
+	$(ECHO) "GEN $@"
+	$(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py split module $< $(HEADER_BUILD)/module _
+	$(Q)$(TOUCH) $@
+
+$(HEADER_BUILD)/moduledefs.collected: $(HEADER_BUILD)/moduledefs.split
+	$(ECHO) "GEN $@"
+	$(Q)$(PYTHON) $(PY_SRC)/makeqstrdefs.py cat module _ $(HEADER_BUILD)/module $@
+
 # Compressed error strings.
 $(HEADER_BUILD)/compressed.split: $(HEADER_BUILD)/qstr.i.last
 	$(ECHO) "GEN $@"
@@ -142,49 +155,24 @@ $(MICROPY_MPYCROSS_DEPENDENCY):
 	$(MAKE) -C $(dir $@)
 endif
 
+ifneq ($(FROZEN_DIR),)
+$(error Support for FROZEN_DIR was removed. Please use manifest.py instead, see https://docs.micropython.org/en/latest/reference/manifest.html)
+endif
+
+ifneq ($(FROZEN_MPY_DIR),)
+$(error Support for FROZEN_MPY_DIR was removed. Please use manifest.py instead, see https://docs.micropython.org/en/latest/reference/manifest.html)
+endif
+
 ifneq ($(FROZEN_MANIFEST),)
 # to build frozen_content.c from a manifest
 $(BUILD)/frozen_content.c: FORCE $(BUILD)/genhdr/qstrdefs.generated.h | $(MICROPY_MPYCROSS_DEPENDENCY)
 	$(Q)$(MAKE_MANIFEST) -o $@ -v "MPY_DIR=$(TOP)" -v "MPY_LIB_DIR=$(MPY_LIB_DIR)" -v "PORT_DIR=$(shell pwd)" -v "BOARD_DIR=$(BOARD_DIR)" -b "$(BUILD)" $(if $(MPY_CROSS_FLAGS),-f"$(MPY_CROSS_FLAGS)",) --mpy-tool-flags="$(MPY_TOOL_FLAGS)" $(FROZEN_MANIFEST)
-
-ifneq ($(FROZEN_DIR),)
-$(error FROZEN_DIR cannot be used in conjunction with FROZEN_MANIFEST)
-endif
-
-ifneq ($(FROZEN_MPY_DIR),)
-$(error FROZEN_MPY_DIR cannot be used in conjunction with FROZEN_MANIFEST)
-endif
-endif
-
-ifneq ($(FROZEN_DIR),)
-$(info Warning: FROZEN_DIR is deprecated in favour of FROZEN_MANIFEST)
-$(BUILD)/frozen.c: $(wildcard $(FROZEN_DIR)/*) $(HEADER_BUILD) $(FROZEN_EXTRA_DEPS)
-	$(ECHO) "GEN $@"
-	$(Q)$(MAKE_FROZEN) $(FROZEN_DIR) > $@
-endif
-
-ifneq ($(FROZEN_MPY_DIR),)
-$(info Warning: FROZEN_MPY_DIR is deprecated in favour of FROZEN_MANIFEST)
-# make a list of all the .py files that need compiling and freezing
-FROZEN_MPY_PY_FILES := $(shell find -L $(FROZEN_MPY_DIR) -type f -name '*.py' | $(SED) -e 's=^$(FROZEN_MPY_DIR)/==')
-FROZEN_MPY_MPY_FILES := $(addprefix $(BUILD)/frozen_mpy/,$(FROZEN_MPY_PY_FILES:.py=.mpy))
-
-# to build .mpy files from .py files
-$(BUILD)/frozen_mpy/%.mpy: $(FROZEN_MPY_DIR)/%.py | $(MICROPY_MPYCROSS_DEPENDENCY)
-	@$(ECHO) "MPY $<"
-	$(Q)$(MKDIR) -p $(dir $@)
-	$(Q)$(MICROPY_MPYCROSS) -o $@ -s $(<:$(FROZEN_MPY_DIR)/%=%) $(MPY_CROSS_FLAGS) $<
-
-# to build frozen_mpy.c from all .mpy files
-$(BUILD)/frozen_mpy.c: $(FROZEN_MPY_MPY_FILES) $(BUILD)/genhdr/qstrdefs.generated.h
-	@$(ECHO) "GEN $@"
-	$(Q)$(MPY_TOOL) -f -q $(BUILD)/genhdr/qstrdefs.preprocessed.h $(FROZEN_MPY_MPY_FILES) > $@
 endif
 
 ifneq ($(PROG),)
 # Build a standalone executable (unix does this)
 
-# The executable should have an .exe extension for builds targeting 'pure'
+# The executable should have an .exe extension for builds targeting 'pure'
 # Windows, i.e. msvc or mingw builds, but not when using msys or cygwin's gcc.
 COMPILER_TARGET := $(shell $(CC) -dumpmachine)
 ifneq (,$(findstring mingw,$(COMPILER_TARGET)))
@@ -234,27 +222,6 @@ clean:
 	$(RM) -rf $(BUILD) $(CLEAN_EXTRA)
 .PHONY: clean
 
-# Clean every non-git file from FROZEN_DIR/FROZEN_MPY_DIR, but making a backup.
-# We run rmdir below to avoid empty backup dir (it will silently fail if backup
-# is non-empty).
-clean-frozen:
-	if [ -n "$(FROZEN_MPY_DIR)" ]; then \
-	backup_dir=$(FROZEN_MPY_DIR).$$(date +%Y%m%dT%H%M%S); mkdir $$backup_dir; \
-	cd $(FROZEN_MPY_DIR); git status --ignored -u all -s . | awk ' {print $$2}' \
-	| xargs --no-run-if-empty cp --parents -t ../$$backup_dir; \
-	rmdir ../$$backup_dir 2>/dev/null || true; \
-	git clean -d -f .; \
-	fi
-
-	if [ -n "$(FROZEN_DIR)" ]; then \
-	backup_dir=$(FROZEN_DIR).$$(date +%Y%m%dT%H%M%S); mkdir $$backup_dir; \
-	cd $(FROZEN_DIR); git status --ignored -u all -s . | awk ' {print $$2}' \
-	| xargs --no-run-if-empty cp --parents -t ../$$backup_dir; \
-	rmdir ../$$backup_dir 2>/dev/null || true; \
-	git clean -d -f .; \
-	fi
-.PHONY: clean-frozen
-
 print-cfg:
 	$(ECHO) "PY_SRC = $(PY_SRC)"
 	$(ECHO) "BUILD  = $(BUILD)"
diff --git a/python/src/py/modarray.c b/python/src/py/modarray.c
index 9ab1795f8..d9f7a0452 100644
--- a/python/src/py/modarray.c
+++ b/python/src/py/modarray.c
@@ -40,6 +40,6 @@ const mp_obj_module_t mp_module_uarray = {
     .globals = (mp_obj_dict_t *)&mp_module_array_globals,
 };
 
-MP_REGISTER_MODULE(MP_QSTR_uarray, mp_module_uarray, MICROPY_PY_ARRAY);
+MP_REGISTER_MODULE(MP_QSTR_uarray, mp_module_uarray);
 
 #endif
diff --git a/python/src/py/modbuiltins.c b/python/src/py/modbuiltins.c
index 2a142a6bb..f3caccbc8 100644
--- a/python/src/py/modbuiltins.c
+++ b/python/src/py/modbuiltins.c
@@ -79,7 +79,7 @@ STATIC mp_obj_t mp_builtin___build_class__(size_t n_args, const mp_obj_t *args)
     meta_args[2] = class_locals; // dict of members
     mp_obj_t new_class = mp_call_function_n_kw(meta, 3, 0, meta_args);
 
-    // store into cell if needed
+    // store into cell if needed
     if (cell != mp_const_none) {
         mp_obj_cell_set(cell, new_class);
     }
@@ -729,6 +729,9 @@ STATIC const mp_rom_map_elem_t mp_module_builtins_globals_table[] = {
     #endif
     { MP_ROM_QSTR(MP_QSTR_next), MP_ROM_PTR(&mp_builtin_next_obj) },
     { MP_ROM_QSTR(MP_QSTR_oct), MP_ROM_PTR(&mp_builtin_oct_obj) },
+    #if MICROPY_PY_IO
+    { MP_ROM_QSTR(MP_QSTR_open), MP_ROM_PTR(&mp_builtin_open_obj) },
+    #endif
     { MP_ROM_QSTR(MP_QSTR_ord), MP_ROM_PTR(&mp_builtin_ord_obj) },
     { MP_ROM_QSTR(MP_QSTR_pow), MP_ROM_PTR(&mp_builtin_pow_obj) },
     { MP_ROM_QSTR(MP_QSTR_print), MP_ROM_PTR(&mp_builtin_print_obj) },
@@ -775,6 +778,7 @@ STATIC const mp_rom_map_elem_t mp_module_builtins_globals_table[] = {
 
     // Extra builtins as defined by a port
     MICROPY_PORT_BUILTINS
+    MICROPY_PORT_EXTRA_BUILTINS
 };
 
 MP_DEFINE_CONST_DICT(mp_module_builtins_globals, mp_module_builtins_globals_table);
@@ -783,3 +787,5 @@ const mp_obj_module_t mp_module_builtins = {
     .base = { &mp_type_module },
     .globals = (mp_obj_dict_t *)&mp_module_builtins_globals,
 };
+
+MP_REGISTER_MODULE(MP_QSTR_builtins, mp_module_builtins);
diff --git a/python/src/py/modcmath.c b/python/src/py/modcmath.c
index fb1f2a8fc..1418362ad 100644
--- a/python/src/py/modcmath.c
+++ b/python/src/py/modcmath.c
@@ -149,4 +149,6 @@ const mp_obj_module_t mp_module_cmath = {
     .globals = (mp_obj_dict_t *)&mp_module_cmath_globals,
 };
 
-#endif // MICROPY_PY_BUILTINS_FLOAT && MICROPY_PY_CMATH
+MP_REGISTER_MODULE(MP_QSTR_cmath, mp_module_cmath);
+
+#endif // MICROPY_PY_BUILTINS_FLOAT && MICROPY_PY_BUILTINS_COMPLEX && MICROPY_PY_CMATH
diff --git a/python/src/py/modcollections.c b/python/src/py/modcollections.c
index c145f12cc..8c62f34db 100644
--- a/python/src/py/modcollections.c
+++ b/python/src/py/modcollections.c
@@ -46,4 +46,6 @@ const mp_obj_module_t mp_module_collections = {
     .globals = (mp_obj_dict_t *)&mp_module_collections_globals,
 };
 
+MP_REGISTER_MODULE(MP_QSTR_ucollections, mp_module_collections);
+
 #endif // MICROPY_PY_COLLECTIONS
diff --git a/python/src/py/modgc.c b/python/src/py/modgc.c
index 534a711c1..c11bcaecd 100644
--- a/python/src/py/modgc.c
+++ b/python/src/py/modgc.c
@@ -115,4 +115,6 @@ const mp_obj_module_t mp_module_gc = {
     .globals = (mp_obj_dict_t *)&mp_module_gc_globals,
 };
 
+MP_REGISTER_MODULE(MP_QSTR_gc, mp_module_gc);
+
 #endif
diff --git a/python/src/py/modio.c b/python/src/py/modio.c
index 7f0d13cdf..50af0b6a4 100644
--- a/python/src/py/modio.c
+++ b/python/src/py/modio.c
@@ -121,8 +121,7 @@ typedef struct _mp_obj_bufwriter_t {
 STATIC mp_obj_t bufwriter_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
     mp_arg_check_num(n_args, n_kw, 2, 2, false);
     size_t alloc = mp_obj_get_int(args[1]);
-    mp_obj_bufwriter_t *o = m_new_obj_var(mp_obj_bufwriter_t, byte, alloc);
-    o->base.type = type;
+    mp_obj_bufwriter_t *o = mp_obj_malloc_var(mp_obj_bufwriter_t, byte, alloc, type);
     o->stream = args[0];
     o->alloc = alloc;
     o->len = 0;
@@ -204,50 +203,6 @@ STATIC const mp_obj_type_t mp_type_bufwriter = {
 };
 #endif // MICROPY_PY_IO_BUFFEREDWRITER
 
-#if MICROPY_PY_IO_RESOURCE_STREAM
-STATIC mp_obj_t resource_stream(mp_obj_t package_in, mp_obj_t path_in) {
-    VSTR_FIXED(path_buf, MICROPY_ALLOC_PATH_MAX);
-    size_t len;
-
-    // As an extension to pkg_resources.resource_stream(), we support
-    // package parameter being None, the path_in is interpreted as a
-    // raw path.
-    if (package_in != mp_const_none) {
-        // Pass "True" as sentinel value in fromlist to force returning of leaf module
-        mp_obj_t pkg = mp_import_name(mp_obj_str_get_qstr(package_in), mp_const_true, MP_OBJ_NEW_SMALL_INT(0));
-
-        mp_obj_t dest[2];
-        mp_load_method_maybe(pkg, MP_QSTR___path__, dest);
-        if (dest[0] == MP_OBJ_NULL) {
-            mp_raise_TypeError(NULL);
-        }
-
-        const char *path = mp_obj_str_get_data(dest[0], &len);
-        vstr_add_strn(&path_buf, path, len);
-        vstr_add_byte(&path_buf, '/');
-    }
-
-    const char *path = mp_obj_str_get_data(path_in, &len);
-    vstr_add_strn(&path_buf, path, len);
-
-    len = path_buf.len;
-    const char *data = mp_find_frozen_str(path_buf.buf, &len);
-    if (data != NULL) {
-        mp_obj_stringio_t *o = m_new_obj(mp_obj_stringio_t);
-        o->base.type = &mp_type_bytesio;
-        o->vstr = m_new_obj(vstr_t);
-        vstr_init_fixed_buf(o->vstr, len + 1, (char *)data);
-        o->vstr->len = len;
-        o->pos = 0;
-        return MP_OBJ_FROM_PTR(o);
-    }
-
-    mp_obj_t path_out = mp_obj_new_str(path_buf.buf, path_buf.len);
-    return mp_builtin_open(1, &path_out, (mp_map_t *)&mp_const_empty_map);
-}
-STATIC MP_DEFINE_CONST_FUN_OBJ_2(resource_stream_obj, resource_stream);
-#endif
-
 STATIC const mp_rom_map_elem_t mp_module_io_globals_table[] = {
     { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_uio) },
     // Note: mp_builtin_open_obj should be defined by port, it's not
@@ -256,9 +211,6 @@ STATIC const mp_rom_map_elem_t mp_module_io_globals_table[] = {
     #if MICROPY_PY_IO_IOBASE
     { MP_ROM_QSTR(MP_QSTR_IOBase), MP_ROM_PTR(&mp_type_iobase) },
     #endif
-    #if MICROPY_PY_IO_RESOURCE_STREAM
-    { MP_ROM_QSTR(MP_QSTR_resource_stream), MP_ROM_PTR(&resource_stream_obj) },
-    #endif
     #if MICROPY_PY_IO_FILEIO
     { MP_ROM_QSTR(MP_QSTR_FileIO), MP_ROM_PTR(&mp_type_fileio) },
     #if MICROPY_CPYTHON_COMPAT
@@ -281,4 +233,6 @@ const mp_obj_module_t mp_module_io = {
     .globals = (mp_obj_dict_t *)&mp_module_io_globals,
 };
 
+MP_REGISTER_MODULE(MP_QSTR_uio, mp_module_io);
+
 #endif
diff --git a/python/src/py/modmath.c b/python/src/py/modmath.c
index ac9e0bbc4..72b5dde51 100644
--- a/python/src/py/modmath.c
+++ b/python/src/py/modmath.c
@@ -371,6 +371,11 @@ STATIC const mp_rom_map_elem_t mp_module_math_globals_table[] = {
     { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_math) },
     { MP_ROM_QSTR(MP_QSTR_e), mp_const_float_e },
     { MP_ROM_QSTR(MP_QSTR_pi), mp_const_float_pi },
+    #if MICROPY_PY_MATH_CONSTANTS
+    { MP_ROM_QSTR(MP_QSTR_tau), mp_const_float_tau },
+    { MP_ROM_QSTR(MP_QSTR_inf), mp_const_float_inf },
+    { MP_ROM_QSTR(MP_QSTR_nan), mp_const_float_nan },
+    #endif
     { MP_ROM_QSTR(MP_QSTR_sqrt), MP_ROM_PTR(&mp_math_sqrt_obj) },
     { MP_ROM_QSTR(MP_QSTR_pow), MP_ROM_PTR(&mp_math_pow_obj) },
     { MP_ROM_QSTR(MP_QSTR_exp), MP_ROM_PTR(&mp_math_exp_obj) },
@@ -430,4 +435,6 @@ const mp_obj_module_t mp_module_math = {
     .globals = (mp_obj_dict_t *)&mp_module_math_globals,
 };
 
+MP_REGISTER_MODULE(MP_QSTR_math, mp_module_math);
+
 #endif // MICROPY_PY_BUILTINS_FLOAT && MICROPY_PY_MATH
diff --git a/python/src/py/modmicropython.c b/python/src/py/modmicropython.c
index 180f7f186..eafff90c6 100644
--- a/python/src/py/modmicropython.c
+++ b/python/src/py/modmicropython.c
@@ -209,3 +209,5 @@ const mp_obj_module_t mp_module_micropython = {
     .base = { &mp_type_module },
     .globals = (mp_obj_dict_t *)&mp_module_micropython_globals,
 };
+
+MP_REGISTER_MODULE(MP_QSTR_micropython, mp_module_micropython);
diff --git a/python/src/py/modstruct.c b/python/src/py/modstruct.c
index 4cbcad6d4..69c7279e3 100644
--- a/python/src/py/modstruct.c
+++ b/python/src/py/modstruct.c
@@ -266,4 +266,6 @@ const mp_obj_module_t mp_module_ustruct = {
     .globals = (mp_obj_dict_t *)&mp_module_struct_globals,
 };
 
+MP_REGISTER_MODULE(MP_QSTR_ustruct, mp_module_ustruct);
+
 #endif
diff --git a/python/src/py/modsys.c b/python/src/py/modsys.c
index 64349f3c3..a090f1212 100644
--- a/python/src/py/modsys.c
+++ b/python/src/py/modsys.c
@@ -27,6 +27,7 @@
 
 #include "py/builtin.h"
 #include "py/objlist.h"
+#include "py/objmodule.h"
 #include "py/objtuple.h"
 #include "py/objstr.h"
 #include "py/objint.h"
@@ -35,6 +36,8 @@
 #include "py/smallint.h"
 #include "py/runtime.h"
 #include "py/persistentcode.h"
+#include "extmod/moduplatform.h"
+#include "genhdr/mpversion.h"
 
 #if MICROPY_PY_SYS_SETTRACE
 #include "py/objmodule.h"
@@ -53,7 +56,7 @@ const mp_print_t mp_sys_stdout_print = {&mp_sys_stdout_obj, mp_stream_write_adap
 #endif
 
 // version - Python language version that this implementation conforms to, as a string
-STATIC const MP_DEFINE_STR_OBJ(mp_sys_version_obj, "3.4.0");
+STATIC const MP_DEFINE_STR_OBJ(mp_sys_version_obj, "3.4.0; " MICROPY_BANNER_NAME_AND_VERSION);
 
 // version_info - Python language version that this implementation conforms to, as a tuple of ints
 #define I(n) MP_OBJ_NEW_SMALL_INT(n)
@@ -67,34 +70,38 @@ STATIC const mp_obj_tuple_t mp_sys_implementation_version_info_obj = {
     3,
     { I(MICROPY_VERSION_MAJOR), I(MICROPY_VERSION_MINOR), I(MICROPY_VERSION_MICRO) }
 };
+STATIC const MP_DEFINE_STR_OBJ(mp_sys_implementation_machine_obj, MICROPY_BANNER_MACHINE);
 #if MICROPY_PERSISTENT_CODE_LOAD
 #define SYS_IMPLEMENTATION_ELEMS \
     MP_ROM_QSTR(MP_QSTR_micropython), \
     MP_ROM_PTR(&mp_sys_implementation_version_info_obj), \
+    MP_ROM_PTR(&mp_sys_implementation_machine_obj), \
     MP_ROM_INT(MPY_FILE_HEADER_INT)
 #else
 #define SYS_IMPLEMENTATION_ELEMS \
     MP_ROM_QSTR(MP_QSTR_micropython), \
-    MP_ROM_PTR(&mp_sys_implementation_version_info_obj)
+    MP_ROM_PTR(&mp_sys_implementation_version_info_obj), \
+    MP_ROM_PTR(&mp_sys_implementation_machine_obj)
 #endif
 #if MICROPY_PY_ATTRTUPLE
 STATIC const qstr impl_fields[] = {
     MP_QSTR_name,
     MP_QSTR_version,
+    MP_QSTR__machine,
     #if MICROPY_PERSISTENT_CODE_LOAD
-    MP_QSTR_mpy,
+    MP_QSTR__mpy,
     #endif
 };
 STATIC MP_DEFINE_ATTRTUPLE(
     mp_sys_implementation_obj,
     impl_fields,
-    2 + MICROPY_PERSISTENT_CODE_LOAD,
+    3 + MICROPY_PERSISTENT_CODE_LOAD,
     SYS_IMPLEMENTATION_ELEMS
     );
 #else
 STATIC const mp_rom_obj_tuple_t mp_sys_implementation_obj = {
     {&mp_type_tuple},
-    2 + MICROPY_PERSISTENT_CODE_LOAD,
+    3 + MICROPY_PERSISTENT_CODE_LOAD,
     {
         SYS_IMPLEMENTATION_ELEMS
     }
@@ -175,13 +182,32 @@ STATIC MP_DEFINE_CONST_FUN_OBJ_1(mp_sys_atexit_obj, mp_sys_atexit);
 #endif
 
 #if MICROPY_PY_SYS_SETTRACE
-// settrace(tracefunc): Set the system’s trace function.
+// settrace(tracefunc): Set the system's trace function.
 STATIC mp_obj_t mp_sys_settrace(mp_obj_t obj) {
     return mp_prof_settrace(obj);
 }
 MP_DEFINE_CONST_FUN_OBJ_1(mp_sys_settrace_obj, mp_sys_settrace);
 #endif // MICROPY_PY_SYS_SETTRACE
 
+#if MICROPY_PY_SYS_ATTR_DELEGATION
+STATIC const uint16_t sys_mutable_keys[] = {
+    #if MICROPY_PY_SYS_PS1_PS2
+    MP_QSTR_ps1,
+    MP_QSTR_ps2,
+    #endif
+    #if MICROPY_PY_SYS_TRACEBACKLIMIT
+    MP_QSTR_tracebacklimit,
+    #endif
+    MP_QSTRnull,
+};
+
+STATIC void mp_module_sys_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) {
+    MP_STATIC_ASSERT(MP_ARRAY_SIZE(sys_mutable_keys) == MP_SYS_MUTABLE_NUM + 1);
+    MP_STATIC_ASSERT(MP_ARRAY_SIZE(MP_STATE_VM(sys_mutable)) == MP_SYS_MUTABLE_NUM);
+    mp_module_generic_attr(attr, dest, sys_mutable_keys, MP_STATE_VM(sys_mutable));
+}
+#endif
+
 STATIC const mp_rom_map_elem_t mp_module_sys_globals_table[] = {
     { MP_ROM_QSTR(MP_QSTR___name__), MP_ROM_QSTR(MP_QSTR_sys) },
 
@@ -244,6 +270,11 @@ STATIC const mp_rom_map_elem_t mp_module_sys_globals_table[] = {
     #if MICROPY_PY_SYS_ATEXIT
     { MP_ROM_QSTR(MP_QSTR_atexit), MP_ROM_PTR(&mp_sys_atexit_obj) },
     #endif
+
+    #if MICROPY_PY_SYS_ATTR_DELEGATION
+    // Delegation of attr lookup.
+    MP_MODULE_ATTR_DELEGATION_ENTRY(&mp_module_sys_attr),
+    #endif
 };
 
 STATIC MP_DEFINE_CONST_DICT(mp_module_sys_globals, mp_module_sys_globals_table);
@@ -253,4 +284,6 @@ const mp_obj_module_t mp_module_sys = {
     .globals = (mp_obj_dict_t *)&mp_module_sys_globals,
 };
 
+MP_REGISTER_MODULE(MP_QSTR_usys, mp_module_sys);
+
 #endif
diff --git a/python/src/py/modthread.c b/python/src/py/modthread.c
index 29b765493..bad94fbf2 100644
--- a/python/src/py/modthread.c
+++ b/python/src/py/modthread.c
@@ -54,8 +54,7 @@ typedef struct _mp_obj_thread_lock_t {
 } mp_obj_thread_lock_t;
 
 STATIC mp_obj_thread_lock_t *mp_obj_new_thread_lock(void) {
-    mp_obj_thread_lock_t *self = m_new_obj(mp_obj_thread_lock_t);
-    self->base.type = &mp_type_thread_lock;
+    mp_obj_thread_lock_t *self = mp_obj_malloc(mp_obj_thread_lock_t, &mp_type_thread_lock);
     mp_thread_mutex_init(&self->mutex);
     self->locked = false;
     return self;
@@ -301,4 +300,6 @@ const mp_obj_module_t mp_module_thread = {
     .globals = (mp_obj_dict_t *)&mp_module_thread_globals,
 };
 
+MP_REGISTER_MODULE(MP_QSTR__thread, mp_module_thread);
+
 #endif // MICROPY_PY_THREAD
diff --git a/python/src/py/moduerrno.c b/python/src/py/moduerrno.c
index d9affd9b2..1b16fd9d9 100644
--- a/python/src/py/moduerrno.c
+++ b/python/src/py/moduerrno.c
@@ -99,6 +99,8 @@ const mp_obj_module_t mp_module_uerrno = {
     .globals = (mp_obj_dict_t *)&mp_module_uerrno_globals,
 };
 
+MP_REGISTER_MODULE(MP_QSTR_uerrno, mp_module_uerrno);
+
 qstr mp_errno_to_str(mp_obj_t errno_val) {
     #if MICROPY_PY_UERRNO_ERRORCODE
     // We have the errorcode dict so can do a lookup using the hash map
diff --git a/python/src/py/mpconfig.h b/python/src/py/mpconfig.h
index 680b43d62..d70d39ae9 100644
--- a/python/src/py/mpconfig.h
+++ b/python/src/py/mpconfig.h
@@ -28,8 +28,8 @@
 
 // Current version of MicroPython
 #define MICROPY_VERSION_MAJOR 1
-#define MICROPY_VERSION_MINOR 17
-#define MICROPY_VERSION_MICRO 0
+#define MICROPY_VERSION_MINOR 19
+#define MICROPY_VERSION_MICRO 1
 
 // Combined version as a 32-bit number for convenience
 #define MICROPY_VERSION ( \
@@ -62,6 +62,31 @@
 #include <mpconfigport.h>
 #endif
 
+// Disable all optional features (i.e. minimal port).
+#define MICROPY_CONFIG_ROM_LEVEL_MINIMUM (0)
+// Only enable core features (constrained flash, e.g. STM32L072)
+#define MICROPY_CONFIG_ROM_LEVEL_CORE_FEATURES (10)
+// Enable most common features (small on-device flash, e.g. STM32F411)
+#define MICROPY_CONFIG_ROM_LEVEL_BASIC_FEATURES (20)
+// Enable convenience features (medium on-device flash, e.g. STM32F405)
+#define MICROPY_CONFIG_ROM_LEVEL_EXTRA_FEATURES (30)
+// Enable all common features (large/external flash, rp2, unix)
+#define MICROPY_CONFIG_ROM_LEVEL_FULL_FEATURES (40)
+// Enable everything (e.g. coverage)
+#define MICROPY_CONFIG_ROM_LEVEL_EVERYTHING (50)
+
+// Ports/boards should set this, but default to level=core.
+#ifndef MICROPY_CONFIG_ROM_LEVEL
+#define MICROPY_CONFIG_ROM_LEVEL (MICROPY_CONFIG_ROM_LEVEL_CORE_FEATURES)
+#endif
+
+// Helper macros for "have at least this level".
+#define MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES (MICROPY_CONFIG_ROM_LEVEL >= MICROPY_CONFIG_ROM_LEVEL_CORE_FEATURES)
+#define MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_BASIC_FEATURES (MICROPY_CONFIG_ROM_LEVEL >= MICROPY_CONFIG_ROM_LEVEL_BASIC_FEATURES)
+#define MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES (MICROPY_CONFIG_ROM_LEVEL >= MICROPY_CONFIG_ROM_LEVEL_EXTRA_FEATURES)
+#define MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_FULL_FEATURES (MICROPY_CONFIG_ROM_LEVEL >= MICROPY_CONFIG_ROM_LEVEL_FULL_FEATURES)
+#define MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EVERYTHING (MICROPY_CONFIG_ROM_LEVEL >= MICROPY_CONFIG_ROM_LEVEL_EVERYTHING)
+
 // Any options not explicitly set in mpconfigport.h will get default
 // values below.
 
@@ -113,6 +138,13 @@
 #define MICROPY_OBJ_REPR (MICROPY_OBJ_REPR_A)
 #endif
 
+// Whether to encode None/False/True as immediate objects instead of pointers to
+// real objects.  Reduces code size by a decent amount without hurting
+// performance, for all representations except D on some architectures.
+#ifndef MICROPY_OBJ_IMMEDIATE_OBJS
+#define MICROPY_OBJ_IMMEDIATE_OBJS (MICROPY_OBJ_REPR != MICROPY_OBJ_REPR_D)
+#endif
+
 /*****************************************************************************/
 /* Memory allocation policy                                                  */
 
@@ -149,7 +181,7 @@
 // Support automatic GC when reaching allocation threshold,
 // configurable by gc.threshold().
 #ifndef MICROPY_GC_ALLOC_THRESHOLD
-#define MICROPY_GC_ALLOC_THRESHOLD (1)
+#define MICROPY_GC_ALLOC_THRESHOLD (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Number of bytes to allocate initially when creating new chunks to store
@@ -242,7 +274,11 @@
 
 // Number of bytes used to store qstr hash
 #ifndef MICROPY_QSTR_BYTES_IN_HASH
+#if MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES
 #define MICROPY_QSTR_BYTES_IN_HASH (2)
+#else
+#define MICROPY_QSTR_BYTES_IN_HASH (1)
+#endif
 #endif
 
 // Avoid using C stack when making Python function calls. C stack still
@@ -292,6 +328,14 @@
 #define MICROPY_PERSISTENT_CODE (MICROPY_PERSISTENT_CODE_LOAD || MICROPY_PERSISTENT_CODE_SAVE || MICROPY_MODULE_FROZEN_MPY)
 #endif
 
+// Whether bytecode uses a qstr_table to map internal qstr indices in the bytecode
+// to global qstr values in the runtime (behaviour when feature is enabled), or
+// just stores global qstr values directly in the bytecode.  This must be enabled
+// if MICROPY_PERSISTENT_CODE is enabled.
+#ifndef MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+#define MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE (MICROPY_PERSISTENT_CODE)
+#endif
+
 // Whether to emit x64 native code
 #ifndef MICROPY_EMIT_X64
 #define MICROPY_EMIT_X64 (0)
@@ -317,11 +361,6 @@
 #define MICROPY_EMIT_INLINE_THUMB (0)
 #endif
 
-// Whether to enable ARMv7-M instruction support in the Thumb2 inline assembler
-#ifndef MICROPY_EMIT_INLINE_THUMB_ARMV7M
-#define MICROPY_EMIT_INLINE_THUMB_ARMV7M (1)
-#endif
-
 // Whether to enable float support in the Thumb2 inline assembler
 #ifndef MICROPY_EMIT_INLINE_THUMB_FLOAT
 #define MICROPY_EMIT_INLINE_THUMB_FLOAT (1)
@@ -350,8 +389,10 @@
 // Convenience definition for whether any native emitter is enabled
 #define MICROPY_EMIT_NATIVE (MICROPY_EMIT_X64 || MICROPY_EMIT_X86 || MICROPY_EMIT_THUMB || MICROPY_EMIT_ARM || MICROPY_EMIT_XTENSA || MICROPY_EMIT_XTENSAWIN)
 
-// Select prelude-as-bytes-object for certain emitters
-#define MICROPY_EMIT_NATIVE_PRELUDE_AS_BYTES_OBJ (MICROPY_EMIT_XTENSAWIN)
+// Some architectures cannot read byte-wise from executable memory.  In this case
+// the prelude for a native function (which usually sits after the machine code)
+// must be separated and placed somewhere where it can be read byte-wise.
+#define MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE (MICROPY_EMIT_XTENSAWIN)
 
 // Convenience definition for whether any inline assembler emitter is enabled
 #define MICROPY_EMIT_INLINE_ASM (MICROPY_EMIT_INLINE_THUMB || MICROPY_EMIT_INLINE_XTENSA)
@@ -376,7 +417,7 @@
 
 // Whether to include the compiler
 #ifndef MICROPY_ENABLE_COMPILER
-#define MICROPY_ENABLE_COMPILER (1)
+#define MICROPY_ENABLE_COMPILER (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether the compiler is dynamically configurable (ie at runtime)
@@ -385,51 +426,48 @@
 #define MICROPY_DYNAMIC_COMPILER (0)
 #endif
 
-// Configure dynamic compiler macros
-#if MICROPY_DYNAMIC_COMPILER
-#define MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE_DYNAMIC (mp_dynamic_compiler.opt_cache_map_lookup_in_bytecode)
-#define MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC (mp_dynamic_compiler.py_builtins_str_unicode)
-#else
-#define MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE_DYNAMIC MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE
-#define MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC MICROPY_PY_BUILTINS_STR_UNICODE
-#endif
-
 // Whether to enable constant folding; eg 1+2 rewritten as 3
 #ifndef MICROPY_COMP_CONST_FOLDING
-#define MICROPY_COMP_CONST_FOLDING (1)
+#define MICROPY_COMP_CONST_FOLDING (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
+#endif
+
+// Whether to compile constant tuples immediately to their respective objects; eg (1, True)
+// Otherwise the tuple will be built at runtime
+#ifndef MICROPY_COMP_CONST_TUPLE
+#define MICROPY_COMP_CONST_TUPLE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to enable optimisations for constant literals, eg OrderedDict
 #ifndef MICROPY_COMP_CONST_LITERAL
-#define MICROPY_COMP_CONST_LITERAL (1)
+#define MICROPY_COMP_CONST_LITERAL (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to enable lookup of constants in modules; eg module.CONST
 #ifndef MICROPY_COMP_MODULE_CONST
-#define MICROPY_COMP_MODULE_CONST (0)
+#define MICROPY_COMP_MODULE_CONST (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to enable constant optimisation; id = const(value)
 #ifndef MICROPY_COMP_CONST
-#define MICROPY_COMP_CONST (1)
+#define MICROPY_COMP_CONST (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to enable optimisation of: a, b = c, d
 // Costs 124 bytes (Thumb2)
 #ifndef MICROPY_COMP_DOUBLE_TUPLE_ASSIGN
-#define MICROPY_COMP_DOUBLE_TUPLE_ASSIGN (1)
+#define MICROPY_COMP_DOUBLE_TUPLE_ASSIGN (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to enable optimisation of: a, b, c = d, e, f
 // Requires MICROPY_COMP_DOUBLE_TUPLE_ASSIGN and costs 68 bytes (Thumb2)
 #ifndef MICROPY_COMP_TRIPLE_TUPLE_ASSIGN
-#define MICROPY_COMP_TRIPLE_TUPLE_ASSIGN (0)
+#define MICROPY_COMP_TRIPLE_TUPLE_ASSIGN (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to enable optimisation of: return a if b else c
 // Costs about 80 bytes (Thumb2) and saves 2 bytes of bytecode for each use
 #ifndef MICROPY_COMP_RETURN_IF_EXPR
-#define MICROPY_COMP_RETURN_IF_EXPR (0)
+#define MICROPY_COMP_RETURN_IF_EXPR (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 /*****************************************************************************/
@@ -484,23 +522,36 @@
 #define MICROPY_OPT_COMPUTED_GOTO (0)
 #endif
 
-// Whether to cache result of map lookups in LOAD_NAME, LOAD_GLOBAL, LOAD_ATTR,
-// STORE_ATTR bytecodes.  Uses 1 byte extra RAM for each of these opcodes and
-// uses a bit of extra code ROM, but greatly improves lookup speed.
-#ifndef MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE
-#define MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE (0)
+// Optimise the fast path for loading attributes from instance types. Increases
+// Thumb2 code size by about 48 bytes.
+#ifndef MICROPY_OPT_LOAD_ATTR_FAST_PATH
+#define MICROPY_OPT_LOAD_ATTR_FAST_PATH (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
+#endif
+
+// Use extra RAM to cache map lookups by remembering the likely location of
+// the index. Avoids the hash computation on unordered maps, and avoids the
+// linear search on ordered (especially in-ROM) maps. Can provide a +10-15%
+// performance improvement on benchmarks involving lots of attribute access
+// or dictionary lookup.
+#ifndef MICROPY_OPT_MAP_LOOKUP_CACHE
+#define MICROPY_OPT_MAP_LOOKUP_CACHE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
+#endif
+
+// How much RAM (in bytes) to use for the map lookup cache.
+#ifndef MICROPY_OPT_MAP_LOOKUP_CACHE_SIZE
+#define MICROPY_OPT_MAP_LOOKUP_CACHE_SIZE (128)
 #endif
 
 // Whether to use fast versions of bitwise operations (and, or, xor) when the
 // arguments are both positive.  Increases Thumb2 code size by about 250 bytes.
 #ifndef MICROPY_OPT_MPZ_BITWISE
-#define MICROPY_OPT_MPZ_BITWISE (0)
+#define MICROPY_OPT_MPZ_BITWISE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 
 // Whether math.factorial is large, fast and recursive (1) or small and slow (0).
 #ifndef MICROPY_OPT_MATH_FACTORIAL
-#define MICROPY_OPT_MATH_FACTORIAL (0)
+#define MICROPY_OPT_MATH_FACTORIAL (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 /*****************************************************************************/
@@ -510,7 +561,7 @@
 // When disabled, only importing of built-in modules is supported
 // When enabled, a port must implement mp_import_stat (among other things)
 #ifndef MICROPY_ENABLE_EXTERNAL_IMPORT
-#define MICROPY_ENABLE_EXTERNAL_IMPORT (1)
+#define MICROPY_ENABLE_EXTERNAL_IMPORT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to use the POSIX reader for importing files
@@ -555,9 +606,19 @@
 #define MICROPY_ENABLE_GC (0)
 #endif
 
+// Hook to run code during time consuming garbage collector operations
+#ifndef MICROPY_GC_HOOK_LOOP
+#define MICROPY_GC_HOOK_LOOP
+#endif
+
+// Whether to provide m_tracked_calloc, m_tracked_free functions
+#ifndef MICROPY_TRACKED_ALLOC
+#define MICROPY_TRACKED_ALLOC (0)
+#endif
+
 // Whether to enable finalisers in the garbage collector (ie call __del__)
 #ifndef MICROPY_ENABLE_FINALISER
-#define MICROPY_ENABLE_FINALISER (0)
+#define MICROPY_ENABLE_FINALISER (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to enable a separate allocator for the Python stack.
@@ -574,7 +635,7 @@
 // Whether to check C stack usage. C stack used for calling Python functions,
 // etc. Not checking means segfault on overflow.
 #ifndef MICROPY_STACK_CHECK
-#define MICROPY_STACK_CHECK (0)
+#define MICROPY_STACK_CHECK (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to have an emergency exception buffer
@@ -589,7 +650,7 @@
 
 // Whether to provide the mp_kbd_exception object, and micropython.kbd_intr function
 #ifndef MICROPY_KBD_EXCEPTION
-#define MICROPY_KBD_EXCEPTION (0)
+#define MICROPY_KBD_EXCEPTION (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Prefer to raise KeyboardInterrupt asynchronously (from signal or interrupt
@@ -600,7 +661,7 @@
 
 // Whether to include REPL helper function
 #ifndef MICROPY_HELPER_REPL
-#define MICROPY_HELPER_REPL (0)
+#define MICROPY_HELPER_REPL (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Allow enabling debug prints after each REPL line
@@ -610,7 +671,7 @@
 
 // Whether to include emacs-style readline behavior in REPL
 #ifndef MICROPY_REPL_EMACS_KEYS
-#define MICROPY_REPL_EMACS_KEYS (0)
+#define MICROPY_REPL_EMACS_KEYS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to include emacs-style word movement/kill readline behavior in REPL.
@@ -630,7 +691,7 @@
 
 // Whether to implement auto-indent in REPL
 #ifndef MICROPY_REPL_AUTO_INDENT
-#define MICROPY_REPL_AUTO_INDENT (0)
+#define MICROPY_REPL_AUTO_INDENT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether port requires event-driven REPL functions
@@ -659,7 +720,7 @@ typedef long long mp_longint_impl_t;
 // Whether to include information in the byte code to determine source
 // line number (increases RAM usage, but doesn't slow byte code execution)
 #ifndef MICROPY_ENABLE_SOURCE_LINE
-#define MICROPY_ENABLE_SOURCE_LINE (0)
+#define MICROPY_ENABLE_SOURCE_LINE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to include doc strings (increases RAM usage)
@@ -677,7 +738,13 @@ typedef long long mp_longint_impl_t;
 #define MICROPY_ERROR_REPORTING_DETAILED (3)
 
 #ifndef MICROPY_ERROR_REPORTING
+#if MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_FULL_FEATURES
+#define MICROPY_ERROR_REPORTING (MICROPY_ERROR_REPORTING_DETAILED)
+#elif MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES
 #define MICROPY_ERROR_REPORTING (MICROPY_ERROR_REPORTING_NORMAL)
+#else
+#define MICROPY_ERROR_REPORTING (MICROPY_ERROR_REPORTING_TERSE)
+#endif
 #endif
 
 // Whether issue warnings during compiling/execution
@@ -733,7 +800,7 @@ typedef double mp_float_t;
 // TODO: Originally intended as generic category to not
 // add bunch of once-off options. May need refactoring later
 #ifndef MICROPY_CPYTHON_COMPAT
-#define MICROPY_CPYTHON_COMPAT (1)
+#define MICROPY_CPYTHON_COMPAT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Perform full checks as done by CPython. Disabling this
@@ -742,12 +809,12 @@ typedef double mp_float_t;
 // grave issues (in other words, only user app should be,
 // affected, not system).
 #ifndef MICROPY_FULL_CHECKS
-#define MICROPY_FULL_CHECKS (1)
+#define MICROPY_FULL_CHECKS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether POSIX-semantics non-blocking streams are supported
 #ifndef MICROPY_STREAMS_NON_BLOCK
-#define MICROPY_STREAMS_NON_BLOCK (0)
+#define MICROPY_STREAMS_NON_BLOCK (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide stream functions with POSIX-like signatures
@@ -756,19 +823,31 @@ typedef double mp_float_t;
 #define MICROPY_STREAMS_POSIX_API (0)
 #endif
 
+// Whether modules can use MP_MODULE_ATTR_DELEGATION_ENTRY() to delegate failed
+// attribute lookups.
+#ifndef MICROPY_MODULE_ATTR_DELEGATION
+#define MICROPY_MODULE_ATTR_DELEGATION (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
+#endif
+
 // Whether to call __init__ when importing builtin modules for the first time
 #ifndef MICROPY_MODULE_BUILTIN_INIT
-#define MICROPY_MODULE_BUILTIN_INIT (0)
+#define MICROPY_MODULE_BUILTIN_INIT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support module-level __getattr__ (see PEP 562)
 #ifndef MICROPY_MODULE_GETATTR
-#define MICROPY_MODULE_GETATTR (1)
+#define MICROPY_MODULE_GETATTR (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether module weak links are supported
 #ifndef MICROPY_MODULE_WEAK_LINKS
-#define MICROPY_MODULE_WEAK_LINKS (0)
+#define MICROPY_MODULE_WEAK_LINKS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
+#endif
+
+// Whether to enable importing foo.py with __name__ set to '__main__'
+// Used by the unix port for the -m flag.
+#ifndef MICROPY_MODULE_OVERRIDE_MAIN_IMPORT
+#define MICROPY_MODULE_OVERRIDE_MAIN_IMPORT (0)
 #endif
 
 // Whether frozen modules are supported in the form of strings
@@ -788,7 +867,7 @@ typedef double mp_float_t;
 
 // Whether you can override builtins in the builtins module
 #ifndef MICROPY_CAN_OVERRIDE_BUILTINS
-#define MICROPY_CAN_OVERRIDE_BUILTINS (0)
+#define MICROPY_CAN_OVERRIDE_BUILTINS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to check that the "self" argument of a builtin method has the
@@ -797,7 +876,7 @@ typedef double mp_float_t;
 // list.append([], 1).  Without this check such calls will have undefined
 // behaviour (usually segfault) if the first argument is the wrong type.
 #ifndef MICROPY_BUILTIN_METHOD_CHECK_SELF_ARG
-#define MICROPY_BUILTIN_METHOD_CHECK_SELF_ARG (1)
+#define MICROPY_BUILTIN_METHOD_CHECK_SELF_ARG (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to use internally defined errno's (otherwise system provided ones)
@@ -812,7 +891,12 @@ typedef double mp_float_t;
 
 // Support for internal scheduler
 #ifndef MICROPY_ENABLE_SCHEDULER
-#define MICROPY_ENABLE_SCHEDULER (0)
+#define MICROPY_ENABLE_SCHEDULER (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
+#endif
+
+// Whether the scheduler supports scheduling static nodes with C callbacks
+#ifndef MICROPY_SCHEDULER_STATIC_NODES
+#define MICROPY_SCHEDULER_STATIC_NODES (0)
 #endif
 
 // Maximum number of entries in the scheduler
@@ -842,41 +926,41 @@ typedef double mp_float_t;
 // inheritance makes some C functions inherently recursive, and adds a bit of
 // code overhead.
 #ifndef MICROPY_MULTIPLE_INHERITANCE
-#define MICROPY_MULTIPLE_INHERITANCE (1)
+#define MICROPY_MULTIPLE_INHERITANCE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to implement attributes on functions
 #ifndef MICROPY_PY_FUNCTION_ATTRS
-#define MICROPY_PY_FUNCTION_ATTRS (0)
+#define MICROPY_PY_FUNCTION_ATTRS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support the descriptors __get__, __set__, __delete__
 // This costs some code size and makes load/store/delete of instance
 // attributes slower for the classes that use this feature
 #ifndef MICROPY_PY_DESCRIPTORS
-#define MICROPY_PY_DESCRIPTORS (0)
+#define MICROPY_PY_DESCRIPTORS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support class __delattr__ and __setattr__ methods
 // This costs some code size and makes store/delete of instance
 // attributes slower for the classes that use this feature
 #ifndef MICROPY_PY_DELATTR_SETATTR
-#define MICROPY_PY_DELATTR_SETATTR (0)
+#define MICROPY_PY_DELATTR_SETATTR (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Support for async/await/async for/async with
 #ifndef MICROPY_PY_ASYNC_AWAIT
-#define MICROPY_PY_ASYNC_AWAIT (1)
+#define MICROPY_PY_ASYNC_AWAIT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Support for literal string interpolation, f-strings (see PEP 498, Python 3.6+)
 #ifndef MICROPY_PY_FSTRINGS
-#define MICROPY_PY_FSTRINGS (0)
+#define MICROPY_PY_FSTRINGS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Support for assignment expressions with := (see PEP 572, Python 3.8+)
 #ifndef MICROPY_PY_ASSIGN_EXPR
-#define MICROPY_PY_ASSIGN_EXPR (1)
+#define MICROPY_PY_ASSIGN_EXPR (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Non-standard .pend_throw() method for generators, allowing for
@@ -885,7 +969,7 @@ typedef double mp_float_t;
 // to generator's .send() or .__next__(). (This is useful to implement
 // async schedulers.)
 #ifndef MICROPY_PY_GENERATOR_PEND_THROW
-#define MICROPY_PY_GENERATOR_PEND_THROW (1)
+#define MICROPY_PY_GENERATOR_PEND_THROW (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Issue a warning when comparing str and bytes objects
@@ -895,7 +979,7 @@ typedef double mp_float_t;
 
 // Whether str object is proper unicode
 #ifndef MICROPY_PY_BUILTINS_STR_UNICODE
-#define MICROPY_PY_BUILTINS_STR_UNICODE (0)
+#define MICROPY_PY_BUILTINS_STR_UNICODE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to check for valid UTF-8 when converting bytes to str
@@ -905,42 +989,42 @@ typedef double mp_float_t;
 
 // Whether str.center() method provided
 #ifndef MICROPY_PY_BUILTINS_STR_CENTER
-#define MICROPY_PY_BUILTINS_STR_CENTER (0)
+#define MICROPY_PY_BUILTINS_STR_CENTER (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether str.count() method provided
 #ifndef MICROPY_PY_BUILTINS_STR_COUNT
-#define MICROPY_PY_BUILTINS_STR_COUNT (1)
+#define MICROPY_PY_BUILTINS_STR_COUNT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether str % (...) formatting operator provided
 #ifndef MICROPY_PY_BUILTINS_STR_OP_MODULO
-#define MICROPY_PY_BUILTINS_STR_OP_MODULO (1)
+#define MICROPY_PY_BUILTINS_STR_OP_MODULO (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether str.partition()/str.rpartition() method provided
 #ifndef MICROPY_PY_BUILTINS_STR_PARTITION
-#define MICROPY_PY_BUILTINS_STR_PARTITION (0)
+#define MICROPY_PY_BUILTINS_STR_PARTITION (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether str.splitlines() method provided
 #ifndef MICROPY_PY_BUILTINS_STR_SPLITLINES
-#define MICROPY_PY_BUILTINS_STR_SPLITLINES (0)
+#define MICROPY_PY_BUILTINS_STR_SPLITLINES (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support bytearray object
 #ifndef MICROPY_PY_BUILTINS_BYTEARRAY
-#define MICROPY_PY_BUILTINS_BYTEARRAY (1)
+#define MICROPY_PY_BUILTINS_BYTEARRAY (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to support dict.fromkeys() class method
 #ifndef MICROPY_PY_BUILTINS_DICT_FROMKEYS
-#define MICROPY_PY_BUILTINS_DICT_FROMKEYS (1)
+#define MICROPY_PY_BUILTINS_DICT_FROMKEYS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to support memoryview object
 #ifndef MICROPY_PY_BUILTINS_MEMORYVIEW
-#define MICROPY_PY_BUILTINS_MEMORYVIEW (0)
+#define MICROPY_PY_BUILTINS_MEMORYVIEW (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support memoryview.itemsize attribute
@@ -950,39 +1034,39 @@ typedef double mp_float_t;
 
 // Whether to support set object
 #ifndef MICROPY_PY_BUILTINS_SET
-#define MICROPY_PY_BUILTINS_SET (1)
+#define MICROPY_PY_BUILTINS_SET (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to support slice subscript operators and slice object
 #ifndef MICROPY_PY_BUILTINS_SLICE
-#define MICROPY_PY_BUILTINS_SLICE (1)
+#define MICROPY_PY_BUILTINS_SLICE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to support slice attribute read access,
 // i.e. slice.start, slice.stop, slice.step
 #ifndef MICROPY_PY_BUILTINS_SLICE_ATTRS
-#define MICROPY_PY_BUILTINS_SLICE_ATTRS (0)
+#define MICROPY_PY_BUILTINS_SLICE_ATTRS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support the .indices(len) method on slice objects
 #ifndef MICROPY_PY_BUILTINS_SLICE_INDICES
-#define MICROPY_PY_BUILTINS_SLICE_INDICES (0)
+#define MICROPY_PY_BUILTINS_SLICE_INDICES (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support frozenset object
 #ifndef MICROPY_PY_BUILTINS_FROZENSET
-#define MICROPY_PY_BUILTINS_FROZENSET (0)
+#define MICROPY_PY_BUILTINS_FROZENSET (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support property object
 #ifndef MICROPY_PY_BUILTINS_PROPERTY
-#define MICROPY_PY_BUILTINS_PROPERTY (1)
+#define MICROPY_PY_BUILTINS_PROPERTY (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to implement the start/stop/step attributes (readback) on
 // the "range" builtin type. Rarely used, and costs ~60 bytes (x86).
 #ifndef MICROPY_PY_BUILTINS_RANGE_ATTRS
-#define MICROPY_PY_BUILTINS_RANGE_ATTRS (1)
+#define MICROPY_PY_BUILTINS_RANGE_ATTRS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to support binary ops [only (in)equality is defined] between range
@@ -1000,7 +1084,7 @@ typedef double mp_float_t;
 
 // Whether to support rounding of integers (incl bignum); eg round(123,-1)=120
 #ifndef MICROPY_PY_BUILTINS_ROUND_INT
-#define MICROPY_PY_BUILTINS_ROUND_INT (0)
+#define MICROPY_PY_BUILTINS_ROUND_INT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support complete set of special methods for user
@@ -1009,7 +1093,7 @@ typedef double mp_float_t;
 // "Reverse" methods are controlled by
 // MICROPY_PY_REVERSE_SPECIAL_METHODS below.
 #ifndef MICROPY_PY_ALL_SPECIAL_METHODS
-#define MICROPY_PY_ALL_SPECIAL_METHODS (0)
+#define MICROPY_PY_ALL_SPECIAL_METHODS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support all inplace arithmetic operarion methods
@@ -1022,17 +1106,17 @@ typedef double mp_float_t;
 // (__radd__, etc.). Additionally gated by
 // MICROPY_PY_ALL_SPECIAL_METHODS.
 #ifndef MICROPY_PY_REVERSE_SPECIAL_METHODS
-#define MICROPY_PY_REVERSE_SPECIAL_METHODS (0)
+#define MICROPY_PY_REVERSE_SPECIAL_METHODS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support compile function
 #ifndef MICROPY_PY_BUILTINS_COMPILE
-#define MICROPY_PY_BUILTINS_COMPILE (0)
+#define MICROPY_PY_BUILTINS_COMPILE (MICROPY_ENABLE_COMPILER && MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support enumerate function(type)
 #ifndef MICROPY_PY_BUILTINS_ENUMERATE
-#define MICROPY_PY_BUILTINS_ENUMERATE (1)
+#define MICROPY_PY_BUILTINS_ENUMERATE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to support eval and exec functions
@@ -1043,43 +1127,43 @@ typedef double mp_float_t;
 
 // Whether to support the Python 2 execfile function
 #ifndef MICROPY_PY_BUILTINS_EXECFILE
-#define MICROPY_PY_BUILTINS_EXECFILE (0)
+#define MICROPY_PY_BUILTINS_EXECFILE (MICROPY_ENABLE_COMPILER && MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support filter function(type)
 #ifndef MICROPY_PY_BUILTINS_FILTER
-#define MICROPY_PY_BUILTINS_FILTER (1)
+#define MICROPY_PY_BUILTINS_FILTER (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to support reversed function(type)
 #ifndef MICROPY_PY_BUILTINS_REVERSED
-#define MICROPY_PY_BUILTINS_REVERSED (1)
+#define MICROPY_PY_BUILTINS_REVERSED (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to define "NotImplemented" special constant
 #ifndef MICROPY_PY_BUILTINS_NOTIMPLEMENTED
-#define MICROPY_PY_BUILTINS_NOTIMPLEMENTED (0)
+#define MICROPY_PY_BUILTINS_NOTIMPLEMENTED (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide the built-in input() function. The implementation of this
 // uses shared/readline, so can only be enabled if the port uses this readline.
 #ifndef MICROPY_PY_BUILTINS_INPUT
-#define MICROPY_PY_BUILTINS_INPUT (0)
+#define MICROPY_PY_BUILTINS_INPUT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support min/max functions
 #ifndef MICROPY_PY_BUILTINS_MIN_MAX
-#define MICROPY_PY_BUILTINS_MIN_MAX (1)
+#define MICROPY_PY_BUILTINS_MIN_MAX (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Support for calls to pow() with 3 integer arguments
 #ifndef MICROPY_PY_BUILTINS_POW3
-#define MICROPY_PY_BUILTINS_POW3 (0)
+#define MICROPY_PY_BUILTINS_POW3 (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide the help function
 #ifndef MICROPY_PY_BUILTINS_HELP
-#define MICROPY_PY_BUILTINS_HELP (0)
+#define MICROPY_PY_BUILTINS_HELP (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Use this to configure the help text shown for help().  It should be a
@@ -1090,17 +1174,17 @@ typedef double mp_float_t;
 
 // Add the ability to list the available modules when executing help('modules')
 #ifndef MICROPY_PY_BUILTINS_HELP_MODULES
-#define MICROPY_PY_BUILTINS_HELP_MODULES (0)
+#define MICROPY_PY_BUILTINS_HELP_MODULES (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to set __file__ for imported modules
 #ifndef MICROPY_PY___FILE__
-#define MICROPY_PY___FILE__ (1)
+#define MICROPY_PY___FILE__ (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to provide mem-info related functions in micropython module
 #ifndef MICROPY_PY_MICROPYTHON_MEM_INFO
-#define MICROPY_PY_MICROPYTHON_MEM_INFO (0)
+#define MICROPY_PY_MICROPYTHON_MEM_INFO (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide "micropython.stack_use" function
@@ -1117,34 +1201,34 @@ typedef double mp_float_t;
 // underlying code is shared with "bytearray" builtin type, so to
 // get real savings, it should be disabled too.
 #ifndef MICROPY_PY_ARRAY
-#define MICROPY_PY_ARRAY (1)
+#define MICROPY_PY_ARRAY (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to support slice assignments for array (and bytearray).
 // This is rarely used, but adds ~0.5K of code.
 #ifndef MICROPY_PY_ARRAY_SLICE_ASSIGN
-#define MICROPY_PY_ARRAY_SLICE_ASSIGN (0)
+#define MICROPY_PY_ARRAY_SLICE_ASSIGN (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support attrtuple type (MicroPython extension)
 // It provides space-efficient tuples with attribute access
 #ifndef MICROPY_PY_ATTRTUPLE
-#define MICROPY_PY_ATTRTUPLE (1)
+#define MICROPY_PY_ATTRTUPLE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to provide "collections" module
 #ifndef MICROPY_PY_COLLECTIONS
-#define MICROPY_PY_COLLECTIONS (1)
+#define MICROPY_PY_COLLECTIONS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to provide "ucollections.deque" type
 #ifndef MICROPY_PY_COLLECTIONS_DEQUE
-#define MICROPY_PY_COLLECTIONS_DEQUE (0)
+#define MICROPY_PY_COLLECTIONS_DEQUE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide "collections.OrderedDict" type
 #ifndef MICROPY_PY_COLLECTIONS_ORDEREDDICT
-#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (0)
+#define MICROPY_PY_COLLECTIONS_ORDEREDDICT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide the _asdict function for namedtuple
@@ -1154,22 +1238,27 @@ typedef double mp_float_t;
 
 // Whether to provide "math" module
 #ifndef MICROPY_PY_MATH
-#define MICROPY_PY_MATH (1)
+#define MICROPY_PY_MATH (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
+#endif
+
+// Whether to provide all math module constants (Python 3.5+), or just pi and e.
+#ifndef MICROPY_PY_MATH_CONSTANTS
+#define MICROPY_PY_MATH_CONSTANTS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide special math functions: math.{erf,erfc,gamma,lgamma}
 #ifndef MICROPY_PY_MATH_SPECIAL_FUNCTIONS
-#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (0)
+#define MICROPY_PY_MATH_SPECIAL_FUNCTIONS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide math.factorial function
 #ifndef MICROPY_PY_MATH_FACTORIAL
-#define MICROPY_PY_MATH_FACTORIAL (0)
+#define MICROPY_PY_MATH_FACTORIAL (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide math.isclose function
 #ifndef MICROPY_PY_MATH_ISCLOSE
-#define MICROPY_PY_MATH_ISCLOSE (0)
+#define MICROPY_PY_MATH_ISCLOSE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide fix for atan2 Inf handling.
@@ -1194,12 +1283,12 @@ typedef double mp_float_t;
 
 // Whether to provide "cmath" module
 #ifndef MICROPY_PY_CMATH
-#define MICROPY_PY_CMATH (0)
+#define MICROPY_PY_CMATH (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide "gc" module
 #ifndef MICROPY_PY_GC
-#define MICROPY_PY_GC (1)
+#define MICROPY_PY_GC (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to return number of collected objects from gc.collect()
@@ -1209,28 +1298,17 @@ typedef double mp_float_t;
 
 // Whether to provide "io" module
 #ifndef MICROPY_PY_IO
-#define MICROPY_PY_IO (1)
+#define MICROPY_PY_IO (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to provide "io.IOBase" class to support user streams
 #ifndef MICROPY_PY_IO_IOBASE
-#define MICROPY_PY_IO_IOBASE (0)
-#endif
-
-// Whether to provide "uio.resource_stream()" function with
-// the semantics of CPython's pkg_resources.resource_stream()
-// (allows to access binary resources in frozen source packages).
-// Note that the same functionality can be achieved in "pure
-// Python" by preprocessing binary resources into Python source
-// and bytecode-freezing it (with a simple helper module available
-// e.g. in micropython-lib).
-#ifndef MICROPY_PY_IO_RESOURCE_STREAM
-#define MICROPY_PY_IO_RESOURCE_STREAM (0)
+#define MICROPY_PY_IO_IOBASE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide "io.FileIO" class
 #ifndef MICROPY_PY_IO_FILEIO
-#define MICROPY_PY_IO_FILEIO (0)
+#define MICROPY_PY_IO_FILEIO (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide "io.BytesIO" class
@@ -1245,17 +1323,22 @@ typedef double mp_float_t;
 
 // Whether to provide "struct" module
 #ifndef MICROPY_PY_STRUCT
-#define MICROPY_PY_STRUCT (1)
+#define MICROPY_PY_STRUCT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
 #endif
 
 // Whether to provide "sys" module
 #ifndef MICROPY_PY_SYS
-#define MICROPY_PY_SYS (1)
+#define MICROPY_PY_SYS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_CORE_FEATURES)
+#endif
+
+// Whether to initialise "sys.path" and "sys.argv" to their defaults in mp_init()
+#ifndef MICROPY_PY_SYS_PATH_ARGV_DEFAULTS
+#define MICROPY_PY_SYS_PATH_ARGV_DEFAULTS (MICROPY_PY_SYS)
 #endif
 
 // Whether to provide "sys.maxsize" constant
 #ifndef MICROPY_PY_SYS_MAXSIZE
-#define MICROPY_PY_SYS_MAXSIZE (0)
+#define MICROPY_PY_SYS_MAXSIZE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide "sys.modules" dictionary
@@ -1279,6 +1362,11 @@ typedef double mp_float_t;
 #define MICROPY_PY_SYS_ATEXIT (0)
 #endif
 
+// Whether to provide sys.{ps1,ps2} mutable attributes, to control REPL prompts
+#ifndef MICROPY_PY_SYS_PS1_PS2
+#define MICROPY_PY_SYS_PS1_PS2 (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
+#endif
+
 // Whether to provide "sys.settrace" function
 #ifndef MICROPY_PY_SYS_SETTRACE
 #define MICROPY_PY_SYS_SETTRACE (0)
@@ -1291,18 +1379,29 @@ typedef double mp_float_t;
 
 // Whether to provide sys.{stdin,stdout,stderr} objects
 #ifndef MICROPY_PY_SYS_STDFILES
-#define MICROPY_PY_SYS_STDFILES (0)
+#define MICROPY_PY_SYS_STDFILES (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide sys.{stdin,stdout,stderr}.buffer object
 // This is implemented per-port
 #ifndef MICROPY_PY_SYS_STDIO_BUFFER
-#define MICROPY_PY_SYS_STDIO_BUFFER (0)
+#define MICROPY_PY_SYS_STDIO_BUFFER (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
+#endif
+
+// Whether to provide sys.tracebacklimit mutable attribute
+#ifndef MICROPY_PY_SYS_TRACEBACKLIMIT
+#define MICROPY_PY_SYS_TRACEBACKLIMIT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EVERYTHING)
+#endif
+
+// Whether the sys module supports attribute delegation
+// This is enabled automatically when needed by other features
+#ifndef MICROPY_PY_SYS_ATTR_DELEGATION
+#define MICROPY_PY_SYS_ATTR_DELEGATION (MICROPY_PY_SYS_PS1_PS2 || MICROPY_PY_SYS_TRACEBACKLIMIT)
 #endif
 
 // Whether to provide "uerrno" module
 #ifndef MICROPY_PY_UERRNO
-#define MICROPY_PY_UERRNO (0)
+#define MICROPY_PY_UERRNO (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide the uerrno.errorcode dict
@@ -1312,7 +1411,7 @@ typedef double mp_float_t;
 
 // Whether to provide "uselect" module (baremetal implementation)
 #ifndef MICROPY_PY_USELECT
-#define MICROPY_PY_USELECT (0)
+#define MICROPY_PY_USELECT (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to enable the select() function in the "uselect" module (baremetal
@@ -1358,11 +1457,11 @@ typedef double mp_float_t;
 // Extended modules
 
 #ifndef MICROPY_PY_UASYNCIO
-#define MICROPY_PY_UASYNCIO (0)
+#define MICROPY_PY_UASYNCIO (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 #ifndef MICROPY_PY_UCTYPES
-#define MICROPY_PY_UCTYPES (0)
+#define MICROPY_PY_UCTYPES (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to provide SHORT, INT, LONG, etc. types in addition to
@@ -1372,11 +1471,11 @@ typedef double mp_float_t;
 #endif
 
 #ifndef MICROPY_PY_UZLIB
-#define MICROPY_PY_UZLIB (0)
+#define MICROPY_PY_UZLIB (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 #ifndef MICROPY_PY_UJSON
-#define MICROPY_PY_UJSON (0)
+#define MICROPY_PY_UJSON (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to support the "separators" argument to dump, dumps
@@ -1384,8 +1483,16 @@ typedef double mp_float_t;
 #define MICROPY_PY_UJSON_SEPARATORS (1)
 #endif
 
+#ifndef MICROPY_PY_UOS
+#define MICROPY_PY_UOS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
+#endif
+
+#ifndef MICROPY_PY_UOS_STATVFS
+#define MICROPY_PY_UOS_STATVFS (MICROPY_PY_UOS)
+#endif
+
 #ifndef MICROPY_PY_URE
-#define MICROPY_PY_URE (0)
+#define MICROPY_PY_URE (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 #ifndef MICROPY_PY_URE_DEBUG
@@ -1401,20 +1508,20 @@ typedef double mp_float_t;
 #endif
 
 #ifndef MICROPY_PY_URE_SUB
-#define MICROPY_PY_URE_SUB (0)
+#define MICROPY_PY_URE_SUB (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 #ifndef MICROPY_PY_UHEAPQ
-#define MICROPY_PY_UHEAPQ (0)
+#define MICROPY_PY_UHEAPQ (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
-// Optimized heap queue for relative timestamps
+// Optimized heap queue for relative timestamps (only used by uasyncio v2)
 #ifndef MICROPY_PY_UTIMEQ
 #define MICROPY_PY_UTIMEQ (0)
 #endif
 
 #ifndef MICROPY_PY_UHASHLIB
-#define MICROPY_PY_UHASHLIB (0)
+#define MICROPY_PY_UHASHLIB (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 #ifndef MICROPY_PY_UHASHLIB_MD5
@@ -1443,21 +1550,21 @@ typedef double mp_float_t;
 #endif
 
 #ifndef MICROPY_PY_UBINASCII
-#define MICROPY_PY_UBINASCII (0)
+#define MICROPY_PY_UBINASCII (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Depends on MICROPY_PY_UZLIB
 #ifndef MICROPY_PY_UBINASCII_CRC32
-#define MICROPY_PY_UBINASCII_CRC32 (0)
+#define MICROPY_PY_UBINASCII_CRC32 (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 #ifndef MICROPY_PY_URANDOM
-#define MICROPY_PY_URANDOM (0)
+#define MICROPY_PY_URANDOM (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 // Whether to include: randrange, randint, choice, random, uniform
 #ifndef MICROPY_PY_URANDOM_EXTRA_FUNCS
-#define MICROPY_PY_URANDOM_EXTRA_FUNCS (0)
+#define MICROPY_PY_URANDOM_EXTRA_FUNCS (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 #ifndef MICROPY_PY_MACHINE
@@ -1478,13 +1585,36 @@ typedef double mp_float_t;
 #define MICROPY_PY_MACHINE_I2C (0)
 #endif
 
+// Whether the low-level I2C transfer function supports a separate write as the first transfer
+#ifndef MICROPY_PY_MACHINE_I2C_TRANSFER_WRITE1
+#define MICROPY_PY_MACHINE_I2C_TRANSFER_WRITE1 (0)
+#endif
+
+// Whether to provide the "machine.SoftI2C" class
+#ifndef MICROPY_PY_MACHINE_SOFTI2C
+#define MICROPY_PY_MACHINE_SOFTI2C (0)
+#endif
+
 #ifndef MICROPY_PY_MACHINE_SPI
 #define MICROPY_PY_MACHINE_SPI (0)
 #endif
 
+// Whether to provide the "machine.SoftSPI" class
+#ifndef MICROPY_PY_MACHINE_SOFTSPI
+#define MICROPY_PY_MACHINE_SOFTSPI (0)
+#endif
+
+// The default backlog value for socket.listen(backlog)
+#ifndef MICROPY_PY_USOCKET_LISTEN_BACKLOG_DEFAULT
+#define MICROPY_PY_USOCKET_LISTEN_BACKLOG_DEFAULT (2)
+#endif
+
 #ifndef MICROPY_PY_USSL
 #define MICROPY_PY_USSL (0)
+#endif
+
 // Whether to add finaliser code to ussl objects
+#ifndef MICROPY_PY_USSL_FINALISER
 #define MICROPY_PY_USSL_FINALISER (0)
 #endif
 
@@ -1493,13 +1623,18 @@ typedef double mp_float_t;
 #endif
 
 #ifndef MICROPY_PY_FRAMEBUF
-#define MICROPY_PY_FRAMEBUF (0)
+#define MICROPY_PY_FRAMEBUF (MICROPY_CONFIG_ROM_LEVEL_AT_LEAST_EXTRA_FEATURES)
 #endif
 
 #ifndef MICROPY_PY_BTREE
 #define MICROPY_PY_BTREE (0)
 #endif
 
+// Whether to provide the low-level "_onewire" module
+#ifndef MICROPY_PY_ONEWIRE
+#define MICROPY_PY_ONEWIRE (0)
+#endif
+
 /*****************************************************************************/
 /* Hooks for a port to add builtins                                          */
 
@@ -1508,9 +1643,10 @@ typedef double mp_float_t;
 #define MICROPY_PORT_BUILTINS
 #endif
 
-// Additional builtin module definitions - see objmodule.c:mp_builtin_module_table for format.
-#ifndef MICROPY_PORT_BUILTIN_MODULES
-#define MICROPY_PORT_BUILTIN_MODULES
+// Additional builtin function definitions for extension by command-line, boards or variants.
+// See modbuiltins.c:mp_module_builtins_globals_table for format.
+#ifndef MICROPY_PORT_EXTRA_BUILTINS
+#define MICROPY_PORT_EXTRA_BUILTINS
 #endif
 
 // Additional constant definitions for the compiler - see compile.c:mp_constants_table.
@@ -1526,6 +1662,30 @@ typedef double mp_float_t;
 /*****************************************************************************/
 /* Hooks for a port to wrap functions with attributes                        */
 
+#ifndef MICROPY_WRAP_MP_BINARY_OP
+#define MICROPY_WRAP_MP_BINARY_OP(f) f
+#endif
+
+#ifndef MICROPY_WRAP_MP_EXECUTE_BYTECODE
+#define MICROPY_WRAP_MP_EXECUTE_BYTECODE(f) f
+#endif
+
+#ifndef MICROPY_WRAP_MP_LOAD_GLOBAL
+#define MICROPY_WRAP_MP_LOAD_GLOBAL(f) f
+#endif
+
+#ifndef MICROPY_WRAP_MP_LOAD_NAME
+#define MICROPY_WRAP_MP_LOAD_NAME(f) f
+#endif
+
+#ifndef MICROPY_WRAP_MP_MAP_LOOKUP
+#define MICROPY_WRAP_MP_MAP_LOOKUP(f) f
+#endif
+
+#ifndef MICROPY_WRAP_MP_OBJ_GET_TYPE
+#define MICROPY_WRAP_MP_OBJ_GET_TYPE(f) f
+#endif
+
 #ifndef MICROPY_WRAP_MP_SCHED_EXCEPTION
 #define MICROPY_WRAP_MP_SCHED_EXCEPTION(f) f
 #endif
@@ -1549,6 +1709,20 @@ typedef double mp_float_t;
 #define MICROPY_OBJ_BASE_ALIGNMENT
 #endif
 
+// String used for the banner, and sys.version additional information
+#ifndef MICROPY_BANNER_NAME_AND_VERSION
+#define MICROPY_BANNER_NAME_AND_VERSION "MicroPython " MICROPY_GIT_TAG " on " MICROPY_BUILD_DATE
+#endif
+
+// String used for the second part of the banner, and sys.implementation._machine
+#ifndef MICROPY_BANNER_MACHINE
+#ifdef MICROPY_HW_BOARD_NAME
+#define MICROPY_BANNER_MACHINE MICROPY_HW_BOARD_NAME " with " MICROPY_HW_MCU_NAME
+#else
+#define MICROPY_BANNER_MACHINE MICROPY_PY_SYS_PLATFORM " [" MICROPY_PLATFORM_COMPILER "] version"
+#endif
+#endif
+
 // On embedded platforms, these will typically enable/disable irqs.
 #ifndef MICROPY_BEGIN_ATOMIC_SECTION
 #define MICROPY_BEGIN_ATOMIC_SECTION() (0)
diff --git a/python/src/py/mpstate.h b/python/src/py/mpstate.h
index 07335bae4..98aa9a849 100644
--- a/python/src/py/mpstate.h
+++ b/python/src/py/mpstate.h
@@ -40,12 +40,21 @@
 // memory system, runtime and virtual machine.  The state is a global
 // variable, but in the future it is hoped that the state can become local.
 
+enum { // Slot names for the mutable sys attributes; presumably used as indices into sys_mutable[] in mp_state_vm_t.
+    #if MICROPY_PY_SYS_PS1_PS2
+    MP_SYS_MUTABLE_PS1, // slot for sys.ps1
+    MP_SYS_MUTABLE_PS2, // slot for sys.ps2
+    #endif // MICROPY_PY_SYS_PS1_PS2
+    #if MICROPY_PY_SYS_TRACEBACKLIMIT
+    MP_SYS_MUTABLE_TRACEBACKLIMIT, // slot for sys.tracebacklimit
+    #endif // MICROPY_PY_SYS_TRACEBACKLIMIT
+    MP_SYS_MUTABLE_NUM, // Must stay last: equals the number of slots compiled in, and is the length of sys_mutable[].
+};
+
 // This structure contains dynamic configuration for the compiler.
 #if MICROPY_DYNAMIC_COMPILER
 typedef struct mp_dynamic_compiler_t {
     uint8_t small_int_bits; // must be <= host small_int_bits
-    bool opt_cache_map_lookup_in_bytecode;
-    bool py_builtins_str_unicode;
     uint8_t native_arch;
     uint8_t nlr_buf_num_regs;
 } mp_dynamic_compiler_t;
@@ -115,6 +124,10 @@ typedef struct _mp_state_vm_t {
 
     qstr_pool_t *last_pool;
 
+    #if MICROPY_TRACKED_ALLOC
+    struct _m_tracked_node_t *m_tracked_head;
+    #endif
+
     // non-heap memory for creating an exception if we can't allocate RAM
     mp_obj_exception_t mp_emergency_exception_obj;
 
@@ -154,10 +167,18 @@ typedef struct _mp_state_vm_t {
     // dictionary for the __main__ module
     mp_obj_dict_t dict_main;
 
-    // these two lists must be initialised per port, after the call to mp_init
+    #if MICROPY_PY_SYS
+    // If MICROPY_PY_SYS_PATH_ARGV_DEFAULTS is not enabled then these two lists
+    // must be initialised after the call to mp_init.
     mp_obj_list_t mp_sys_path_obj;
     mp_obj_list_t mp_sys_argv_obj;
 
+    #if MICROPY_PY_SYS_ATTR_DELEGATION
+    // Contains mutable sys attributes.
+    mp_obj_t sys_mutable[MP_SYS_MUTABLE_NUM];
+    #endif
+    #endif
+
     // dictionary for overridden builtins
     #if MICROPY_CAN_OVERRIDE_BUILTINS
     mp_obj_dict_t *mp_module_builtins_override_dict;
@@ -200,7 +221,7 @@ typedef struct _mp_state_vm_t {
 
     // pointer and sizes to store interned string data
     // (qstr_last_chunk can be root pointer but is also stored in qstr pool)
-    byte *qstr_last_chunk;
+    char *qstr_last_chunk;
     size_t qstr_last_alloc;
     size_t qstr_last_used;
 
@@ -223,6 +244,16 @@ typedef struct _mp_state_vm_t {
 
     #if MICROPY_ENABLE_SCHEDULER
     volatile int16_t sched_state;
+
+    #if MICROPY_SCHEDULER_STATIC_NODES
+    // These will usually point to statically allocated memory.  They are not
+    // traced by the GC.  They are assumed to be zero'd out before mp_init() is
+    // called (usually because this struct lives in the BSS).
+    struct _mp_sched_node_t *sched_head;
+    struct _mp_sched_node_t *sched_tail;
+    #endif
+
+    // These index sched_queue.
     uint8_t sched_len;
     uint8_t sched_idx;
     #endif
@@ -231,6 +262,11 @@ typedef struct _mp_state_vm_t {
     // This is a global mutex used to make the VM/runtime thread-safe.
     mp_thread_mutex_t gil_mutex;
     #endif
+
+    #if MICROPY_OPT_MAP_LOOKUP_CACHE
+    // See mp_map_lookup.
+    uint8_t map_lookup_cache[MICROPY_OPT_MAP_LOOKUP_CACHE_SIZE];
+    #endif
 } mp_state_vm_t;
 
 // This structure holds state that is specific to a given thread.
diff --git a/python/src/py/mpz.c b/python/src/py/mpz.c
index 75e1fb1fd..b61997e2f 100644
--- a/python/src/py/mpz.c
+++ b/python/src/py/mpz.c
@@ -713,6 +713,7 @@ void mpz_set(mpz_t *dest, const mpz_t *src) {
 
 void mpz_set_from_int(mpz_t *z, mp_int_t val) {
     if (val == 0) {
+        z->neg = 0;
         z->len = 0;
         return;
     }
@@ -899,10 +900,6 @@ bool mpz_is_even(const mpz_t *z) {
 #endif
 
 int mpz_cmp(const mpz_t *z1, const mpz_t *z2) {
-    // to catch comparison of -0 with +0
-    if (z1->len == 0 && z2->len == 0) {
-        return 0;
-    }
     int cmp = (int)z2->neg - (int)z1->neg;
     if (cmp != 0) {
         return cmp;
@@ -1052,7 +1049,9 @@ void mpz_neg_inpl(mpz_t *dest, const mpz_t *z) {
     if (dest != z) {
         mpz_set(dest, z);
     }
-    dest->neg = 1 - dest->neg;
+    if (dest->len) {
+        dest->neg = 1 - dest->neg;
+    }
 }
 
 /* computes dest = ~z (= -z - 1)
@@ -1148,7 +1147,7 @@ void mpz_add_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
         dest->len = mpn_sub(dest->dig, lhs->dig, lhs->len, rhs->dig, rhs->len);
     }
 
-    dest->neg = lhs->neg;
+    dest->neg = lhs->neg & !!dest->len;
 }
 
 /* computes dest = lhs - rhs
@@ -1172,7 +1171,9 @@ void mpz_sub_inpl(mpz_t *dest, const mpz_t *lhs, const mpz_t *rhs) {
         dest->len = mpn_sub(dest->dig, lhs->dig, lhs->len, rhs->dig, rhs->len);
     }
 
-    if (neg) {
+    if (dest->len == 0) {
+        dest->neg = 0;
+    } else if (neg) {
         dest->neg = 1 - lhs->neg;
     } else {
         dest->neg = lhs->neg;
@@ -1484,14 +1485,16 @@ void mpz_divmod_inpl(mpz_t *dest_quo, mpz_t *dest_rem, const mpz_t *lhs, const m
 
     mpz_need_dig(dest_quo, lhs->len + 1); // +1 necessary?
     memset(dest_quo->dig, 0, (lhs->len + 1) * sizeof(mpz_dig_t));
+    dest_quo->neg = 0;
     dest_quo->len = 0;
     mpz_need_dig(dest_rem, lhs->len + 1); // +1 necessary?
     mpz_set(dest_rem, lhs);
     mpn_div(dest_rem->dig, &dest_rem->len, rhs->dig, rhs->len, dest_quo->dig, &dest_quo->len);
+    dest_rem->neg &= !!dest_rem->len;
 
     // check signs and do Python style modulo
     if (lhs->neg != rhs->neg) {
-        dest_quo->neg = 1;
+        dest_quo->neg = !!dest_quo->len;
         if (!mpz_is_zero(dest_rem)) {
             mpz_t mpzone;
             mpz_init_from_int(&mpzone, -1);
diff --git a/python/src/py/mpz.h b/python/src/py/mpz.h
index 425587ee9..d27f57240 100644
--- a/python/src/py/mpz.h
+++ b/python/src/py/mpz.h
@@ -91,6 +91,7 @@ typedef int8_t mpz_dbl_dig_signed_t;
 #define MPZ_NUM_DIG_FOR_LL ((sizeof(long long) * 8 + MPZ_DIG_SIZE - 1) / MPZ_DIG_SIZE)
 
 typedef struct _mpz_t {
+    // Zero has neg=0, len=0.  Negative zero is not allowed.
     size_t neg : 1;
     size_t fixed_dig : 1;
     size_t alloc : (8 * sizeof(size_t) - 2);
@@ -119,7 +120,7 @@ static inline bool mpz_is_zero(const mpz_t *z) {
     return z->len == 0;
 }
 static inline bool mpz_is_neg(const mpz_t *z) {
-    return z->len != 0 && z->neg != 0;
+    return z->neg != 0;
 }
 int mpz_cmp(const mpz_t *lhs, const mpz_t *rhs);
 
diff --git a/python/src/py/nativeglue.c b/python/src/py/nativeglue.c
index 30e5b4006..743ff38cc 100644
--- a/python/src/py/nativeglue.c
+++ b/python/src/py/nativeglue.c
@@ -300,9 +300,9 @@ const mp_fun_table_t mp_fun_table = {
     mp_unpack_ex,
     mp_delete_name,
     mp_delete_global,
-    mp_make_closure_from_raw_code,
+    mp_obj_new_closure,
     mp_arg_check_num_sig,
-    mp_setup_code_state,
+    mp_setup_code_state_native,
     mp_small_int_floor_divide,
     mp_small_int_modulo,
     mp_native_yield_from,
@@ -344,4 +344,8 @@ const mp_fun_table_t mp_fun_table = {
     &mp_stream_write_obj,
 };
 
+#elif MICROPY_EMIT_NATIVE && MICROPY_DYNAMIC_COMPILER
+
+const int mp_fun_table;
+
 #endif // MICROPY_EMIT_NATIVE
diff --git a/python/src/py/nativeglue.h b/python/src/py/nativeglue.h
index 9d9a97b9e..7b1ccd8d4 100644
--- a/python/src/py/nativeglue.h
+++ b/python/src/py/nativeglue.h
@@ -75,7 +75,7 @@ typedef enum {
     MP_F_UNPACK_EX,
     MP_F_DELETE_NAME,
     MP_F_DELETE_GLOBAL,
-    MP_F_MAKE_CLOSURE_FROM_RAW_CODE,
+    MP_F_NEW_CLOSURE,
     MP_F_ARG_CHECK_NUM_SIG,
     MP_F_SETUP_CODE_STATE,
     MP_F_SMALL_INT_FLOOR_DIVIDE,
@@ -112,7 +112,7 @@ typedef struct _mp_fun_table_t {
     void (*set_store)(mp_obj_t self_in, mp_obj_t item);
     mp_obj_t (*list_append)(mp_obj_t self_in, mp_obj_t arg);
     mp_obj_t (*dict_store)(mp_obj_t self_in, mp_obj_t key, mp_obj_t value);
-    mp_obj_t (*make_function_from_raw_code)(const mp_raw_code_t *rc, mp_obj_t def_args, mp_obj_t def_kw_args);
+    mp_obj_t (*make_function_from_raw_code)(const mp_raw_code_t *rc, const mp_module_context_t *cm, const mp_obj_t *def_args);
     mp_obj_t (*call_function_n_kw)(mp_obj_t fun_in, size_t n_args_kw, const mp_obj_t *args);
     mp_obj_t (*call_method_n_kw)(size_t n_args, size_t n_kw, const mp_obj_t *args);
     mp_obj_t (*call_method_n_kw_var)(bool have_self, size_t n_args_n_kw, const mp_obj_t *args);
@@ -129,9 +129,9 @@ typedef struct _mp_fun_table_t {
     void (*unpack_ex)(mp_obj_t seq, size_t num, mp_obj_t *items);
     void (*delete_name)(qstr qst);
     void (*delete_global)(qstr qst);
-    mp_obj_t (*make_closure_from_raw_code)(const mp_raw_code_t *rc, mp_uint_t n_closed_over, const mp_obj_t *args);
+    mp_obj_t (*new_closure)(mp_obj_t fun, size_t n_closed_over, const mp_obj_t *closed);
     void (*arg_check_num_sig)(size_t n_args, size_t n_kw, uint32_t sig);
-    void (*setup_code_state)(mp_code_state_t *code_state, size_t n_args, size_t n_kw, const mp_obj_t *args);
+    void (*setup_code_state_native)(mp_code_state_native_t *code_state, size_t n_args, size_t n_kw, const mp_obj_t *args);
     mp_int_t (*small_int_floor_divide)(mp_int_t num, mp_int_t denom);
     mp_int_t (*small_int_modulo)(mp_int_t dividend, mp_int_t divisor);
     bool (*yield_from)(mp_obj_t gen, mp_obj_t send_value, mp_obj_t *ret_value);
@@ -172,6 +172,12 @@ typedef struct _mp_fun_table_t {
     const mp_obj_fun_builtin_var_t *stream_write_obj;
 } mp_fun_table_t;
 
+#if (MICROPY_EMIT_NATIVE && !MICROPY_DYNAMIC_COMPILER) || MICROPY_ENABLE_DYNRUNTIME
 extern const mp_fun_table_t mp_fun_table;
+#elif MICROPY_EMIT_NATIVE && MICROPY_DYNAMIC_COMPILER
+// In dynamic-compiler mode eliminate dependency on entries in mp_fun_table.
+// This only needs to be an independent pointer, content doesn't matter.
+extern const int mp_fun_table;
+#endif
 
 #endif // MICROPY_INCLUDED_PY_NATIVEGLUE_H
diff --git a/python/src/py/nlr.h b/python/src/py/nlr.h
index 9f12ede9c..dade10771 100644
--- a/python/src/py/nlr.h
+++ b/python/src/py/nlr.h
@@ -167,7 +167,7 @@ NORETURN void nlr_jump_fail(void *val);
 
 #if !MICROPY_NLR_SETJMP
 #define nlr_push(val) \
-    assert(MP_STATE_THREAD(nlr_top) != val),nlr_push(val)
+    assert(MP_STATE_THREAD(nlr_top) != val), nlr_push(val)
 
 /*
 #define nlr_push(val) \
diff --git a/python/src/py/obj.c b/python/src/py/obj.c
index e7a23be53..024de3c96 100644
--- a/python/src/py/obj.c
+++ b/python/src/py/obj.c
@@ -37,7 +37,14 @@
 #include "py/stackctrl.h"
 #include "py/stream.h" // for mp_obj_print
 
-const mp_obj_type_t *mp_obj_get_type(mp_const_obj_t o_in) {
+// Allocates an object and also sets type, for mp_obj_malloc{,_var} macros.
+void *mp_obj_malloc_helper(size_t num_bytes, const mp_obj_type_t *type) {
+    mp_obj_base_t *base = (mp_obj_base_t *)m_malloc(num_bytes);
+    base->type = type;
+    return base;
+}
+
+const mp_obj_type_t *MICROPY_WRAP_MP_OBJ_GET_TYPE(mp_obj_get_type)(mp_const_obj_t o_in) {
     #if MICROPY_OBJ_IMMEDIATE_OBJS && MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_A
 
     if (mp_obj_is_obj(o_in)) {
@@ -279,7 +286,7 @@ mp_obj_t mp_obj_equal_not_equal(mp_binary_op_t op, mp_obj_t o1, mp_obj_t o2) {
         o2 = temp;
     }
 
-    // equality not implemented, so fall back to pointer comparison
+    // equality not implemented, so fall back to pointer comparison
     return (o1 == o2) ? local_true : local_false;
 }
 
diff --git a/python/src/py/obj.h b/python/src/py/obj.h
index 11918ba17..29cd1855c 100644
--- a/python/src/py/obj.h
+++ b/python/src/py/obj.h
@@ -104,8 +104,18 @@ static inline bool mp_obj_is_immediate_obj(mp_const_obj_t o) {
 #if MICROPY_PY_BUILTINS_FLOAT
 #define mp_const_float_e MP_ROM_PTR(&mp_const_float_e_obj)
 #define mp_const_float_pi MP_ROM_PTR(&mp_const_float_pi_obj)
+#if MICROPY_PY_MATH_CONSTANTS
+#define mp_const_float_tau MP_ROM_PTR(&mp_const_float_tau_obj)
+#define mp_const_float_inf MP_ROM_PTR(&mp_const_float_inf_obj)
+#define mp_const_float_nan MP_ROM_PTR(&mp_const_float_nan_obj)
+#endif
 extern const struct _mp_obj_float_t mp_const_float_e_obj;
 extern const struct _mp_obj_float_t mp_const_float_pi_obj;
+#if MICROPY_PY_MATH_CONSTANTS
+extern const struct _mp_obj_float_t mp_const_float_tau_obj;
+extern const struct _mp_obj_float_t mp_const_float_inf_obj;
+extern const struct _mp_obj_float_t mp_const_float_nan_obj;
+#endif
 
 #define mp_obj_is_float(o) mp_obj_is_type((o), &mp_type_float)
 mp_float_t mp_obj_float_get(mp_obj_t self_in);
@@ -139,8 +149,18 @@ static inline bool mp_obj_is_immediate_obj(mp_const_obj_t o) {
 #if MICROPY_PY_BUILTINS_FLOAT
 #define mp_const_float_e MP_ROM_PTR(&mp_const_float_e_obj)
 #define mp_const_float_pi MP_ROM_PTR(&mp_const_float_pi_obj)
+#if MICROPY_PY_MATH_CONSTANTS
+#define mp_const_float_tau MP_ROM_PTR(&mp_const_float_tau_obj)
+#define mp_const_float_inf MP_ROM_PTR(&mp_const_float_inf_obj)
+#define mp_const_float_nan MP_ROM_PTR(&mp_const_float_nan_obj)
+#endif
 extern const struct _mp_obj_float_t mp_const_float_e_obj;
 extern const struct _mp_obj_float_t mp_const_float_pi_obj;
+#if MICROPY_PY_MATH_CONSTANTS
+extern const struct _mp_obj_float_t mp_const_float_tau_obj;
+extern const struct _mp_obj_float_t mp_const_float_inf_obj;
+extern const struct _mp_obj_float_t mp_const_float_nan_obj;
+#endif
 
 #define mp_obj_is_float(o) mp_obj_is_type((o), &mp_type_float)
 mp_float_t mp_obj_float_get(mp_obj_t self_in);
@@ -162,6 +182,11 @@ static inline bool mp_obj_is_small_int(mp_const_obj_t o) {
 #if MICROPY_PY_BUILTINS_FLOAT
 #define mp_const_float_e MP_ROM_PTR((mp_obj_t)(((0x402df854 & ~3) | 2) + 0x80800000))
 #define mp_const_float_pi MP_ROM_PTR((mp_obj_t)(((0x40490fdb & ~3) | 2) + 0x80800000))
+#if MICROPY_PY_MATH_CONSTANTS
+#define mp_const_float_tau MP_ROM_PTR((mp_obj_t)(((0x40c90fdb & ~3) | 2) + 0x80800000))
+#define mp_const_float_inf MP_ROM_PTR((mp_obj_t)(((0x7f800000 & ~3) | 2) + 0x80800000))
+#define mp_const_float_nan MP_ROM_PTR((mp_obj_t)(((0xffc00000 & ~3) | 2) + 0x80800000))
+#endif
 
 static inline bool mp_obj_is_float(mp_const_obj_t o) {
     return (((mp_uint_t)(o)) & 3) == 2 && (((mp_uint_t)(o)) & 0xff800007) != 0x00000006;
@@ -226,6 +251,11 @@ static inline bool mp_obj_is_immediate_obj(mp_const_obj_t o) {
 
 #define mp_const_float_e {((mp_obj_t)((uint64_t)0x4005bf0a8b145769 + 0x8004000000000000))}
 #define mp_const_float_pi {((mp_obj_t)((uint64_t)0x400921fb54442d18 + 0x8004000000000000))}
+#if MICROPY_PY_MATH_CONSTANTS
+#define mp_const_float_tau {((mp_obj_t)((uint64_t)0x401921fb54442d18 + 0x8004000000000000))}
+#define mp_const_float_inf {((mp_obj_t)((uint64_t)0x7ff0000000000000 + 0x8004000000000000))}
+#define mp_const_float_nan {((mp_obj_t)((uint64_t)0xfff8000000000000 + 0x8004000000000000))}
+#endif
 
 static inline bool mp_obj_is_float(mp_const_obj_t o) {
     return ((uint64_t)(o) & 0xfffc000000000000) != 0;
@@ -389,9 +419,10 @@ typedef struct _mp_rom_obj_t { mp_const_obj_t o; } mp_rom_obj_t;
 // Declare a module as a builtin, processed by makemoduledefs.py
 // param module_name: MP_QSTR_<module name>
 // param obj_module: mp_obj_module_t instance
-// prarm enabled_define: used as `#if (enabled_define) around entry`
 
-#define MP_REGISTER_MODULE(module_name, obj_module, enabled_define)
+#ifndef NO_QSTR
+#define MP_REGISTER_MODULE(module_name, obj_module)
+#endif
 
 // Underlying map/hash table implementation (not dict object or map function)
 
@@ -551,6 +582,7 @@ struct _mp_obj_type_t {
     //
     // dest[0] = MP_OBJ_NULL means load
     //  return: for fail, do nothing
+    //          for fail but continue lookup in locals_dict, dest[1] = MP_OBJ_SENTINEL
     //          for attr, dest[0] = value
     //          for method, dest[0] = method, dest[1] = self
     //
@@ -701,6 +733,12 @@ extern const struct _mp_obj_exception_t mp_const_GeneratorExit_obj;
 
 // General API for objects
 
+// Helper versions of m_new_obj when you need to immediately set base.type.
+// Implementing this as a call rather than inline saves 8 bytes per usage.
+#define mp_obj_malloc(struct_type, obj_type) ((struct_type *)mp_obj_malloc_helper(sizeof(struct_type), obj_type))
+#define mp_obj_malloc_var(struct_type, var_type, var_num, obj_type) ((struct_type *)mp_obj_malloc_helper(sizeof(struct_type) + sizeof(var_type) * (var_num), obj_type))
+void *mp_obj_malloc_helper(size_t num_bytes, const mp_obj_type_t *type);
+
 // These macros are derived from more primitive ones and are used to
 // check for more specific object types.
 // Note: these are kept as macros because inline functions sometimes use much
@@ -750,9 +788,6 @@ mp_obj_t mp_obj_new_exception_msg_varg(const mp_obj_type_t *exc_type, mp_rom_err
 #ifdef va_start
 mp_obj_t mp_obj_new_exception_msg_vlist(const mp_obj_type_t *exc_type, mp_rom_error_text_t fmt, va_list arg); // same fmt restrictions as above
 #endif
-mp_obj_t mp_obj_new_fun_bc(mp_obj_t def_args, mp_obj_t def_kw_args, const byte *code, const mp_uint_t *const_table);
-mp_obj_t mp_obj_new_fun_native(mp_obj_t def_args_in, mp_obj_t def_kw_args, const void *fun_data, const mp_uint_t *const_table);
-mp_obj_t mp_obj_new_fun_asm(size_t n_args, const void *fun_data, mp_uint_t type_sig);
 mp_obj_t mp_obj_new_gen_wrap(mp_obj_t fun);
 mp_obj_t mp_obj_new_closure(mp_obj_t fun, size_t n_closed, const mp_obj_t *closed);
 mp_obj_t mp_obj_new_tuple(size_t n, const mp_obj_t *items);
@@ -961,7 +996,6 @@ typedef struct _mp_obj_fun_builtin_var_t {
 } mp_obj_fun_builtin_var_t;
 
 qstr mp_obj_fun_get_name(mp_const_obj_t fun);
-qstr mp_obj_code_get_name(const byte *code_info);
 
 mp_obj_t mp_identity(mp_obj_t self);
 MP_DECLARE_CONST_FUN_OBJ_1(mp_identity_obj);
diff --git a/python/src/py/objarray.c b/python/src/py/objarray.c
index 16a4d4aac..bff3126a2 100644
--- a/python/src/py/objarray.c
+++ b/python/src/py/objarray.c
@@ -639,8 +639,7 @@ mp_obj_t mp_obj_new_bytearray(size_t n, void *items) {
 
 // Create bytearray which references specified memory area
 mp_obj_t mp_obj_new_bytearray_by_ref(size_t n, void *items) {
-    mp_obj_array_t *o = m_new_obj(mp_obj_array_t);
-    o->base.type = &mp_type_bytearray;
+    mp_obj_array_t *o = mp_obj_malloc(mp_obj_array_t, &mp_type_bytearray);
     o->typecode = BYTEARRAY_TYPECODE;
     o->free = 0;
     o->len = n;
diff --git a/python/src/py/objattrtuple.c b/python/src/py/objattrtuple.c
index 3422d0146..13c281aa1 100644
--- a/python/src/py/objattrtuple.c
+++ b/python/src/py/objattrtuple.c
@@ -71,8 +71,7 @@ STATIC void mp_obj_attrtuple_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) {
 }
 
 mp_obj_t mp_obj_new_attrtuple(const qstr *fields, size_t n, const mp_obj_t *items) {
-    mp_obj_tuple_t *o = m_new_obj_var(mp_obj_tuple_t, mp_obj_t, n + 1);
-    o->base.type = &mp_type_attrtuple;
+    mp_obj_tuple_t *o = mp_obj_malloc_var(mp_obj_tuple_t, mp_obj_t, n + 1, &mp_type_attrtuple);
     o->len = n;
     for (size_t i = 0; i < n; i++) {
         o->items[i] = items[i];
diff --git a/python/src/py/objboundmeth.c b/python/src/py/objboundmeth.c
index a3e1d302d..9936c06e4 100644
--- a/python/src/py/objboundmeth.c
+++ b/python/src/py/objboundmeth.c
@@ -108,8 +108,7 @@ STATIC const mp_obj_type_t mp_type_bound_meth = {
 };
 
 mp_obj_t mp_obj_new_bound_meth(mp_obj_t meth, mp_obj_t self) {
-    mp_obj_bound_meth_t *o = m_new_obj(mp_obj_bound_meth_t);
-    o->base.type = &mp_type_bound_meth;
+    mp_obj_bound_meth_t *o = mp_obj_malloc(mp_obj_bound_meth_t, &mp_type_bound_meth);
     o->meth = meth;
     o->self = self;
     return MP_OBJ_FROM_PTR(o);
diff --git a/python/src/py/objcell.c b/python/src/py/objcell.c
index be2ae8cd9..2702ca535 100644
--- a/python/src/py/objcell.c
+++ b/python/src/py/objcell.c
@@ -64,8 +64,7 @@ STATIC const mp_obj_type_t mp_type_cell = {
 };
 
 mp_obj_t mp_obj_new_cell(mp_obj_t obj) {
-    mp_obj_cell_t *o = m_new_obj(mp_obj_cell_t);
-    o->base.type = &mp_type_cell;
+    mp_obj_cell_t *o = mp_obj_malloc(mp_obj_cell_t, &mp_type_cell);
     o->obj = obj;
     return MP_OBJ_FROM_PTR(o);
 }
diff --git a/python/src/py/objclosure.c b/python/src/py/objclosure.c
index 054b65789..9dc3e5453 100644
--- a/python/src/py/objclosure.c
+++ b/python/src/py/objclosure.c
@@ -89,8 +89,7 @@ const mp_obj_type_t mp_type_closure = {
 };
 
 mp_obj_t mp_obj_new_closure(mp_obj_t fun, size_t n_closed_over, const mp_obj_t *closed) {
-    mp_obj_closure_t *o = m_new_obj_var(mp_obj_closure_t, mp_obj_t, n_closed_over);
-    o->base.type = &mp_type_closure;
+    mp_obj_closure_t *o = mp_obj_malloc_var(mp_obj_closure_t, mp_obj_t, n_closed_over, &mp_type_closure);
     o->fun = fun;
     o->n_closed = n_closed_over;
     memcpy(o->closed, closed, n_closed_over * sizeof(mp_obj_t));
diff --git a/python/src/py/objcomplex.c b/python/src/py/objcomplex.c
index f4c4aeffc..56c8353e9 100644
--- a/python/src/py/objcomplex.c
+++ b/python/src/py/objcomplex.c
@@ -162,8 +162,7 @@ const mp_obj_type_t mp_type_complex = {
 };
 
 mp_obj_t mp_obj_new_complex(mp_float_t real, mp_float_t imag) {
-    mp_obj_complex_t *o = m_new_obj(mp_obj_complex_t);
-    o->base.type = &mp_type_complex;
+    mp_obj_complex_t *o = mp_obj_malloc(mp_obj_complex_t, &mp_type_complex);
     o->real = real;
     o->imag = imag;
     return MP_OBJ_FROM_PTR(o);
diff --git a/python/src/py/objdeque.c b/python/src/py/objdeque.c
index c95bdeee9..b1c59a81e 100644
--- a/python/src/py/objdeque.c
+++ b/python/src/py/objdeque.c
@@ -57,8 +57,7 @@ STATIC mp_obj_t deque_make_new(const mp_obj_type_t *type, size_t n_args, size_t
         mp_raise_ValueError(NULL);
     }
 
-    mp_obj_deque_t *o = m_new_obj(mp_obj_deque_t);
-    o->base.type = type;
+    mp_obj_deque_t *o = mp_obj_malloc(mp_obj_deque_t, type);
     o->alloc = maxlen + 1;
     o->i_get = o->i_put = 0;
     o->items = m_new0(mp_obj_t, o->alloc);
diff --git a/python/src/py/objdict.c b/python/src/py/objdict.c
index ed4376aa4..1d8e9059a 100644
--- a/python/src/py/objdict.c
+++ b/python/src/py/objdict.c
@@ -521,8 +521,7 @@ STATIC const mp_obj_type_t mp_type_dict_view = {
 };
 
 STATIC mp_obj_t mp_obj_new_dict_view(mp_obj_t dict, mp_dict_view_kind_t kind) {
-    mp_obj_dict_view_t *o = m_new_obj(mp_obj_dict_view_t);
-    o->base.type = &mp_type_dict_view;
+    mp_obj_dict_view_t *o = mp_obj_malloc(mp_obj_dict_view_t, &mp_type_dict_view);
     o->dict = dict;
     o->kind = kind;
     return MP_OBJ_FROM_PTR(o);
diff --git a/python/src/py/objenumerate.c b/python/src/py/objenumerate.c
index d1de4add4..241aef302 100644
--- a/python/src/py/objenumerate.c
+++ b/python/src/py/objenumerate.c
@@ -54,14 +54,12 @@ STATIC mp_obj_t enumerate_make_new(const mp_obj_type_t *type, size_t n_args, siz
         MP_ARRAY_SIZE(allowed_args), allowed_args, (mp_arg_val_t *)&arg_vals);
 
     // create enumerate object
-    mp_obj_enumerate_t *o = m_new_obj(mp_obj_enumerate_t);
-    o->base.type = type;
+    mp_obj_enumerate_t *o = mp_obj_malloc(mp_obj_enumerate_t, type);
     o->iter = mp_getiter(arg_vals.iterable.u_obj, NULL);
     o->cur = arg_vals.start.u_int;
     #else
     mp_arg_check_num(n_args, n_kw, 1, 2, false);
-    mp_obj_enumerate_t *o = m_new_obj(mp_obj_enumerate_t);
-    o->base.type = type;
+    mp_obj_enumerate_t *o = mp_obj_malloc(mp_obj_enumerate_t, type);
     o->iter = mp_getiter(args[0], NULL);
     o->cur = n_args > 1 ? mp_obj_get_int(args[1]) : 0;
     #endif
diff --git a/python/src/py/objexcept.c b/python/src/py/objexcept.c
index 7a86c3647..dca287bb6 100644
--- a/python/src/py/objexcept.c
+++ b/python/src/py/objexcept.c
@@ -575,6 +575,16 @@ void mp_obj_exception_add_traceback(mp_obj_t self_in, qstr file, size_t line, qs
     // append this traceback info to traceback data
     // if memory allocation fails (eg because gc is locked), just return
 
+    #if MICROPY_PY_SYS_TRACEBACKLIMIT
+    mp_int_t max_traceback = MP_OBJ_SMALL_INT_VALUE(MP_STATE_VM(sys_mutable[MP_SYS_MUTABLE_TRACEBACKLIMIT]));
+    if (max_traceback <= 0) {
+        return;
+    } else if (self->traceback_data != NULL && self->traceback_len >= max_traceback * TRACEBACK_ENTRY_LEN) {
+        self->traceback_len -= TRACEBACK_ENTRY_LEN;
+        memmove(self->traceback_data, self->traceback_data + TRACEBACK_ENTRY_LEN, self->traceback_len * sizeof(self->traceback_data[0]));
+    }
+    #endif
+
     if (self->traceback_data == NULL) {
         self->traceback_data = m_new_maybe(size_t, TRACEBACK_ENTRY_LEN);
         if (self->traceback_data == NULL) {
diff --git a/python/src/py/objfilter.c b/python/src/py/objfilter.c
index 41b2a3bc5..a402d8c64 100644
--- a/python/src/py/objfilter.c
+++ b/python/src/py/objfilter.c
@@ -36,8 +36,7 @@ typedef struct _mp_obj_filter_t {
 
 STATIC mp_obj_t filter_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
     mp_arg_check_num(n_args, n_kw, 2, 2, false);
-    mp_obj_filter_t *o = m_new_obj(mp_obj_filter_t);
-    o->base.type = type;
+    mp_obj_filter_t *o = mp_obj_malloc(mp_obj_filter_t, type);
     o->fun = args[0];
     o->iter = mp_getiter(args[1], NULL);
     return MP_OBJ_FROM_PTR(o);
diff --git a/python/src/py/objfloat.c b/python/src/py/objfloat.c
index 5194dba51..0855baac8 100644
--- a/python/src/py/objfloat.c
+++ b/python/src/py/objfloat.c
@@ -54,6 +54,14 @@ typedef struct _mp_obj_float_t {
 
 const mp_obj_float_t mp_const_float_e_obj = {{&mp_type_float}, (mp_float_t)M_E};
 const mp_obj_float_t mp_const_float_pi_obj = {{&mp_type_float}, (mp_float_t)M_PI};
+#if MICROPY_PY_MATH_CONSTANTS
+#ifndef NAN
+#error NAN macro is not defined
+#endif
+const mp_obj_float_t mp_const_float_tau_obj = {{&mp_type_float}, (mp_float_t)(2.0 * M_PI)};
+const mp_obj_float_t mp_const_float_inf_obj = {{&mp_type_float}, (mp_float_t)INFINITY};
+const mp_obj_float_t mp_const_float_nan_obj = {{&mp_type_float}, (mp_float_t)NAN};
+#endif
 
 #endif
 
@@ -187,7 +195,8 @@ const mp_obj_type_t mp_type_float = {
 #if MICROPY_OBJ_REPR != MICROPY_OBJ_REPR_C && MICROPY_OBJ_REPR != MICROPY_OBJ_REPR_D
 
 mp_obj_t mp_obj_new_float(mp_float_t value) {
-    mp_obj_float_t *o = m_new(mp_obj_float_t, 1);
+    // Don't use mp_obj_malloc here to avoid extra function call overhead.
+    mp_obj_float_t *o = m_new_obj(mp_obj_float_t);
     o->base.type = &mp_type_float;
     o->value = value;
     return MP_OBJ_FROM_PTR(o);
diff --git a/python/src/py/objfun.c b/python/src/py/objfun.c
index d86a4d235..8f0c3eb6d 100644
--- a/python/src/py/objfun.c
+++ b/python/src/py/objfun.c
@@ -143,13 +143,13 @@ const mp_obj_type_t mp_type_fun_builtin_var = {
 /******************************************************************************/
 /* byte code functions                                                        */
 
-qstr mp_obj_code_get_name(const byte *code_info) {
+STATIC qstr mp_obj_code_get_name(const mp_obj_fun_bc_t *fun, const byte *code_info) {
     MP_BC_PRELUDE_SIZE_DECODE(code_info);
-    #if MICROPY_PERSISTENT_CODE
-    return code_info[0] | (code_info[1] << 8);
-    #else
-    return mp_decode_uint_value(code_info);
+    mp_uint_t name = mp_decode_uint_value(code_info);
+    #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+    name = fun->context->constants.qstr_table[name];
     #endif
+    return name;
 }
 
 #if MICROPY_EMIT_NATIVE
@@ -167,7 +167,7 @@ qstr mp_obj_fun_get_name(mp_const_obj_t fun_in) {
 
     const byte *bc = fun->bytecode;
     MP_BC_PRELUDE_SIG_DECODE(bc);
-    return mp_obj_code_get_name(bc);
+    return mp_obj_code_get_name(fun, bc);
 }
 
 #if MICROPY_CPYTHON_COMPAT
@@ -209,7 +209,6 @@ STATIC void dump_args(const mp_obj_t *a, size_t sz) {
 
 #define INIT_CODESTATE(code_state, _fun_bc, _n_state, n_args, n_kw, args) \
     code_state->fun_bc = _fun_bc; \
-    code_state->ip = 0; \
     code_state->n_state = _n_state; \
     mp_setup_code_state(code_state, n_args, n_kw, args); \
     code_state->old_globals = mp_globals_get();
@@ -240,7 +239,7 @@ mp_code_state_t *mp_obj_fun_bc_prepare_codestate(mp_obj_t self_in, size_t n_args
     INIT_CODESTATE(code_state, self, n_state, n_args, n_kw, args);
 
     // execute the byte code with the correct globals context
-    mp_globals_set(self->globals);
+    mp_globals_set(self->context->module.globals);
 
     return code_state;
 }
@@ -285,7 +284,7 @@ STATIC mp_obj_t fun_bc_call(mp_obj_t self_in, size_t n_args, size_t n_kw, const
     INIT_CODESTATE(code_state, self, n_state, n_args, n_kw, args);
 
     // execute the byte code with the correct globals context
-    mp_globals_set(self->globals);
+    mp_globals_set(self->context->module.globals);
     mp_vm_return_kind_t vm_return_kind = mp_execute_bytecode(code_state, MP_OBJ_NULL);
     mp_globals_set(code_state->old_globals);
 
@@ -358,7 +357,7 @@ void mp_obj_fun_bc_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) {
     }
     if (attr == MP_QSTR___globals__) {
         mp_obj_fun_bc_t *self = MP_OBJ_TO_PTR(self_in);
-        dest[0] = MP_OBJ_FROM_PTR(self->globals);
+        dest[0] = MP_OBJ_FROM_PTR(self->context->module.globals);
     }
 }
 #endif
@@ -377,25 +376,28 @@ const mp_obj_type_t mp_type_fun_bc = {
     #endif
 };
 
-mp_obj_t mp_obj_new_fun_bc(mp_obj_t def_args_in, mp_obj_t def_kw_args, const byte *code, const mp_uint_t *const_table) {
+mp_obj_t mp_obj_new_fun_bc(const mp_obj_t *def_args, const byte *code, const mp_module_context_t *context, struct _mp_raw_code_t *const *child_table) {
     size_t n_def_args = 0;
     size_t n_extra_args = 0;
-    mp_obj_tuple_t *def_args = MP_OBJ_TO_PTR(def_args_in);
-    if (def_args_in != MP_OBJ_NULL) {
-        assert(mp_obj_is_type(def_args_in, &mp_type_tuple));
-        n_def_args = def_args->len;
-        n_extra_args = def_args->len;
+    mp_obj_tuple_t *def_pos_args = NULL;
+    mp_obj_t def_kw_args = MP_OBJ_NULL;
+    if (def_args != NULL && def_args[0] != MP_OBJ_NULL) {
+        assert(mp_obj_is_type(def_args[0], &mp_type_tuple));
+        def_pos_args = MP_OBJ_TO_PTR(def_args[0]);
+        n_def_args = def_pos_args->len;
+        n_extra_args = def_pos_args->len;
     }
-    if (def_kw_args != MP_OBJ_NULL) {
+    if (def_args != NULL && def_args[1] != MP_OBJ_NULL) {
+        assert(mp_obj_is_type(def_args[1], &mp_type_dict));
+        def_kw_args = def_args[1];
         n_extra_args += 1;
     }
-    mp_obj_fun_bc_t *o = m_new_obj_var(mp_obj_fun_bc_t, mp_obj_t, n_extra_args);
-    o->base.type = &mp_type_fun_bc;
-    o->globals = mp_globals_get();
+    mp_obj_fun_bc_t *o = mp_obj_malloc_var(mp_obj_fun_bc_t, mp_obj_t, n_extra_args, &mp_type_fun_bc);
     o->bytecode = code;
-    o->const_table = const_table;
-    if (def_args != NULL) {
-        memcpy(o->extra_args, def_args->items, n_def_args * sizeof(mp_obj_t));
+    o->context = context;
+    o->child_table = child_table;
+    if (def_pos_args != NULL) {
+        memcpy(o->extra_args, def_pos_args->items, n_def_args * sizeof(mp_obj_t));
     }
     if (def_kw_args != MP_OBJ_NULL) {
         o->extra_args[n_def_args] = def_kw_args;
@@ -410,7 +412,7 @@ mp_obj_t mp_obj_new_fun_bc(mp_obj_t def_args_in, mp_obj_t def_kw_args, const byt
 
 STATIC mp_obj_t fun_native_call(mp_obj_t self_in, size_t n_args, size_t n_kw, const mp_obj_t *args) {
     MP_STACK_CHECK();
-    mp_obj_fun_bc_t *self = self_in;
+    mp_obj_fun_bc_t *self = MP_OBJ_TO_PTR(self_in);
     mp_call_fun_t fun = MICROPY_MAKE_POINTER_CALLABLE((void *)self->bytecode);
     return fun(self_in, n_args, n_kw, args);
 }
@@ -423,10 +425,10 @@ STATIC const mp_obj_type_t mp_type_fun_native = {
     .unary_op = mp_generic_unary_op,
 };
 
-mp_obj_t mp_obj_new_fun_native(mp_obj_t def_args_in, mp_obj_t def_kw_args, const void *fun_data, const mp_uint_t *const_table) {
-    mp_obj_fun_bc_t *o = mp_obj_new_fun_bc(def_args_in, def_kw_args, (const byte *)fun_data, const_table);
+mp_obj_t mp_obj_new_fun_native(const mp_obj_t *def_args, const void *fun_data, const mp_module_context_t *mc, struct _mp_raw_code_t *const *child_table) {
+    mp_obj_fun_bc_t *o = MP_OBJ_TO_PTR(mp_obj_new_fun_bc(def_args, (const byte *)fun_data, mc, child_table));
     o->base.type = &mp_type_fun_native;
-    return o;
+    return MP_OBJ_FROM_PTR(o);
 }
 
 #endif // MICROPY_EMIT_NATIVE
@@ -494,7 +496,7 @@ STATIC mp_uint_t convert_obj_for_inline_asm(mp_obj_t obj) {
 }
 
 STATIC mp_obj_t fun_asm_call(mp_obj_t self_in, size_t n_args, size_t n_kw, const mp_obj_t *args) {
-    mp_obj_fun_asm_t *self = self_in;
+    mp_obj_fun_asm_t *self = MP_OBJ_TO_PTR(self_in);
 
     mp_arg_check_num(n_args, n_kw, self->n_args, self->n_args, false);
 
@@ -532,12 +534,11 @@ STATIC const mp_obj_type_t mp_type_fun_asm = {
 };
 
 mp_obj_t mp_obj_new_fun_asm(size_t n_args, const void *fun_data, mp_uint_t type_sig) {
-    mp_obj_fun_asm_t *o = m_new_obj(mp_obj_fun_asm_t);
-    o->base.type = &mp_type_fun_asm;
+    mp_obj_fun_asm_t *o = mp_obj_malloc(mp_obj_fun_asm_t, &mp_type_fun_asm);
     o->n_args = n_args;
     o->fun_data = fun_data;
     o->type_sig = type_sig;
-    return o;
+    return MP_OBJ_FROM_PTR(o);
 }
 
 #endif // MICROPY_EMIT_INLINE_ASM
diff --git a/python/src/py/objfun.h b/python/src/py/objfun.h
index 905b5dbca..9de15b884 100644
--- a/python/src/py/objfun.h
+++ b/python/src/py/objfun.h
@@ -26,24 +26,26 @@
 #ifndef MICROPY_INCLUDED_PY_OBJFUN_H
 #define MICROPY_INCLUDED_PY_OBJFUN_H
 
+#include "py/bc.h"
 #include "py/obj.h"
 
 typedef struct _mp_obj_fun_bc_t {
     mp_obj_base_t base;
-    mp_obj_dict_t *globals;         // the context within which this function was defined
-    const byte *bytecode;           // bytecode for the function
-    const mp_uint_t *const_table;   // constant table
+    const mp_module_context_t *context;         // context within which this function was defined
+    struct _mp_raw_code_t *const *child_table;  // table of children
+    const byte *bytecode;                       // bytecode for the function
     #if MICROPY_PY_SYS_SETTRACE
     const struct _mp_raw_code_t *rc;
     #endif
     // the following extra_args array is allocated space to take (in order):
     //  - values of positional default args (if any)
     //  - a single slot for default kw args dict (if it has them)
-    //  - a single slot for var args tuple (if it takes them)
-    //  - a single slot for kw args dict (if it takes them)
     mp_obj_t extra_args[];
 } mp_obj_fun_bc_t;
 
+mp_obj_t mp_obj_new_fun_bc(const mp_obj_t *def_args, const byte *code, const mp_module_context_t *cm, struct _mp_raw_code_t *const *raw_code_table);
+mp_obj_t mp_obj_new_fun_native(const mp_obj_t *def_args, const void *fun_data, const mp_module_context_t *cm, struct _mp_raw_code_t *const *raw_code_table);
+mp_obj_t mp_obj_new_fun_asm(size_t n_args, const void *fun_data, mp_uint_t type_sig);
 void mp_obj_fun_bc_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest);
 
 #endif // MICROPY_INCLUDED_PY_OBJFUN_H
diff --git a/python/src/py/objgenerator.c b/python/src/py/objgenerator.c
index 784310092..802fd45bb 100644
--- a/python/src/py/objgenerator.c
+++ b/python/src/py/objgenerator.c
@@ -59,13 +59,12 @@ STATIC mp_obj_t gen_wrap_call(mp_obj_t self_in, size_t n_args, size_t n_kw, cons
     MP_BC_PRELUDE_SIG_DECODE(ip);
 
     // allocate the generator object, with room for local stack and exception stack
-    mp_obj_gen_instance_t *o = m_new_obj_var(mp_obj_gen_instance_t, byte,
-        n_state * sizeof(mp_obj_t) + n_exc_stack * sizeof(mp_exc_stack_t));
-    o->base.type = &mp_type_gen_instance;
+    mp_obj_gen_instance_t *o = mp_obj_malloc_var(mp_obj_gen_instance_t, byte,
+        n_state * sizeof(mp_obj_t) + n_exc_stack * sizeof(mp_exc_stack_t),
+        &mp_type_gen_instance);
 
     o->pend_exc = mp_const_none;
     o->code_state.fun_bc = self_fun;
-    o->code_state.ip = 0;
     o->code_state.n_state = n_state;
     mp_setup_code_state(&o->code_state, n_args, n_kw, args);
     return MP_OBJ_FROM_PTR(o);
@@ -87,33 +86,40 @@ const mp_obj_type_t mp_type_gen_wrap = {
 
 #if MICROPY_EMIT_NATIVE
 
+// Based on mp_obj_gen_instance_t.
+typedef struct _mp_obj_gen_instance_native_t {
+    mp_obj_base_t base;
+    mp_obj_t pend_exc;
+    mp_code_state_native_t code_state;
+} mp_obj_gen_instance_native_t;
+
 STATIC mp_obj_t native_gen_wrap_call(mp_obj_t self_in, size_t n_args, size_t n_kw, const mp_obj_t *args) {
     // The state for a native generating function is held in the same struct as a bytecode function
     mp_obj_fun_bc_t *self_fun = MP_OBJ_TO_PTR(self_in);
 
-    // Determine start of prelude, and extract n_state from it
-    uintptr_t prelude_offset = ((uintptr_t *)self_fun->bytecode)[0];
-    #if MICROPY_EMIT_NATIVE_PRELUDE_AS_BYTES_OBJ
-    // Prelude is in bytes object in const_table, at index prelude_offset
-    mp_obj_str_t *prelude_bytes = MP_OBJ_TO_PTR(self_fun->const_table[prelude_offset]);
-    prelude_offset = (const byte *)prelude_bytes->data - self_fun->bytecode;
-    #endif
-    const uint8_t *ip = self_fun->bytecode + prelude_offset;
-    size_t n_state, n_exc_stack_unused, scope_flags, n_pos_args, n_kwonly_args, n_def_args;
-    MP_BC_PRELUDE_SIG_DECODE_INTO(ip, n_state, n_exc_stack_unused, scope_flags, n_pos_args, n_kwonly_args, n_def_args);
-    size_t n_exc_stack = 0;
+    // Determine start of prelude.
+    uintptr_t prelude_ptr_index = ((uintptr_t *)self_fun->bytecode)[0];
+    const uint8_t *prelude_ptr;
+    if (prelude_ptr_index == 0) {
+        prelude_ptr = (void *)self_fun->child_table;
+    } else {
+        prelude_ptr = (void *)self_fun->child_table[prelude_ptr_index];
+    }
 
-    // Allocate the generator object, with room for local stack and exception stack
-    mp_obj_gen_instance_t *o = m_new_obj_var(mp_obj_gen_instance_t, byte,
-        n_state * sizeof(mp_obj_t) + n_exc_stack * sizeof(mp_exc_stack_t));
-    o->base.type = &mp_type_gen_instance;
+    // Extract n_state from the prelude.
+    const uint8_t *ip = prelude_ptr;
+    MP_BC_PRELUDE_SIG_DECODE(ip);
+
+    // Allocate the generator object, with room for local stack (exception stack not needed).
+    mp_obj_gen_instance_native_t *o = mp_obj_malloc_var(mp_obj_gen_instance_native_t, byte, n_state * sizeof(mp_obj_t), &mp_type_gen_instance);
 
     // Parse the input arguments and set up the code state
     o->pend_exc = mp_const_none;
     o->code_state.fun_bc = self_fun;
-    o->code_state.ip = (const byte *)prelude_offset;
+    o->code_state.ip = prelude_ptr;
     o->code_state.n_state = n_state;
-    mp_setup_code_state(&o->code_state, n_args, n_kw, args);
+    o->code_state.sp = &o->code_state.state[0] - 1;
+    mp_setup_code_state_native(&o->code_state, n_args, n_kw, args);
 
     // Indicate we are a native function, which doesn't use this variable
     o->code_state.exc_sp_idx = MP_CODE_STATE_EXC_SP_IDX_SENTINEL;
@@ -171,7 +177,13 @@ mp_vm_return_kind_t mp_obj_gen_resume(mp_obj_t self_in, mp_obj_t send_value, mp_
     #endif
 
     // If the generator is started, allow sending a value.
-    if (self->code_state.sp == self->code_state.state - 1) {
+    void *state_start = self->code_state.state - 1;
+    #if MICROPY_EMIT_NATIVE
+    if (self->code_state.exc_sp_idx == MP_CODE_STATE_EXC_SP_IDX_SENTINEL) {
+        state_start = ((mp_obj_gen_instance_native_t *)self)->code_state.state - 1;
+    }
+    #endif
+    if (self->code_state.sp == state_start) {
         if (send_value != mp_const_none) {
             mp_raise_TypeError(MP_ERROR_TEXT("can't send non-None value to a just-started generator"));
         }
@@ -184,7 +196,7 @@ mp_vm_return_kind_t mp_obj_gen_resume(mp_obj_t self_in, mp_obj_t send_value, mp_
 
     // Set up the correct globals context for the generator and execute it
     self->code_state.old_globals = mp_globals_get();
-    mp_globals_set(self->code_state.fun_bc->globals);
+    mp_globals_set(self->code_state.fun_bc->context->module.globals);
 
     mp_vm_return_kind_t ret_kind;
 
@@ -226,7 +238,14 @@ mp_vm_return_kind_t mp_obj_gen_resume(mp_obj_t self_in, mp_obj_t send_value, mp_
 
         case MP_VM_RETURN_EXCEPTION: {
             self->code_state.ip = 0;
-            *ret_val = self->code_state.state[0];
+            #if MICROPY_EMIT_NATIVE
+            if (self->code_state.exc_sp_idx == MP_CODE_STATE_EXC_SP_IDX_SENTINEL) {
+                *ret_val = ((mp_obj_gen_instance_native_t *)self)->code_state.state[0];
+            } else
+            #endif
+            {
+                *ret_val = self->code_state.state[0];
+            }
             // PEP479: if StopIteration is raised inside a generator it is replaced with RuntimeError
             if (mp_obj_is_subclass_fast(MP_OBJ_FROM_PTR(mp_obj_get_type(*ret_val)), MP_OBJ_FROM_PTR(&mp_type_StopIteration))) {
                 *ret_val = mp_obj_new_exception_msg(&mp_type_RuntimeError, MP_ERROR_TEXT("generator raised StopIteration"));
diff --git a/python/src/py/objint_longlong.c b/python/src/py/objint_longlong.c
index f2e88c3ea..7fcb5462f 100644
--- a/python/src/py/objint_longlong.c
+++ b/python/src/py/objint_longlong.c
@@ -243,8 +243,7 @@ mp_obj_t mp_obj_new_int_from_uint(mp_uint_t value) {
 }
 
 mp_obj_t mp_obj_new_int_from_ll(long long val) {
-    mp_obj_int_t *o = m_new_obj(mp_obj_int_t);
-    o->base.type = &mp_type_int;
+    mp_obj_int_t *o = mp_obj_malloc(mp_obj_int_t, &mp_type_int);
     o->val = val;
     return o;
 }
@@ -254,8 +253,7 @@ mp_obj_t mp_obj_new_int_from_ull(unsigned long long val) {
     if (val >> (sizeof(unsigned long long) * 8 - 1) != 0) {
         mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("ulonglong too large"));
     }
-    mp_obj_int_t *o = m_new_obj(mp_obj_int_t);
-    o->base.type = &mp_type_int;
+    mp_obj_int_t *o = mp_obj_malloc(mp_obj_int_t, &mp_type_int);
     o->val = val;
     return o;
 }
@@ -263,8 +261,7 @@ mp_obj_t mp_obj_new_int_from_ull(unsigned long long val) {
 mp_obj_t mp_obj_new_int_from_str_len(const char **str, size_t len, bool neg, unsigned int base) {
     // TODO this does not honor the given length of the string, but it all cases it should anyway be null terminated
     // TODO check overflow
-    mp_obj_int_t *o = m_new_obj(mp_obj_int_t);
-    o->base.type = &mp_type_int;
+    mp_obj_int_t *o = mp_obj_malloc(mp_obj_int_t, &mp_type_int);
     char *endptr;
     o->val = strtoll(*str, &endptr, base);
     *str = endptr;
diff --git a/python/src/py/objint_mpz.c b/python/src/py/objint_mpz.c
index ef3e01796..cbc4cb75a 100644
--- a/python/src/py/objint_mpz.c
+++ b/python/src/py/objint_mpz.c
@@ -75,8 +75,7 @@ const mp_obj_int_t mp_sys_maxsize_obj = {
 #endif
 
 mp_obj_int_t *mp_obj_int_new_mpz(void) {
-    mp_obj_int_t *o = m_new_obj(mp_obj_int_t);
-    o->base.type = &mp_type_int;
+    mp_obj_int_t *o = mp_obj_malloc(mp_obj_int_t, &mp_type_int);
     mpz_init_zero(&o->mpz);
     return o;
 }
diff --git a/python/src/py/objmap.c b/python/src/py/objmap.c
index 78c52c892..1f9275854 100644
--- a/python/src/py/objmap.c
+++ b/python/src/py/objmap.c
@@ -38,8 +38,7 @@ typedef struct _mp_obj_map_t {
 
 STATIC mp_obj_t map_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
     mp_arg_check_num(n_args, n_kw, 2, MP_OBJ_FUN_ARGS_MAX, false);
-    mp_obj_map_t *o = m_new_obj_var(mp_obj_map_t, mp_obj_t, n_args - 1);
-    o->base.type = type;
+    mp_obj_map_t *o = mp_obj_malloc_var(mp_obj_map_t, mp_obj_t, n_args - 1, type);
     o->n_iters = n_args - 1;
     o->fun = args[0];
     for (size_t i = 0; i < n_args - 1; i++) {
diff --git a/python/src/py/objmodule.c b/python/src/py/objmodule.c
index a1f9d9d7f..783d6b050 100644
--- a/python/src/py/objmodule.c
+++ b/python/src/py/objmodule.c
@@ -29,11 +29,20 @@
 #include <string.h>
 #include <assert.h>
 
+#include "py/bc.h"
 #include "py/objmodule.h"
 #include "py/runtime.h"
 #include "py/builtin.h"
 
+#ifndef NO_QSTR
+// Only include module definitions when not doing qstr extraction, because the
+// qstr extraction stage also generates this module definition header file.
 #include "genhdr/moduledefs.h"
+#endif
+
+#if MICROPY_MODULE_BUILTIN_INIT
+STATIC void mp_module_call_init(mp_obj_t module_name, mp_obj_t module_obj);
+#endif
 
 STATIC void module_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind_t kind) {
     (void)kind;
@@ -58,6 +67,21 @@ STATIC void module_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kin
     mp_printf(print, "<module '%s'>", module_name);
 }
 
+STATIC void module_attr_try_delegation(mp_obj_t self_in, qstr attr, mp_obj_t *dest) {
+    #if MICROPY_MODULE_ATTR_DELEGATION
+    // Delegate lookup to a module's custom attr method (found in last slot of globals dict).
+    mp_obj_module_t *self = MP_OBJ_TO_PTR(self_in);
+    mp_map_t *map = &self->globals->map;
+    if (map->table[map->alloc - 1].key == MP_OBJ_NEW_QSTR(MP_QSTRnull)) {
+        ((mp_attr_fun_t)MP_OBJ_TO_PTR(map->table[map->alloc - 1].value))(self_in, attr, dest);
+    }
+    #else
+    (void)self_in;
+    (void)attr;
+    (void)dest;
+    #endif
+}
+
 STATIC void module_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) {
     mp_obj_module_t *self = MP_OBJ_TO_PTR(self_in);
     if (dest[0] == MP_OBJ_NULL) {
@@ -70,8 +94,12 @@ STATIC void module_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) {
             elem = mp_map_lookup(&self->globals->map, MP_OBJ_NEW_QSTR(MP_QSTR___getattr__), MP_MAP_LOOKUP);
             if (elem != NULL) {
                 dest[0] = mp_call_function_1(elem->value, MP_OBJ_NEW_QSTR(attr));
+            } else {
+                module_attr_try_delegation(self_in, attr, dest);
             }
         #endif
+        } else {
+            module_attr_try_delegation(self_in, attr, dest);
         }
     } else {
         // delete/store attribute
@@ -87,6 +115,7 @@ STATIC void module_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) {
             #endif
             {
                 // can't delete or store to fixed map
+                module_attr_try_delegation(self_in, attr, dest);
                 return;
             }
         }
@@ -118,12 +147,12 @@ mp_obj_t mp_obj_new_module(qstr module_name) {
     }
 
     // create new module object
-    mp_obj_module_t *o = m_new_obj(mp_obj_module_t);
-    o->base.type = &mp_type_module;
-    o->globals = MP_OBJ_TO_PTR(mp_obj_new_dict(MICROPY_MODULE_DICT_SIZE));
+    mp_module_context_t *o = m_new_obj(mp_module_context_t);
+    o->module.base.type = &mp_type_module;
+    o->module.globals = MP_OBJ_TO_PTR(mp_obj_new_dict(MICROPY_MODULE_DICT_SIZE));
 
     // store __name__ entry in the module
-    mp_obj_dict_store(MP_OBJ_FROM_PTR(o->globals), MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(module_name));
+    mp_obj_dict_store(MP_OBJ_FROM_PTR(o->module.globals), MP_OBJ_NEW_QSTR(MP_QSTR___name__), MP_OBJ_NEW_QSTR(module_name));
 
     // store the new module into the slot in the global dict holding all modules
     el->value = MP_OBJ_FROM_PTR(o);
@@ -136,153 +165,62 @@ mp_obj_t mp_obj_new_module(qstr module_name) {
 // Global module table and related functions
 
 STATIC const mp_rom_map_elem_t mp_builtin_module_table[] = {
-    { MP_ROM_QSTR(MP_QSTR___main__), MP_ROM_PTR(&mp_module___main__) },
-    { MP_ROM_QSTR(MP_QSTR_builtins), MP_ROM_PTR(&mp_module_builtins) },
-    { MP_ROM_QSTR(MP_QSTR_micropython), MP_ROM_PTR(&mp_module_micropython) },
-
-    #if MICROPY_PY_IO
-    { MP_ROM_QSTR(MP_QSTR_uio), MP_ROM_PTR(&mp_module_io) },
-    #endif
-    #if MICROPY_PY_COLLECTIONS
-    { MP_ROM_QSTR(MP_QSTR_ucollections), MP_ROM_PTR(&mp_module_collections) },
-    #endif
-    #if MICROPY_PY_STRUCT
-    { MP_ROM_QSTR(MP_QSTR_ustruct), MP_ROM_PTR(&mp_module_ustruct) },
-    #endif
-
-    #if MICROPY_PY_BUILTINS_FLOAT
-    #if MICROPY_PY_MATH
-    { MP_ROM_QSTR(MP_QSTR_math), MP_ROM_PTR(&mp_module_math) },
-    #endif
-    #if MICROPY_PY_BUILTINS_COMPLEX && MICROPY_PY_CMATH
-    { MP_ROM_QSTR(MP_QSTR_cmath), MP_ROM_PTR(&mp_module_cmath) },
-    #endif
-    #endif
-    #if MICROPY_PY_SYS
-    { MP_ROM_QSTR(MP_QSTR_usys), MP_ROM_PTR(&mp_module_sys) },
-    #endif
-    #if MICROPY_PY_GC && MICROPY_ENABLE_GC
-    { MP_ROM_QSTR(MP_QSTR_gc), MP_ROM_PTR(&mp_module_gc) },
-    #endif
-    #if MICROPY_PY_THREAD
-    { MP_ROM_QSTR(MP_QSTR__thread), MP_ROM_PTR(&mp_module_thread) },
-    #endif
-
-    // extmod modules
-
-    #if MICROPY_PY_UASYNCIO
-    { MP_ROM_QSTR(MP_QSTR__uasyncio), MP_ROM_PTR(&mp_module_uasyncio) },
-    #endif
-    #if MICROPY_PY_UERRNO
-    { MP_ROM_QSTR(MP_QSTR_uerrno), MP_ROM_PTR(&mp_module_uerrno) },
-    #endif
-    #if MICROPY_PY_UCTYPES
-    { MP_ROM_QSTR(MP_QSTR_uctypes), MP_ROM_PTR(&mp_module_uctypes) },
-    #endif
-    #if MICROPY_PY_UZLIB
-    { MP_ROM_QSTR(MP_QSTR_uzlib), MP_ROM_PTR(&mp_module_uzlib) },
-    #endif
-    #if MICROPY_PY_UJSON
-    { MP_ROM_QSTR(MP_QSTR_ujson), MP_ROM_PTR(&mp_module_ujson) },
-    #endif
-    #if MICROPY_PY_URE
-    { MP_ROM_QSTR(MP_QSTR_ure), MP_ROM_PTR(&mp_module_ure) },
-    #endif
-    #if MICROPY_PY_UHEAPQ
-    { MP_ROM_QSTR(MP_QSTR_uheapq), MP_ROM_PTR(&mp_module_uheapq) },
-    #endif
-    #if MICROPY_PY_UTIMEQ
-    { MP_ROM_QSTR(MP_QSTR_utimeq), MP_ROM_PTR(&mp_module_utimeq) },
-    #endif
-    #if MICROPY_PY_UHASHLIB
-    { MP_ROM_QSTR(MP_QSTR_uhashlib), MP_ROM_PTR(&mp_module_uhashlib) },
-    #endif
-    #if MICROPY_PY_UCRYPTOLIB
-    { MP_ROM_QSTR(MP_QSTR_ucryptolib), MP_ROM_PTR(&mp_module_ucryptolib) },
-    #endif
-    #if MICROPY_PY_UBINASCII
-    { MP_ROM_QSTR(MP_QSTR_ubinascii), MP_ROM_PTR(&mp_module_ubinascii) },
-    #endif
-    #if MICROPY_PY_URANDOM
-    { MP_ROM_QSTR(MP_QSTR_urandom), MP_ROM_PTR(&mp_module_urandom) },
-    #endif
-    #if MICROPY_PY_USELECT
-    { MP_ROM_QSTR(MP_QSTR_uselect), MP_ROM_PTR(&mp_module_uselect) },
-    #endif
-    #if MICROPY_PY_USSL
-    { MP_ROM_QSTR(MP_QSTR_ussl), MP_ROM_PTR(&mp_module_ussl) },
-    #endif
-    #if MICROPY_PY_LWIP
-    { MP_ROM_QSTR(MP_QSTR_lwip), MP_ROM_PTR(&mp_module_lwip) },
-    #endif
-    #if MICROPY_PY_UWEBSOCKET
-    { MP_ROM_QSTR(MP_QSTR_uwebsocket), MP_ROM_PTR(&mp_module_uwebsocket) },
-    #endif
-    #if MICROPY_PY_WEBREPL
-    { MP_ROM_QSTR(MP_QSTR__webrepl), MP_ROM_PTR(&mp_module_webrepl) },
-    #endif
-    #if MICROPY_PY_FRAMEBUF
-    { MP_ROM_QSTR(MP_QSTR_framebuf), MP_ROM_PTR(&mp_module_framebuf) },
-    #endif
-    #if MICROPY_PY_BTREE
-    { MP_ROM_QSTR(MP_QSTR_btree), MP_ROM_PTR(&mp_module_btree) },
-    #endif
-    #if MICROPY_PY_BLUETOOTH
-    { MP_ROM_QSTR(MP_QSTR_ubluetooth), MP_ROM_PTR(&mp_module_ubluetooth) },
-    #endif
-
-    // extra builtin modules as defined by a port
-    MICROPY_PORT_BUILTIN_MODULES
-
-    #ifdef MICROPY_REGISTERED_MODULES
     // builtin modules declared with MP_REGISTER_MODULE()
     MICROPY_REGISTERED_MODULES
-    #endif
 };
 
 MP_DEFINE_CONST_MAP(mp_builtin_module_map, mp_builtin_module_table);
 
-// returns MP_OBJ_NULL if not found
-mp_obj_t mp_module_get(qstr module_name) {
-    mp_map_t *mp_loaded_modules_map = &MP_STATE_VM(mp_loaded_modules_dict).map;
-    // lookup module
-    mp_map_elem_t *el = mp_map_lookup(mp_loaded_modules_map, MP_OBJ_NEW_QSTR(module_name), MP_MAP_LOOKUP);
+// Tries to find a loaded module, otherwise attempts to load a builtin, otherwise MP_OBJ_NULL.
+mp_obj_t mp_module_get_loaded_or_builtin(qstr module_name) {
+    // First try loaded modules.
+    mp_map_elem_t *elem = mp_map_lookup(&MP_STATE_VM(mp_loaded_modules_dict).map, MP_OBJ_NEW_QSTR(module_name), MP_MAP_LOOKUP);
 
-    if (el == NULL) {
-        // module not found, look for builtin module names
-        el = mp_map_lookup((mp_map_t *)&mp_builtin_module_map, MP_OBJ_NEW_QSTR(module_name), MP_MAP_LOOKUP);
-        if (el == NULL) {
+    if (!elem) {
+        #if MICROPY_MODULE_WEAK_LINKS
+        return mp_module_get_builtin(module_name);
+        #else
+        // Otherwise try builtin.
+        elem = mp_map_lookup((mp_map_t *)&mp_builtin_module_map, MP_OBJ_NEW_QSTR(module_name), MP_MAP_LOOKUP);
+        if (!elem) {
             return MP_OBJ_NULL;
         }
-        mp_module_call_init(module_name, el->value);
+
+        #if MICROPY_MODULE_BUILTIN_INIT
+        // If found, it's a newly loaded built-in, so init it.
+        mp_module_call_init(MP_OBJ_NEW_QSTR(module_name), elem->value);
+        #endif
+        #endif
     }
 
-    // module found, return it
-    return el->value;
-}
-
-void mp_module_register(qstr qst, mp_obj_t module) {
-    mp_map_t *mp_loaded_modules_map = &MP_STATE_VM(mp_loaded_modules_dict).map;
-    mp_map_lookup(mp_loaded_modules_map, MP_OBJ_NEW_QSTR(qst), MP_MAP_LOOKUP_ADD_IF_NOT_FOUND)->value = module;
+    return elem->value;
 }
 
 #if MICROPY_MODULE_WEAK_LINKS
-// Search for u"foo" in built-in modules, return MP_OBJ_NULL if not found
-mp_obj_t mp_module_search_umodule(const char *module_str) {
-    for (size_t i = 0; i < MP_ARRAY_SIZE(mp_builtin_module_table); ++i) {
-        const mp_map_elem_t *entry = (const mp_map_elem_t *)&mp_builtin_module_table[i];
-        const char *key = qstr_str(MP_OBJ_QSTR_VALUE(entry->key));
-        if (key[0] == 'u' && strcmp(&key[1], module_str) == 0) {
-            return (mp_obj_t)entry->value;
-        }
-
+// Looks up a builtin module only (loaded modules are not consulted), otherwise MP_OBJ_NULL.
+mp_obj_t mp_module_get_builtin(qstr module_name) {
+    // Try builtin.
+    mp_map_elem_t *elem = mp_map_lookup((mp_map_t *)&mp_builtin_module_map, MP_OBJ_NEW_QSTR(module_name), MP_MAP_LOOKUP);
+    if (!elem) {
+        return MP_OBJ_NULL;
     }
-    return MP_OBJ_NULL;
+
+    #if MICROPY_MODULE_BUILTIN_INIT
+    // If found, it's a newly loaded built-in, so init it.
+    mp_module_call_init(MP_OBJ_NEW_QSTR(module_name), elem->value);
+    #endif
+
+    return elem->value;
 }
 #endif
 
 #if MICROPY_MODULE_BUILTIN_INIT
-void mp_module_call_init(qstr module_name, mp_obj_t module_obj) {
+STATIC void mp_module_register(mp_obj_t module_name, mp_obj_t module) {
+    mp_map_t *mp_loaded_modules_map = &MP_STATE_VM(mp_loaded_modules_dict).map;
+    mp_map_lookup(mp_loaded_modules_map, module_name, MP_MAP_LOOKUP_ADD_IF_NOT_FOUND)->value = module;
+}
+
+STATIC void mp_module_call_init(mp_obj_t module_name, mp_obj_t module_obj) {
     // Look for __init__ and call it if it exists
     mp_obj_t dest[2];
     mp_load_method_maybe(module_obj, MP_QSTR___init__, dest);
@@ -296,3 +234,19 @@ void mp_module_call_init(qstr module_name, mp_obj_t module_obj) {
     }
 }
 #endif
+
+void mp_module_generic_attr(qstr attr, mp_obj_t *dest, const uint16_t *keys, mp_obj_t *values) {
+    for (size_t i = 0; keys[i] != MP_QSTRnull; ++i) {
+        if (attr == keys[i]) {
+            if (dest[0] == MP_OBJ_NULL) {
+                // load attribute (MP_OBJ_NULL returned for deleted items)
+                dest[0] = values[i];
+            } else {
+                // delete or store (delete stores MP_OBJ_NULL)
+                values[i] = dest[1];
+                dest[0] = MP_OBJ_NULL; // indicate success
+            }
+            return;
+        }
+    }
+}
diff --git a/python/src/py/objmodule.h b/python/src/py/objmodule.h
index fde4fff34..d11d5bcd7 100644
--- a/python/src/py/objmodule.h
+++ b/python/src/py/objmodule.h
@@ -28,20 +28,16 @@
 
 #include "py/obj.h"
 
+// Place at the very end of a module's globals_table.
+#define MP_MODULE_ATTR_DELEGATION_ENTRY(ptr) { MP_ROM_QSTR(MP_QSTRnull), MP_ROM_PTR(ptr) }
+
 extern const mp_map_t mp_builtin_module_map;
 
-mp_obj_t mp_module_get(qstr module_name);
-void mp_module_register(qstr qstr, mp_obj_t module);
-
-mp_obj_t mp_module_search_umodule(const char *module_str);
-
-#if MICROPY_MODULE_BUILTIN_INIT
-void mp_module_call_init(qstr module_name, mp_obj_t module_obj);
-#else
-static inline void mp_module_call_init(qstr module_name, mp_obj_t module_obj) {
-    (void)module_name;
-    (void)module_obj;
-}
+mp_obj_t mp_module_get_loaded_or_builtin(qstr module_name);
+#if MICROPY_MODULE_WEAK_LINKS
+mp_obj_t mp_module_get_builtin(qstr module_name);
 #endif
 
+void mp_module_generic_attr(qstr attr, mp_obj_t *dest, const uint16_t *keys, mp_obj_t *values);
+
 #endif // MICROPY_INCLUDED_PY_OBJMODULE_H
diff --git a/python/src/py/objobject.c b/python/src/py/objobject.c
index 00082dfe0..165280280 100644
--- a/python/src/py/objobject.c
+++ b/python/src/py/objobject.c
@@ -36,8 +36,7 @@ typedef struct _mp_obj_object_t {
 STATIC mp_obj_t object_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
     (void)args;
     mp_arg_check_num(n_args, n_kw, 0, 0, false);
-    mp_obj_object_t *o = m_new_obj(mp_obj_object_t);
-    o->base.type = type;
+    mp_obj_object_t *o = mp_obj_malloc(mp_obj_object_t, type);
     return MP_OBJ_FROM_PTR(o);
 }
 
diff --git a/python/src/py/objproperty.c b/python/src/py/objproperty.c
index 8d2c292c5..49327c981 100644
--- a/python/src/py/objproperty.c
+++ b/python/src/py/objproperty.c
@@ -47,8 +47,7 @@ STATIC mp_obj_t property_make_new(const mp_obj_type_t *type, size_t n_args, size
     mp_arg_val_t vals[MP_ARRAY_SIZE(allowed_args)];
     mp_arg_parse_all_kw_array(n_args, n_kw, args, MP_ARRAY_SIZE(allowed_args), allowed_args, vals);
 
-    mp_obj_property_t *o = m_new_obj(mp_obj_property_t);
-    o->base.type = type;
+    mp_obj_property_t *o = mp_obj_malloc(mp_obj_property_t, type);
     o->proxy[0] = vals[ARG_fget].u_obj;
     o->proxy[1] = vals[ARG_fset].u_obj;
     o->proxy[2] = vals[ARG_fdel].u_obj;
diff --git a/python/src/py/objrange.c b/python/src/py/objrange.c
index 1f028eb86..549602189 100644
--- a/python/src/py/objrange.c
+++ b/python/src/py/objrange.c
@@ -92,8 +92,7 @@ STATIC void range_print(const mp_print_t *print, mp_obj_t self_in, mp_print_kind
 STATIC mp_obj_t range_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
     mp_arg_check_num(n_args, n_kw, 1, 3, false);
 
-    mp_obj_range_t *o = m_new_obj(mp_obj_range_t);
-    o->base.type = type;
+    mp_obj_range_t *o = mp_obj_malloc(mp_obj_range_t, type);
     o->start = 0;
     o->step = 1;
 
@@ -168,8 +167,7 @@ STATIC mp_obj_t range_subscr(mp_obj_t self_in, mp_obj_t index, mp_obj_t value) {
         if (mp_obj_is_type(index, &mp_type_slice)) {
             mp_bound_slice_t slice;
             mp_seq_get_fast_slice_indexes(len, index, &slice);
-            mp_obj_range_t *o = m_new_obj(mp_obj_range_t);
-            o->base.type = &mp_type_range;
+            mp_obj_range_t *o = mp_obj_malloc(mp_obj_range_t, &mp_type_range);
             o->start = self->start + slice.start * self->step;
             o->stop = self->start + slice.stop * self->step;
             o->step = slice.step * self->step;
diff --git a/python/src/py/objreversed.c b/python/src/py/objreversed.c
index 4254668e7..08961c0d2 100644
--- a/python/src/py/objreversed.c
+++ b/python/src/py/objreversed.c
@@ -47,8 +47,7 @@ STATIC mp_obj_t reversed_make_new(const mp_obj_type_t *type, size_t n_args, size
         return mp_call_method_n_kw(0, 0, dest);
     }
 
-    mp_obj_reversed_t *o = m_new_obj(mp_obj_reversed_t);
-    o->base.type = type;
+    mp_obj_reversed_t *o = mp_obj_malloc(mp_obj_reversed_t, type);
     o->seq = args[0];
     o->cur_index = mp_obj_get_int(mp_obj_len(args[0])); // start at the end of the sequence
 
diff --git a/python/src/py/objset.c b/python/src/py/objset.c
index d2508bfbf..26fd74398 100644
--- a/python/src/py/objset.c
+++ b/python/src/py/objset.c
@@ -174,8 +174,7 @@ STATIC MP_DEFINE_CONST_FUN_OBJ_1(set_clear_obj, set_clear);
 STATIC mp_obj_t set_copy(mp_obj_t self_in) {
     check_set_or_frozenset(self_in);
     mp_obj_set_t *self = MP_OBJ_TO_PTR(self_in);
-    mp_obj_set_t *other = m_new_obj(mp_obj_set_t);
-    other->base.type = self->base.type;
+    mp_obj_set_t *other = mp_obj_malloc(mp_obj_set_t, self->base.type);
     mp_set_init(&other->set, self->set.alloc);
     other->set.used = self->set.used;
     memcpy(other->set.table, self->set.table, self->set.alloc * sizeof(mp_obj_t));
@@ -579,8 +578,7 @@ const mp_obj_type_t mp_type_frozenset = {
 #endif
 
 mp_obj_t mp_obj_new_set(size_t n_args, mp_obj_t *items) {
-    mp_obj_set_t *o = m_new_obj(mp_obj_set_t);
-    o->base.type = &mp_type_set;
+    mp_obj_set_t *o = mp_obj_malloc(mp_obj_set_t, &mp_type_set);
     mp_set_init(&o->set, n_args);
     for (size_t i = 0; i < n_args; i++) {
         mp_set_lookup(&o->set, items[i], MP_MAP_LOOKUP_ADD_IF_NOT_FOUND);
diff --git a/python/src/py/objslice.c b/python/src/py/objslice.c
index c65c30601..0b34516c1 100644
--- a/python/src/py/objslice.c
+++ b/python/src/py/objslice.c
@@ -104,8 +104,7 @@ const mp_obj_type_t mp_type_slice = {
 };
 
 mp_obj_t mp_obj_new_slice(mp_obj_t ostart, mp_obj_t ostop, mp_obj_t ostep) {
-    mp_obj_slice_t *o = m_new_obj(mp_obj_slice_t);
-    o->base.type = &mp_type_slice;
+    mp_obj_slice_t *o = mp_obj_malloc(mp_obj_slice_t, &mp_type_slice);
     o->start = ostart;
     o->stop = ostop;
     o->step = ostep;
diff --git a/python/src/py/objstr.c b/python/src/py/objstr.c
index 7d7f0e1df..6e5a316d7 100644
--- a/python/src/py/objstr.c
+++ b/python/src/py/objstr.c
@@ -1163,7 +1163,7 @@ STATIC vstr_t mp_obj_str_format_helper(const char *str, const char *top, int *ar
                 s++;
             }
             if (*s == '0') {
-                if (!align) {
+                if (!align && arg_looks_numeric(arg)) {
                     align = '=';
                 }
                 if (!fill) {
@@ -2026,8 +2026,7 @@ const mp_obj_str_t mp_const_empty_bytes_obj = {{&mp_type_bytes}, 0, 0, (const by
 // the data is copied across.  This function should only be used if the type is bytes,
 // or if the type is str and the string data is known to be not interned.
 mp_obj_t mp_obj_new_str_copy(const mp_obj_type_t *type, const byte *data, size_t len) {
-    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
-    o->base.type = type;
+    mp_obj_str_t *o = mp_obj_malloc(mp_obj_str_t, type);
     o->len = len;
     if (data) {
         o->hash = qstr_compute_hash(data, len);
@@ -2070,8 +2069,7 @@ mp_obj_t mp_obj_new_str_from_vstr(const mp_obj_type_t *type, vstr_t *vstr) {
     }
 
     // make a new str/bytes object
-    mp_obj_str_t *o = m_new_obj(mp_obj_str_t);
-    o->base.type = type;
+    mp_obj_str_t *o = mp_obj_malloc(mp_obj_str_t, type);
     o->len = vstr->len;
     o->hash = qstr_compute_hash((byte *)vstr->buf, vstr->len);
     if (vstr->len + 1 == vstr->alloc) {
diff --git a/python/src/py/objstringio.c b/python/src/py/objstringio.c
index ef942e74e..8b6c7531d 100644
--- a/python/src/py/objstringio.c
+++ b/python/src/py/objstringio.c
@@ -177,8 +177,7 @@ STATIC mp_obj_t stringio___exit__(size_t n_args, const mp_obj_t *args) {
 STATIC MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(stringio___exit___obj, 4, 4, stringio___exit__);
 
 STATIC mp_obj_stringio_t *stringio_new(const mp_obj_type_t *type) {
-    mp_obj_stringio_t *o = m_new_obj(mp_obj_stringio_t);
-    o->base.type = type;
+    mp_obj_stringio_t *o = mp_obj_malloc(mp_obj_stringio_t, type);
     o->pos = 0;
     o->ref_obj = MP_OBJ_NULL;
     return o;
diff --git a/python/src/py/objtuple.c b/python/src/py/objtuple.c
index 67d7bc356..e0cec8447 100644
--- a/python/src/py/objtuple.c
+++ b/python/src/py/objtuple.c
@@ -243,8 +243,7 @@ mp_obj_t mp_obj_new_tuple(size_t n, const mp_obj_t *items) {
     if (n == 0) {
         return mp_const_empty_tuple;
     }
-    mp_obj_tuple_t *o = m_new_obj_var(mp_obj_tuple_t, mp_obj_t, n);
-    o->base.type = &mp_type_tuple;
+    mp_obj_tuple_t *o = mp_obj_malloc_var(mp_obj_tuple_t, mp_obj_t, n, &mp_type_tuple);
     o->len = n;
     if (items) {
         for (size_t i = 0; i < n; i++) {
diff --git a/python/src/py/objtype.c b/python/src/py/objtype.c
index 508bab99d..37c1e3bd2 100644
--- a/python/src/py/objtype.c
+++ b/python/src/py/objtype.c
@@ -99,8 +99,7 @@ STATIC
 mp_obj_instance_t *mp_obj_new_instance(const mp_obj_type_t *class, const mp_obj_type_t **native_base) {
     size_t num_native_bases = instance_count_native_bases(class, native_base);
     assert(num_native_bases < 2);
-    mp_obj_instance_t *o = m_new_obj_var(mp_obj_instance_t, mp_obj_t, num_native_bases);
-    o->base.type = class;
+    mp_obj_instance_t *o = mp_obj_malloc_var(mp_obj_instance_t, mp_obj_t, num_native_bases, class);
     mp_map_init(&o->members, 0);
     // Initialise the native base-class slot (should be 1 at most) with a valid
     // object.  It doesn't matter which object, so long as it can be uniquely
@@ -549,6 +548,7 @@ retry:;
     } else if (dest[0] != MP_OBJ_NULL) {
         dest[2] = rhs_in;
         res = mp_call_method_n_kw(1, 0, dest);
+        res = op == MP_BINARY_OP_CONTAINS ? mp_obj_new_bool(mp_obj_is_true(res)) : res;
     } else {
         // If this was an inplace method, fallback to normal method
         // https://docs.python.org/3/reference/datamodel.html#object.__iadd__ :
@@ -579,6 +579,7 @@ STATIC void mp_obj_instance_load_attr(mp_obj_t self_in, qstr attr, mp_obj_t *des
     assert(mp_obj_is_instance_type(mp_obj_get_type(self_in)));
     mp_obj_instance_t *self = MP_OBJ_TO_PTR(self_in);
 
+    // Note: This is fast-path'ed in the VM for the MP_BC_LOAD_ATTR operation.
     mp_map_elem_t *elem = mp_map_lookup(&self->members, MP_OBJ_NEW_QSTR(attr), MP_MAP_LOOKUP);
     if (elem != NULL) {
         // object member, always treated as a value
diff --git a/python/src/py/objzip.c b/python/src/py/objzip.c
index 4abc917c3..81fa1d587 100644
--- a/python/src/py/objzip.c
+++ b/python/src/py/objzip.c
@@ -39,8 +39,7 @@ typedef struct _mp_obj_zip_t {
 STATIC mp_obj_t zip_make_new(const mp_obj_type_t *type, size_t n_args, size_t n_kw, const mp_obj_t *args) {
     mp_arg_check_num(n_args, n_kw, 0, MP_OBJ_FUN_ARGS_MAX, false);
 
-    mp_obj_zip_t *o = m_new_obj_var(mp_obj_zip_t, mp_obj_t, n_args);
-    o->base.type = type;
+    mp_obj_zip_t *o = mp_obj_malloc_var(mp_obj_zip_t, mp_obj_t, n_args, type);
     o->n_iters = n_args;
     for (size_t i = 0; i < n_args; i++) {
         o->iters[i] = mp_getiter(args[i], NULL);
diff --git a/python/src/py/parse.c b/python/src/py/parse.c
index ae3fa8ea6..14f5f6c10 100644
--- a/python/src/py/parse.c
+++ b/python/src/py/parse.c
@@ -291,6 +291,16 @@ STATIC void *parser_alloc(parser_t *parser, size_t num_bytes) {
     return ret;
 }
 
+#if MICROPY_COMP_CONST_TUPLE
+STATIC void parser_free_parse_node_struct(parser_t *parser, mp_parse_node_struct_t *pns) {
+    mp_parse_chunk_t *chunk = parser->cur_chunk;
+    if (chunk->data <= (byte *)pns && (byte *)pns < chunk->data + chunk->union_.used) {
+        size_t num_bytes = sizeof(mp_parse_node_struct_t) + sizeof(mp_parse_node_t) * MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
+        chunk->union_.used -= num_bytes;
+    }
+}
+#endif
+
 STATIC void push_rule(parser_t *parser, size_t src_line, uint8_t rule_id, size_t arg_i) {
     if (parser->rule_stack_top >= parser->rule_stack_alloc) {
         rule_stack_t *rs = m_renew(rule_stack_t, parser->rule_stack, parser->rule_stack_alloc, parser->rule_stack_alloc + MICROPY_ALLOC_PARSE_RULE_INC);
@@ -317,6 +327,13 @@ STATIC uint8_t pop_rule(parser_t *parser, size_t *arg_i, size_t *src_line) {
     return rule_id;
 }
 
+#if MICROPY_COMP_CONST_TUPLE
+STATIC uint8_t peek_rule(parser_t *parser, size_t n) {
+    assert(parser->rule_stack_top > n);
+    return parser->rule_stack[parser->rule_stack_top - 1 - n].rule_id;
+}
+#endif
+
 bool mp_parse_node_is_const_false(mp_parse_node_t pn) {
     return MP_PARSE_NODE_IS_TOKEN_KIND(pn, MP_TOKEN_KW_FALSE)
            || (MP_PARSE_NODE_IS_SMALL_INT(pn) && MP_PARSE_NODE_LEAF_SMALL_INT(pn) == 0);
@@ -333,18 +350,83 @@ bool mp_parse_node_get_int_maybe(mp_parse_node_t pn, mp_obj_t *o) {
         return true;
     } else if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, RULE_const_object)) {
         mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
-        #if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D
-        // nodes are 32-bit pointers, but need to extract 64-bit object
-        *o = (uint64_t)pns->nodes[0] | ((uint64_t)pns->nodes[1] << 32);
-        #else
-        *o = (mp_obj_t)pns->nodes[0];
-        #endif
+        *o = mp_parse_node_extract_const_object(pns);
         return mp_obj_is_int(*o);
     } else {
         return false;
     }
 }
 
+#if MICROPY_COMP_CONST_TUPLE || MICROPY_COMP_CONST
+STATIC bool mp_parse_node_is_const(mp_parse_node_t pn) {
+    if (MP_PARSE_NODE_IS_SMALL_INT(pn)) {
+        // Small integer.
+        return true;
+    } else if (MP_PARSE_NODE_IS_LEAF(pn)) {
+        // Possible str, or constant literal.
+        uintptr_t kind = MP_PARSE_NODE_LEAF_KIND(pn);
+        if (kind == MP_PARSE_NODE_STRING) {
+            return true;
+        } else if (kind == MP_PARSE_NODE_TOKEN) {
+            uintptr_t arg = MP_PARSE_NODE_LEAF_ARG(pn);
+            return arg == MP_TOKEN_KW_NONE
+                   || arg == MP_TOKEN_KW_FALSE
+                   || arg == MP_TOKEN_KW_TRUE
+                   || arg == MP_TOKEN_ELLIPSIS;
+        }
+    } else if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, RULE_const_object)) {
+        // Constant object.
+        return true;
+    } else if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, RULE_atom_paren)) {
+        // Possible empty tuple.
+        mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
+        return MP_PARSE_NODE_IS_NULL(pns->nodes[0]);
+    }
+    return false;
+}
+
+STATIC mp_obj_t mp_parse_node_convert_to_obj(mp_parse_node_t pn) {
+    assert(mp_parse_node_is_const(pn));
+    if (MP_PARSE_NODE_IS_SMALL_INT(pn)) {
+        mp_int_t arg = MP_PARSE_NODE_LEAF_SMALL_INT(pn);
+        #if MICROPY_DYNAMIC_COMPILER
+        mp_uint_t sign_mask = -((mp_uint_t)1 << (mp_dynamic_compiler.small_int_bits - 1));
+        if (!((arg & sign_mask) == 0 || (arg & sign_mask) == sign_mask)) {
+            // Integer doesn't fit in a small-int, so create a multi-precision int object.
+            return mp_obj_new_int_from_ll(arg);
+        }
+        #endif
+        return MP_OBJ_NEW_SMALL_INT(arg);
+    } else if (MP_PARSE_NODE_IS_LEAF(pn)) {
+        uintptr_t kind = MP_PARSE_NODE_LEAF_KIND(pn);
+        uintptr_t arg = MP_PARSE_NODE_LEAF_ARG(pn);
+        if (kind == MP_PARSE_NODE_STRING) {
+            return MP_OBJ_NEW_QSTR(arg);
+        } else {
+            assert(MP_PARSE_NODE_LEAF_KIND(pn) == MP_PARSE_NODE_TOKEN);
+            switch (arg) {
+                case MP_TOKEN_KW_NONE:
+                    return mp_const_none;
+                case MP_TOKEN_KW_FALSE:
+                    return mp_const_false;
+                case MP_TOKEN_KW_TRUE:
+                    return mp_const_true;
+                default:
+                    assert(arg == MP_TOKEN_ELLIPSIS);
+                    return MP_OBJ_FROM_PTR(&mp_const_ellipsis_obj);
+            }
+        }
+    } else if (MP_PARSE_NODE_IS_STRUCT_KIND(pn, RULE_const_object)) {
+        mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
+        return mp_parse_node_extract_const_object(pns);
+    } else {
+        assert(MP_PARSE_NODE_IS_STRUCT_KIND(pn, RULE_atom_paren));
+        assert(MP_PARSE_NODE_IS_NULL(((mp_parse_node_struct_t *)pn)->nodes[0]));
+        return mp_const_empty_tuple;
+    }
+}
+#endif
+
 size_t mp_parse_node_extract_list(mp_parse_node_t *pn, size_t pn_kind, mp_parse_node_t **nodes) {
     if (MP_PARSE_NODE_IS_NULL(*pn)) {
         *nodes = NULL;
@@ -388,9 +470,6 @@ void mp_parse_node_print(const mp_print_t *print, mp_parse_node_t pn, size_t ind
             case MP_PARSE_NODE_STRING:
                 mp_printf(print, "str(%s)\n", qstr_str(arg));
                 break;
-            case MP_PARSE_NODE_BYTES:
-                mp_printf(print, "bytes(%s)\n", qstr_str(arg));
-                break;
             default:
                 assert(MP_PARSE_NODE_LEAF_KIND(pn) == MP_PARSE_NODE_TOKEN);
                 mp_printf(print, "tok(%u)\n", (uint)arg);
@@ -400,11 +479,14 @@ void mp_parse_node_print(const mp_print_t *print, mp_parse_node_t pn, size_t ind
         // node must be a mp_parse_node_struct_t
         mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
         if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_const_object) {
+            mp_obj_t obj = mp_parse_node_extract_const_object(pns);
             #if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D
-            mp_printf(print, "literal const(%016llx)\n", (uint64_t)pns->nodes[0] | ((uint64_t)pns->nodes[1] << 32));
+            mp_printf(print, "literal const(%016llx)=", obj);
             #else
-            mp_printf(print, "literal const(%p)\n", (mp_obj_t)pns->nodes[0]);
+            mp_printf(print, "literal const(%p)=", obj);
             #endif
+            mp_obj_print_helper(print, obj, PRINT_REPR);
+            mp_printf(print, "\n");
         } else {
             size_t n = MP_PARSE_NODE_STRUCT_NUM_NODES(pns);
             #if MICROPY_DEBUG_PARSE_RULE_NAME
@@ -463,16 +545,28 @@ STATIC mp_parse_node_t make_node_const_object(parser_t *parser, size_t src_line,
     return (mp_parse_node_t)pn;
 }
 
-STATIC mp_parse_node_t mp_parse_node_new_small_int_checked(parser_t *parser, mp_obj_t o_val) {
-    (void)parser;
-    mp_int_t val = MP_OBJ_SMALL_INT_VALUE(o_val);
-    #if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D
-    // A parse node is only 32-bits and the small-int value must fit in 31-bits
-    if (((val ^ (val << 1)) & 0xffffffff80000000) != 0) {
-        return make_node_const_object(parser, 0, o_val);
+// Create a parse node representing a constant object, possibly optimising the case of
+// an integer, by putting the (small) integer value directly in the parse node itself.
+STATIC mp_parse_node_t make_node_const_object_optimised(parser_t *parser, size_t src_line, mp_obj_t obj) {
+    if (mp_obj_is_small_int(obj)) {
+        mp_int_t val = MP_OBJ_SMALL_INT_VALUE(obj);
+        #if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D
+        // A parse node is only 32-bits and the small-int value must fit in 31-bits
+        if (((val ^ (val << 1)) & 0xffffffff80000000) != 0) {
+            return make_node_const_object(parser, src_line, obj);
+        }
+        #endif
+        #if MICROPY_DYNAMIC_COMPILER
+        // Check that the integer value fits in target runtime's small-int
+        mp_uint_t sign_mask = -((mp_uint_t)1 << (mp_dynamic_compiler.small_int_bits - 1));
+        if (!((val & sign_mask) == 0 || (val & sign_mask) == sign_mask)) {
+            return make_node_const_object(parser, src_line, obj);
+        }
+        #endif
+        return mp_parse_node_new_small_int(val);
+    } else {
+        return make_node_const_object(parser, src_line, obj);
     }
-    #endif
-    return mp_parse_node_new_small_int(val);
 }
 
 STATIC void push_result_token(parser_t *parser, uint8_t rule_id) {
@@ -485,11 +579,7 @@ STATIC void push_result_token(parser_t *parser, uint8_t rule_id) {
         mp_map_elem_t *elem;
         if (rule_id == RULE_atom
             && (elem = mp_map_lookup(&parser->consts, MP_OBJ_NEW_QSTR(id), MP_MAP_LOOKUP)) != NULL) {
-            if (mp_obj_is_small_int(elem->value)) {
-                pn = mp_parse_node_new_small_int_checked(parser, elem->value);
-            } else {
-                pn = make_node_const_object(parser, lex->tok_line, elem->value);
-            }
+            pn = make_node_const_object_optimised(parser, lex->tok_line, elem->value);
         } else {
             pn = mp_parse_node_new_leaf(MP_PARSE_NODE_ID, id);
         }
@@ -499,16 +589,12 @@ STATIC void push_result_token(parser_t *parser, uint8_t rule_id) {
         #endif
     } else if (lex->tok_kind == MP_TOKEN_INTEGER) {
         mp_obj_t o = mp_parse_num_integer(lex->vstr.buf, lex->vstr.len, 0, lex);
-        if (mp_obj_is_small_int(o)) {
-            pn = mp_parse_node_new_small_int_checked(parser, o);
-        } else {
-            pn = make_node_const_object(parser, lex->tok_line, o);
-        }
+        pn = make_node_const_object_optimised(parser, lex->tok_line, o);
     } else if (lex->tok_kind == MP_TOKEN_FLOAT_OR_IMAG) {
         mp_obj_t o = mp_parse_num_decimal(lex->vstr.buf, lex->vstr.len, true, false, lex);
         pn = make_node_const_object(parser, lex->tok_line, o);
-    } else if (lex->tok_kind == MP_TOKEN_STRING || lex->tok_kind == MP_TOKEN_BYTES) {
-        // Don't automatically intern all strings/bytes.  doc strings (which are usually large)
+    } else if (lex->tok_kind == MP_TOKEN_STRING) {
+        // Don't automatically intern all strings.  Doc strings (which are usually large)
         // will be discarded by the compiler, and so we shouldn't intern them.
         qstr qst = MP_QSTRnull;
         if (lex->vstr.len <= MICROPY_ALLOC_PARSE_INTERN_STRING_LEN) {
@@ -520,14 +606,16 @@ STATIC void push_result_token(parser_t *parser, uint8_t rule_id) {
         }
         if (qst != MP_QSTRnull) {
             // qstr exists, make a leaf node
-            pn = mp_parse_node_new_leaf(lex->tok_kind == MP_TOKEN_STRING ? MP_PARSE_NODE_STRING : MP_PARSE_NODE_BYTES, qst);
+            pn = mp_parse_node_new_leaf(MP_PARSE_NODE_STRING, qst);
         } else {
-            // not interned, make a node holding a pointer to the string/bytes object
-            mp_obj_t o = mp_obj_new_str_copy(
-                lex->tok_kind == MP_TOKEN_STRING ? &mp_type_str : &mp_type_bytes,
-                (const byte *)lex->vstr.buf, lex->vstr.len);
+            // not interned, make a node holding a pointer to the string object
+            mp_obj_t o = mp_obj_new_str_copy(&mp_type_str, (const byte *)lex->vstr.buf, lex->vstr.len);
             pn = make_node_const_object(parser, lex->tok_line, o);
         }
+    } else if (lex->tok_kind == MP_TOKEN_BYTES) {
+        // make a node holding a pointer to the bytes object
+        mp_obj_t o = mp_obj_new_bytes((const byte *)lex->vstr.buf, lex->vstr.len);
+        pn = make_node_const_object(parser, lex->tok_line, o);
     } else {
         pn = mp_parse_node_new_leaf(MP_PARSE_NODE_TOKEN, lex->tok_kind);
     }
@@ -551,6 +639,11 @@ STATIC MP_DEFINE_CONST_MAP(mp_constants_map, mp_constants_table);
 STATIC void push_result_rule(parser_t *parser, size_t src_line, uint8_t rule_id, size_t num_args);
 
 #if MICROPY_COMP_CONST_FOLDING
+#if MICROPY_COMP_CONST_FOLDING_COMPILER_WORKAROUND
+// Some versions of the xtensa-esp32-elf-gcc compiler generate wrong code if this
+// function is static, so provide a hook for them to work around this problem.
+MP_NOINLINE
+#endif
 STATIC bool fold_logical_constants(parser_t *parser, uint8_t rule_id, size_t *num_args) {
     if (rule_id == RULE_or_test
         || rule_id == RULE_and_test) {
@@ -715,14 +808,14 @@ STATIC bool fold_constants(parser_t *parser, uint8_t rule_id, size_t num_args) {
 
                 // get the value
                 mp_parse_node_t pn_value = ((mp_parse_node_struct_t *)((mp_parse_node_struct_t *)pn1)->nodes[1])->nodes[0];
-                mp_obj_t value;
-                if (!mp_parse_node_get_int_maybe(pn_value, &value)) {
+                if (!mp_parse_node_is_const(pn_value)) {
                     mp_obj_t exc = mp_obj_new_exception_msg(&mp_type_SyntaxError,
-                        MP_ERROR_TEXT("constant must be an integer"));
+                        MP_ERROR_TEXT("not a constant"));
                     mp_obj_exception_add_traceback(exc, parser->lexer->source_name,
                         ((mp_parse_node_struct_t *)pn1)->source_line, MP_QSTRnull);
                     nlr_raise(exc);
                 }
+                mp_obj_t value = mp_parse_node_convert_to_obj(pn_value);
 
                 // store the value in the table of dynamic constants
                 mp_map_elem_t *elem = mp_map_lookup(&parser->consts, MP_OBJ_NEW_QSTR(id), MP_MAP_LOOKUP_ADD_IF_NOT_FOUND);
@@ -784,21 +877,71 @@ STATIC bool fold_constants(parser_t *parser, uint8_t rule_id, size_t num_args) {
     for (size_t i = num_args; i > 0; i--) {
         pop_result(parser);
     }
-    if (mp_obj_is_small_int(arg0)) {
-        push_result_node(parser, mp_parse_node_new_small_int_checked(parser, arg0));
-    } else {
-        // TODO reuse memory for parse node struct?
-        push_result_node(parser, make_node_const_object(parser, 0, arg0));
-    }
+    push_result_node(parser, make_node_const_object_optimised(parser, 0, arg0));
 
     return true;
 }
 #endif
 
+#if MICROPY_COMP_CONST_TUPLE
+STATIC bool build_tuple_from_stack(parser_t *parser, size_t src_line, size_t num_args) {
+    for (size_t i = num_args; i > 0;) {
+        mp_parse_node_t pn = peek_result(parser, --i);
+        if (!mp_parse_node_is_const(pn)) {
+            return false;
+        }
+    }
+    mp_obj_tuple_t *tuple = MP_OBJ_TO_PTR(mp_obj_new_tuple(num_args, NULL));
+    for (size_t i = num_args; i > 0;) {
+        mp_parse_node_t pn = pop_result(parser);
+        tuple->items[--i] = mp_parse_node_convert_to_obj(pn);
+        if (MP_PARSE_NODE_IS_STRUCT(pn)) {
+            parser_free_parse_node_struct(parser, (mp_parse_node_struct_t *)pn);
+        }
+    }
+    push_result_node(parser, make_node_const_object(parser, src_line, MP_OBJ_FROM_PTR(tuple)));
+    return true;
+}
+
+STATIC bool build_tuple(parser_t *parser, size_t src_line, uint8_t rule_id, size_t num_args) {
+    if (rule_id == RULE_testlist_comp) {
+        if (peek_rule(parser, 0) == RULE_atom_paren) {
+            // Tuple of the form "(a,)".
+            return build_tuple_from_stack(parser, src_line, num_args);
+        }
+    }
+    if (rule_id == RULE_testlist_comp_3c) {
+        assert(peek_rule(parser, 0) == RULE_testlist_comp_3b);
+        assert(peek_rule(parser, 1) == RULE_testlist_comp);
+        if (peek_rule(parser, 2) == RULE_atom_paren) {
+            // Tuple of the form "(a, b)".
+            if (build_tuple_from_stack(parser, src_line, num_args)) {
+                parser->rule_stack_top -= 2; // discard 2 rules
+                return true;
+            }
+        }
+    }
+    if (rule_id == RULE_testlist_star_expr
+        || rule_id == RULE_testlist
+        || rule_id == RULE_subscriptlist) {
+        // Tuple of the form:
+        //  - x = a, b
+        //  - return a, b
+        //  - for x in a, b: pass
+        //  - x[a, b]
+        return build_tuple_from_stack(parser, src_line, num_args);
+    }
+
+    return false;
+}
+#endif
+
 STATIC void push_result_rule(parser_t *parser, size_t src_line, uint8_t rule_id, size_t num_args) {
-    // optimise away parenthesis around an expression if possible
+    // Simplify and optimise certain rules, to reduce memory usage and simplify the compiler.
     if (rule_id == RULE_atom_paren) {
-        // there should be just 1 arg for this rule
+        // Remove parentheses around a single expression if possible.
+        // This atom_paren rule always has a single argument, and after this
+        // optimisation that argument is either NULL or testlist_comp.
         mp_parse_node_t pn = peek_result(parser, 0);
         if (MP_PARSE_NODE_IS_NULL(pn)) {
             // need to keep parenthesis for ()
@@ -808,6 +951,34 @@ STATIC void push_result_rule(parser_t *parser, size_t src_line, uint8_t rule_id,
             // parenthesis around a single expression, so it's just the expression
             return;
         }
+    } else if (rule_id == RULE_testlist_comp) {
+        // The testlist_comp rule can be the sole argument to either atom_paren
+        // or atom_bracket, for (...) and [...] respectively.
+        assert(num_args == 2);
+        mp_parse_node_t pn = peek_result(parser, 0);
+        if (MP_PARSE_NODE_IS_STRUCT(pn)) {
+            mp_parse_node_struct_t *pns = (mp_parse_node_struct_t *)pn;
+            if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_testlist_comp_3b) {
+                // tuple of one item, with trailing comma
+                pop_result(parser);
+                --num_args;
+            } else if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_testlist_comp_3c) {
+                // tuple of many items, convert testlist_comp_3c to testlist_comp
+                pop_result(parser);
+                assert(pn == peek_result(parser, 0));
+                pns->kind_num_nodes = rule_id | MP_PARSE_NODE_STRUCT_NUM_NODES(pns) << 8;
+                return;
+            } else if (MP_PARSE_NODE_STRUCT_KIND(pns) == RULE_comp_for) {
+                // generator expression
+            } else {
+                // tuple with 2 items
+            }
+        } else {
+            // tuple with 2 items
+        }
+    } else if (rule_id == RULE_testlist_comp_3c) {
+        // steal first arg of outer testlist_comp rule
+        ++num_args;
     }
 
     #if MICROPY_COMP_CONST_FOLDING
@@ -821,12 +992,23 @@ STATIC void push_result_rule(parser_t *parser, size_t src_line, uint8_t rule_id,
     }
     #endif
 
+    #if MICROPY_COMP_CONST_TUPLE
+    if (build_tuple(parser, src_line, rule_id, num_args)) {
+        // we built a tuple from this rule so return straightaway
+        return;
+    }
+    #endif
+
     mp_parse_node_struct_t *pn = parser_alloc(parser, sizeof(mp_parse_node_struct_t) + sizeof(mp_parse_node_t) * num_args);
     pn->source_line = src_line;
     pn->kind_num_nodes = (rule_id & 0xff) | (num_args << 8);
     for (size_t i = num_args; i > 0; i--) {
         pn->nodes[i - 1] = pop_result(parser);
     }
+    if (rule_id == RULE_testlist_comp_3c) {
+        // need to push something non-null to replace stolen first arg of testlist_comp
+        push_result_node(parser, (mp_parse_node_t)pn);
+    }
     push_result_node(parser, (mp_parse_node_t)pn);
 }
 
diff --git a/python/src/py/parse.h b/python/src/py/parse.h
index a6eb38004..5531e35cb 100644
--- a/python/src/py/parse.h
+++ b/python/src/py/parse.h
@@ -39,15 +39,13 @@ struct _mp_lexer_t;
 //  - xxxx...xx00: pointer to mp_parse_node_struct_t
 //  - xx...xx0010: an identifier; bits 4 and above are the qstr
 //  - xx...xx0110: a string; bits 4 and above are the qstr holding the value
-//  - xx...xx1010: a string of bytes; bits 4 and above are the qstr holding the value
-//  - xx...xx1110: a token; bits 4 and above are mp_token_kind_t
+//  - xx...xx1010: a token; bits 4 and above are mp_token_kind_t
 
 #define MP_PARSE_NODE_NULL      (0)
 #define MP_PARSE_NODE_SMALL_INT (0x1)
 #define MP_PARSE_NODE_ID        (0x02)
 #define MP_PARSE_NODE_STRING    (0x06)
-#define MP_PARSE_NODE_BYTES     (0x0a)
-#define MP_PARSE_NODE_TOKEN     (0x0e)
+#define MP_PARSE_NODE_TOKEN     (0x0a)
 
 typedef uintptr_t mp_parse_node_t; // must be pointer size
 
@@ -79,9 +77,20 @@ typedef struct _mp_parse_node_struct_t {
 static inline mp_parse_node_t mp_parse_node_new_small_int(mp_int_t val) {
     return (mp_parse_node_t)(MP_PARSE_NODE_SMALL_INT | ((mp_uint_t)val << 1));
 }
+
 static inline mp_parse_node_t mp_parse_node_new_leaf(size_t kind, mp_int_t arg) {
     return (mp_parse_node_t)(kind | ((mp_uint_t)arg << 4));
 }
+
+static inline mp_obj_t mp_parse_node_extract_const_object(mp_parse_node_struct_t *pns) {
+    #if MICROPY_OBJ_REPR == MICROPY_OBJ_REPR_D
+    // nodes are 32-bit pointers, but need to extract 64-bit object
+    return (uint64_t)pns->nodes[0] | ((uint64_t)pns->nodes[1] << 32);
+    #else
+    return (mp_obj_t)pns->nodes[0];
+    #endif
+}
+
 bool mp_parse_node_is_const_false(mp_parse_node_t pn);
 bool mp_parse_node_is_const_true(mp_parse_node_t pn);
 bool mp_parse_node_get_int_maybe(mp_parse_node_t pn, mp_obj_t *o);
diff --git a/python/src/py/parsenum.c b/python/src/py/parsenum.c
index 54cd2bf86..1cfe84257 100644
--- a/python/src/py/parsenum.c
+++ b/python/src/py/parsenum.c
@@ -98,7 +98,7 @@ mp_obj_t mp_parse_num_integer(const char *restrict str_, size_t len, int base, m
             break;
         }
 
-        // add next digit and check for overflow
+        // add next digit and check for overflow
         if (mp_small_int_mul_overflow(int_val, base)) {
             goto overflow;
         }
diff --git a/python/src/py/persistentcode.c b/python/src/py/persistentcode.c
index ac523990c..6304d1ff0 100644
--- a/python/src/py/persistentcode.c
+++ b/python/src/py/persistentcode.c
@@ -48,72 +48,6 @@
 #define MPY_FEATURE_ARCH_DYNAMIC MPY_FEATURE_ARCH
 #endif
 
-#if MICROPY_PERSISTENT_CODE_LOAD || (MICROPY_PERSISTENT_CODE_SAVE && !MICROPY_DYNAMIC_COMPILER)
-// The bytecode will depend on the number of bits in a small-int, and
-// this function computes that (could make it a fixed constant, but it
-// would need to be defined in mpconfigport.h).
-STATIC int mp_small_int_bits(void) {
-    mp_int_t i = MP_SMALL_INT_MAX;
-    int n = 1;
-    while (i != 0) {
-        i >>= 1;
-        ++n;
-    }
-    return n;
-}
-#endif
-
-#define QSTR_WINDOW_SIZE (32)
-
-typedef struct _qstr_window_t {
-    uint16_t idx; // indexes the head of the window
-    uint16_t window[QSTR_WINDOW_SIZE];
-} qstr_window_t;
-
-// Push a qstr to the head of the window, and the tail qstr is overwritten
-STATIC void qstr_window_push(qstr_window_t *qw, qstr qst) {
-    qw->idx = (qw->idx + 1) % QSTR_WINDOW_SIZE;
-    qw->window[qw->idx] = qst;
-}
-
-// Pull an existing qstr from within the window to the head of the window
-STATIC qstr qstr_window_pull(qstr_window_t *qw, size_t idx) {
-    qstr qst = qw->window[idx];
-    if (idx > qw->idx) {
-        memmove(&qw->window[idx], &qw->window[idx + 1], (QSTR_WINDOW_SIZE - idx - 1) * sizeof(uint16_t));
-        qw->window[QSTR_WINDOW_SIZE - 1] = qw->window[0];
-        idx = 0;
-    }
-    memmove(&qw->window[idx], &qw->window[idx + 1], (qw->idx - idx) * sizeof(uint16_t));
-    qw->window[qw->idx] = qst;
-    return qst;
-}
-
-#if MICROPY_PERSISTENT_CODE_LOAD
-
-// Access a qstr at the given index, relative to the head of the window (0=head)
-STATIC qstr qstr_window_access(qstr_window_t *qw, size_t idx) {
-    return qstr_window_pull(qw, (qw->idx + QSTR_WINDOW_SIZE - idx) % QSTR_WINDOW_SIZE);
-}
-
-#endif
-
-#if MICROPY_PERSISTENT_CODE_SAVE
-
-// Insert a qstr at the head of the window, either by pulling an existing one or pushing a new one
-STATIC size_t qstr_window_insert(qstr_window_t *qw, qstr qst) {
-    for (size_t idx = 0; idx < QSTR_WINDOW_SIZE; ++idx) {
-        if (qw->window[idx] == qst) {
-            qstr_window_pull(qw, idx);
-            return (qw->idx + QSTR_WINDOW_SIZE - idx) % QSTR_WINDOW_SIZE;
-        }
-    }
-    qstr_window_push(qw, qst);
-    return QSTR_WINDOW_SIZE;
-}
-
-#endif
-
 typedef struct _bytecode_prelude_t {
     uint n_state;
     uint n_exc_stack;
@@ -124,23 +58,6 @@ typedef struct _bytecode_prelude_t {
     uint code_info_size;
 } bytecode_prelude_t;
 
-// ip will point to start of opcodes
-// return value will point to simple_name, source_file qstrs
-STATIC byte *extract_prelude(const byte **ip, bytecode_prelude_t *prelude) {
-    MP_BC_PRELUDE_SIG_DECODE(*ip);
-    prelude->n_state = n_state;
-    prelude->n_exc_stack = n_exc_stack;
-    prelude->scope_flags = scope_flags;
-    prelude->n_pos_args = n_pos_args;
-    prelude->n_kwonly_args = n_kwonly_args;
-    prelude->n_def_pos_args = n_def_pos_args;
-    MP_BC_PRELUDE_SIZE_DECODE(*ip);
-    byte *ip_info = (byte *)*ip;
-    *ip += n_info;
-    *ip += n_cell;
-    return ip_info;
-}
-
 #endif // MICROPY_PERSISTENT_CODE_LOAD || MICROPY_PERSISTENT_CODE_SAVE
 
 #if MICROPY_PERSISTENT_CODE_LOAD
@@ -148,47 +65,17 @@ STATIC byte *extract_prelude(const byte **ip, bytecode_prelude_t *prelude) {
 #include "py/parsenum.h"
 
 STATIC int read_byte(mp_reader_t *reader);
-STATIC size_t read_uint(mp_reader_t *reader, byte **out);
+STATIC size_t read_uint(mp_reader_t *reader);
 
 #if MICROPY_EMIT_MACHINE_CODE
 
 typedef struct _reloc_info_t {
     mp_reader_t *reader;
-    mp_uint_t *const_table;
+    mp_module_context_t *context;
+    uint8_t *rodata;
+    uint8_t *bss;
 } reloc_info_t;
 
-#if MICROPY_EMIT_THUMB
-STATIC void asm_thumb_rewrite_mov(uint8_t *pc, uint16_t val) {
-    // high part
-    *(uint16_t *)pc = (*(uint16_t *)pc & 0xfbf0) | (val >> 1 & 0x0400) | (val >> 12);
-    // low part
-    *(uint16_t *)(pc + 2) = (*(uint16_t *)(pc + 2) & 0x0f00) | (val << 4 & 0x7000) | (val & 0x00ff);
-
-}
-#endif
-
-STATIC void arch_link_qstr(uint8_t *pc, bool is_obj, qstr qst) {
-    mp_uint_t val = qst;
-    if (is_obj) {
-        val = (mp_uint_t)MP_OBJ_NEW_QSTR(qst);
-    }
-    #if MICROPY_EMIT_X86 || MICROPY_EMIT_X64 || MICROPY_EMIT_ARM || MICROPY_EMIT_XTENSA || MICROPY_EMIT_XTENSAWIN
-    pc[0] = val & 0xff;
-    pc[1] = (val >> 8) & 0xff;
-    pc[2] = (val >> 16) & 0xff;
-    pc[3] = (val >> 24) & 0xff;
-    #elif MICROPY_EMIT_THUMB
-    if (is_obj) {
-        // qstr object, movw and movt
-        asm_thumb_rewrite_mov(pc, val); // movw
-        asm_thumb_rewrite_mov(pc + 4, val >> 16); // movt
-    } else {
-        // qstr number, movw instruction
-        asm_thumb_rewrite_mov(pc, val); // movw
-    }
-    #endif
-}
-
 void mp_native_relocate(void *ri_in, uint8_t *text, uintptr_t reloc_text) {
     // Relocate native code
     reloc_info_t *ri = ri_in;
@@ -197,13 +84,13 @@ void mp_native_relocate(void *ri_in, uint8_t *text, uintptr_t reloc_text) {
     while ((op = read_byte(ri->reader)) != 0xff) {
         if (op & 1) {
             // Point to new location to make adjustments
-            size_t addr = read_uint(ri->reader, NULL);
+            size_t addr = read_uint(ri->reader);
             if ((addr & 1) == 0) {
                 // Point to somewhere in text
                 addr_to_adjust = &((uintptr_t *)text)[addr >> 1];
             } else {
                 // Point to somewhere in rodata
-                addr_to_adjust = &((uintptr_t *)ri->const_table[1])[addr >> 1];
+                addr_to_adjust = &((uintptr_t *)ri->rodata)[addr >> 1];
             }
         }
         op >>= 1;
@@ -212,22 +99,31 @@ void mp_native_relocate(void *ri_in, uint8_t *text, uintptr_t reloc_text) {
         if (op <= 5) {
             if (op & 1) {
                 // Read in number of adjustments to make
-                n = read_uint(ri->reader, NULL);
+                n = read_uint(ri->reader);
             }
             op >>= 1;
             if (op == 0) {
                 // Destination is text
                 dest = reloc_text;
+            } else if (op == 1) {
+                // Destination is rodata
+                dest = (uintptr_t)ri->rodata;
             } else {
-                // Destination is rodata (op=1) or bss (op=1 if no rodata, else op=2)
-                dest = ri->const_table[op];
+                // Destination is bss
+                dest = (uintptr_t)ri->bss;
             }
         } else if (op == 6) {
+            // Destination is qstr_table
+            dest = (uintptr_t)ri->context->constants.qstr_table;
+        } else if (op == 7) {
+            // Destination is obj_table
+            dest = (uintptr_t)ri->context->constants.obj_table;
+        } else if (op == 8) {
             // Destination is mp_fun_table itself
             dest = (uintptr_t)&mp_fun_table;
         } else {
             // Destination is an entry in mp_fun_table
-            dest = ((uintptr_t *)&mp_fun_table)[op - 7];
+            dest = ((uintptr_t *)&mp_fun_table)[op - 9];
         }
         while (n--) {
             *addr_to_adjust++ += dest;
@@ -247,14 +143,10 @@ STATIC void read_bytes(mp_reader_t *reader, byte *buf, size_t len) {
     }
 }
 
-STATIC size_t read_uint(mp_reader_t *reader, byte **out) {
+STATIC size_t read_uint(mp_reader_t *reader) {
     size_t unum = 0;
     for (;;) {
         byte b = reader->readbyte(reader->data);
-        if (out != NULL) {
-            **out = b;
-            ++*out;
-        }
         unum = (unum << 7) | (b & 0x7f);
         if ((b & 0x80) == 0) {
             break;
@@ -263,97 +155,69 @@ STATIC size_t read_uint(mp_reader_t *reader, byte **out) {
     return unum;
 }
 
-STATIC qstr load_qstr(mp_reader_t *reader, qstr_window_t *qw) {
-    size_t len = read_uint(reader, NULL);
-    if (len == 0) {
-        // static qstr
-        return read_byte(reader);
-    }
+STATIC qstr load_qstr(mp_reader_t *reader) {
+    size_t len = read_uint(reader);
     if (len & 1) {
-        // qstr in window
-        return qstr_window_access(qw, len >> 1);
+        // static qstr
+        return len >> 1;
     }
     len >>= 1;
     char *str = m_new(char, len);
     read_bytes(reader, (byte *)str, len);
+    read_byte(reader); // read and discard null terminator
     qstr qst = qstr_from_strn(str, len);
     m_del(char, str, len);
-    qstr_window_push(qw, qst);
     return qst;
 }
 
 STATIC mp_obj_t load_obj(mp_reader_t *reader) {
     byte obj_type = read_byte(reader);
-    if (obj_type == 'e') {
+    #if MICROPY_EMIT_MACHINE_CODE
+    if (obj_type == MP_PERSISTENT_OBJ_FUN_TABLE) {
+        return MP_OBJ_FROM_PTR(&mp_fun_table);
+    } else
+    #endif
+    if (obj_type == MP_PERSISTENT_OBJ_NONE) {
+        return mp_const_none;
+    } else if (obj_type == MP_PERSISTENT_OBJ_FALSE) {
+        return mp_const_false;
+    } else if (obj_type == MP_PERSISTENT_OBJ_TRUE) {
+        return mp_const_true;
+    } else if (obj_type == MP_PERSISTENT_OBJ_ELLIPSIS) {
         return MP_OBJ_FROM_PTR(&mp_const_ellipsis_obj);
     } else {
-        size_t len = read_uint(reader, NULL);
+        size_t len = read_uint(reader);
+        if (len == 0 && obj_type == MP_PERSISTENT_OBJ_BYTES) {
+            read_byte(reader); // skip null terminator
+            return mp_const_empty_bytes;
+        } else if (obj_type == MP_PERSISTENT_OBJ_TUPLE) {
+            mp_obj_tuple_t *tuple = MP_OBJ_TO_PTR(mp_obj_new_tuple(len, NULL));
+            for (size_t i = 0; i < len; ++i) {
+                tuple->items[i] = load_obj(reader);
+            }
+            return MP_OBJ_FROM_PTR(tuple);
+        }
         vstr_t vstr;
         vstr_init_len(&vstr, len);
         read_bytes(reader, (byte *)vstr.buf, len);
-        if (obj_type == 's' || obj_type == 'b') {
-            return mp_obj_new_str_from_vstr(obj_type == 's' ? &mp_type_str : &mp_type_bytes, &vstr);
-        } else if (obj_type == 'i') {
+        if (obj_type == MP_PERSISTENT_OBJ_STR || obj_type == MP_PERSISTENT_OBJ_BYTES) {
+            read_byte(reader); // skip null terminator
+            return mp_obj_new_str_from_vstr(obj_type == MP_PERSISTENT_OBJ_STR ? &mp_type_str : &mp_type_bytes, &vstr);
+        } else if (obj_type == MP_PERSISTENT_OBJ_INT) {
             return mp_parse_num_integer(vstr.buf, vstr.len, 10, NULL);
         } else {
-            assert(obj_type == 'f' || obj_type == 'c');
-            return mp_parse_num_decimal(vstr.buf, vstr.len, obj_type == 'c', false, NULL);
+            assert(obj_type == MP_PERSISTENT_OBJ_FLOAT || obj_type == MP_PERSISTENT_OBJ_COMPLEX);
+            return mp_parse_num_decimal(vstr.buf, vstr.len, obj_type == MP_PERSISTENT_OBJ_COMPLEX, false, NULL);
         }
     }
 }
 
-STATIC void load_prelude_qstrs(mp_reader_t *reader, qstr_window_t *qw, byte *ip) {
-    qstr simple_name = load_qstr(reader, qw);
-    ip[0] = simple_name;
-    ip[1] = simple_name >> 8;
-    qstr source_file = load_qstr(reader, qw);
-    ip[2] = source_file;
-    ip[3] = source_file >> 8;
-}
-
-STATIC void load_prelude(mp_reader_t *reader, qstr_window_t *qw, byte **ip, bytecode_prelude_t *prelude) {
-    // Read in the prelude header
-    byte *ip_read = *ip;
-    read_uint(reader, &ip_read);                    // read in n_state/etc (is effectively a var-uint)
-    read_uint(reader, &ip_read);                    // read in n_info/n_cell (is effectively a var-uint)
-
-    // Prelude header has been read into *ip, now decode and extract values from it
-    extract_prelude((const byte **)ip, prelude);
-
-    // Load qstrs in prelude
-    load_prelude_qstrs(reader, qw, ip_read);
-    ip_read += 4;
-
-    // Read remaining code info
-    read_bytes(reader, ip_read, *ip - ip_read);
-}
-
-STATIC void load_bytecode(mp_reader_t *reader, qstr_window_t *qw, byte *ip, byte *ip_top) {
-    while (ip < ip_top) {
-        *ip = read_byte(reader);
-        size_t sz;
-        uint f = mp_opcode_format(ip, &sz, false);
-        ++ip;
-        --sz;
-        if (f == MP_BC_FORMAT_QSTR) {
-            qstr qst = load_qstr(reader, qw);
-            *ip++ = qst;
-            *ip++ = qst >> 8;
-            sz -= 2;
-        } else if (f == MP_BC_FORMAT_VAR_UINT) {
-            while ((*ip++ = read_byte(reader)) & 0x80) {
-            }
-        }
-        read_bytes(reader, ip, sz);
-        ip += sz;
-    }
-}
-
-STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader, qstr_window_t *qw) {
+STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader, mp_module_context_t *context) {
     // Load function kind and data length
-    size_t kind_len = read_uint(reader, NULL);
+    size_t kind_len = read_uint(reader);
     int kind = (kind_len & 3) + MP_CODE_BYTECODE;
-    size_t fun_data_len = kind_len >> 2;
+    bool has_children = !!(kind_len & 4);
+    size_t fun_data_len = kind_len >> 3;
 
     #if !MICROPY_EMIT_MACHINE_CODE
     if (kind != MP_CODE_BYTECODE) {
@@ -362,23 +226,18 @@ STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader, qstr_window_t *qw) {
     #endif
 
     uint8_t *fun_data = NULL;
-    bytecode_prelude_t prelude = {0};
     #if MICROPY_EMIT_MACHINE_CODE
     size_t prelude_offset = 0;
-    mp_uint_t type_sig = 0;
-    size_t n_qstr_link = 0;
+    mp_uint_t native_scope_flags = 0;
+    mp_uint_t native_n_pos_args = 0;
+    mp_uint_t native_type_sig = 0;
     #endif
 
     if (kind == MP_CODE_BYTECODE) {
         // Allocate memory for the bytecode
         fun_data = m_new(uint8_t, fun_data_len);
-
-        // Load prelude
-        byte *ip = fun_data;
-        load_prelude(reader, qw, &ip, &prelude);
-
         // Load bytecode
-        load_bytecode(reader, qw, ip, fun_data + fun_data_len);
+        read_bytes(reader, fun_data, fun_data_len);
 
     #if MICROPY_EMIT_MACHINE_CODE
     } else {
@@ -387,135 +246,104 @@ STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader, qstr_window_t *qw) {
         MP_PLAT_ALLOC_EXEC(fun_data_len, (void **)&fun_data, &fun_alloc);
         read_bytes(reader, fun_data, fun_data_len);
 
-        if (kind == MP_CODE_NATIVE_PY || kind == MP_CODE_NATIVE_VIPER) {
-            // Parse qstr link table and link native code
-            n_qstr_link = read_uint(reader, NULL);
-            for (size_t i = 0; i < n_qstr_link; ++i) {
-                size_t off = read_uint(reader, NULL);
-                qstr qst = load_qstr(reader, qw);
-                uint8_t *dest = fun_data + (off >> 2);
-                if ((off & 3) == 0) {
-                    // Generic 16-bit link
-                    dest[0] = qst & 0xff;
-                    dest[1] = (qst >> 8) & 0xff;
-                } else if ((off & 3) == 3) {
-                    // Generic, aligned qstr-object link
-                    *(mp_obj_t *)dest = MP_OBJ_NEW_QSTR(qst);
-                } else {
-                    // Architecture-specific link
-                    arch_link_qstr(dest, (off & 3) == 2, qst);
-                }
-            }
-        }
-
         if (kind == MP_CODE_NATIVE_PY) {
-            // Extract prelude for later use
-            prelude_offset = read_uint(reader, NULL);
+            // Read prelude offset within fun_data, and extract scope flags.
+            prelude_offset = read_uint(reader);
             const byte *ip = fun_data + prelude_offset;
-            byte *ip_info = extract_prelude(&ip, &prelude);
-            // Load qstrs in prelude
-            load_prelude_qstrs(reader, qw, ip_info);
+            MP_BC_PRELUDE_SIG_DECODE(ip);
+            native_scope_flags = scope_flags;
         } else {
-            // Load basic scope info for viper and asm
-            prelude.scope_flags = read_uint(reader, NULL);
-            prelude.n_pos_args = 0;
-            prelude.n_kwonly_args = 0;
+            // Load basic scope info for viper and asm.
+            native_scope_flags = read_uint(reader);
             if (kind == MP_CODE_NATIVE_ASM) {
-                prelude.n_pos_args = read_uint(reader, NULL);
-                type_sig = read_uint(reader, NULL);
+                native_n_pos_args = read_uint(reader);
+                native_type_sig = read_uint(reader);
             }
         }
     #endif
     }
 
-    size_t n_obj = 0;
-    size_t n_raw_code = 0;
-    mp_uint_t *const_table = NULL;
+    size_t n_children = 0;
+    mp_raw_code_t **children = NULL;
 
-    if (kind != MP_CODE_NATIVE_ASM) {
-        // Load constant table for bytecode, native and viper
-
-        // Number of entries in constant table
-        n_obj = read_uint(reader, NULL);
-        n_raw_code = read_uint(reader, NULL);
-
-        // Allocate constant table
-        size_t n_alloc = prelude.n_pos_args + prelude.n_kwonly_args + n_obj + n_raw_code;
-        #if MICROPY_EMIT_MACHINE_CODE
-        if (kind != MP_CODE_BYTECODE) {
-            ++n_alloc; // additional entry for mp_fun_table
-            if (prelude.scope_flags & MP_SCOPE_FLAG_VIPERRODATA) {
-                ++n_alloc; // additional entry for rodata
-            }
-            if (prelude.scope_flags & MP_SCOPE_FLAG_VIPERBSS) {
-                ++n_alloc; // additional entry for BSS
-            }
-        }
-        #endif
-
-        const_table = m_new(mp_uint_t, n_alloc);
-        mp_uint_t *ct = const_table;
-
-        // Load function argument names (initial entries in const_table)
-        // (viper has n_pos_args=n_kwonly_args=0 so doesn't load any qstrs here)
-        for (size_t i = 0; i < prelude.n_pos_args + prelude.n_kwonly_args; ++i) {
-            *ct++ = (mp_uint_t)MP_OBJ_NEW_QSTR(load_qstr(reader, qw));
+    #if MICROPY_EMIT_MACHINE_CODE
+    // Load optional BSS/rodata for viper.
+    uint8_t *rodata = NULL;
+    uint8_t *bss = NULL;
+    if (kind == MP_CODE_NATIVE_VIPER) {
+        size_t rodata_size = 0;
+        if (native_scope_flags & MP_SCOPE_FLAG_VIPERRODATA) {
+            rodata_size = read_uint(reader);
         }
 
-        #if MICROPY_EMIT_MACHINE_CODE
-        if (kind != MP_CODE_BYTECODE) {
-            // Populate mp_fun_table entry
-            *ct++ = (mp_uint_t)(uintptr_t)&mp_fun_table;
+        size_t bss_size = 0;
+        if (native_scope_flags & MP_SCOPE_FLAG_VIPERBSS) {
+            bss_size = read_uint(reader);
+        }
 
-            // Allocate and load rodata if needed
-            if (prelude.scope_flags & MP_SCOPE_FLAG_VIPERRODATA) {
-                size_t size = read_uint(reader, NULL);
-                uint8_t *rodata = m_new(uint8_t, size);
-                read_bytes(reader, rodata, size);
-                *ct++ = (uintptr_t)rodata;
+        if (rodata_size + bss_size != 0) {
+            bss_size = (uintptr_t)MP_ALIGN(bss_size, sizeof(uintptr_t));
+            uint8_t *data = m_new0(uint8_t, bss_size + rodata_size);
+            bss = data;
+            rodata = bss + bss_size;
+            if (native_scope_flags & MP_SCOPE_FLAG_VIPERRODATA) {
+                read_bytes(reader, rodata, rodata_size);
             }
 
-            // Allocate BSS if needed
-            if (prelude.scope_flags & MP_SCOPE_FLAG_VIPERBSS) {
-                size_t size = read_uint(reader, NULL);
-                uint8_t *bss = m_new0(uint8_t, size);
-                *ct++ = (uintptr_t)bss;
-            }
+            // Viper code with BSS/rodata should not have any children.
+            // Reuse the children pointer to reference the BSS/rodata
+            // memory so that it is not reclaimed by the GC.
+            assert(!has_children);
+            children = (void *)data;
         }
-        #endif
+    }
+    #endif
 
-        // Load constant objects and raw code children
-        for (size_t i = 0; i < n_obj; ++i) {
-            *ct++ = (mp_uint_t)load_obj(reader);
-        }
-        for (size_t i = 0; i < n_raw_code; ++i) {
-            *ct++ = (mp_uint_t)(uintptr_t)load_raw_code(reader, qw);
+    // Load children if any.
+    if (has_children) {
+        n_children = read_uint(reader);
+        children = m_new(mp_raw_code_t *, n_children + (kind == MP_CODE_NATIVE_PY));
+        for (size_t i = 0; i < n_children; ++i) {
+            children[i] = load_raw_code(reader, context);
         }
     }
 
     // Create raw_code and return it
     mp_raw_code_t *rc = mp_emit_glue_new_raw_code();
     if (kind == MP_CODE_BYTECODE) {
+        const byte *ip = fun_data;
+        MP_BC_PRELUDE_SIG_DECODE(ip);
         // Assign bytecode to raw code object
         mp_emit_glue_assign_bytecode(rc, fun_data,
             #if MICROPY_PERSISTENT_CODE_SAVE || MICROPY_DEBUG_PRINTERS
             fun_data_len,
             #endif
-            const_table,
+            children,
             #if MICROPY_PERSISTENT_CODE_SAVE
-            n_obj, n_raw_code,
+            n_children,
             #endif
-            prelude.scope_flags);
+            scope_flags);
 
     #if MICROPY_EMIT_MACHINE_CODE
     } else {
+        const uint8_t *prelude_ptr;
+        #if MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE
+        if (kind == MP_CODE_NATIVE_PY) {
+            // Executable code cannot be accessed byte-wise on this architecture, so copy
+            // the prelude to a separate memory region that is byte-wise readable.
+            void *buf = fun_data + prelude_offset;
+            size_t n = fun_data_len - prelude_offset;
+            prelude_ptr = memcpy(m_new(uint8_t, n), buf, n);
+        }
+        #endif
+
         // Relocate and commit code to executable address space
-        reloc_info_t ri = {reader, const_table};
+        reloc_info_t ri = {reader, context, rodata, bss};
         #if defined(MP_PLAT_COMMIT_EXEC)
-        void *opt_ri = (prelude.scope_flags & MP_SCOPE_FLAG_VIPERRELOC) ? &ri : NULL;
+        void *opt_ri = (native_scope_flags & MP_SCOPE_FLAG_VIPERRELOC) ? &ri : NULL;
         fun_data = MP_PLAT_COMMIT_EXEC(fun_data, fun_data_len, opt_ri);
         #else
-        if (prelude.scope_flags & MP_SCOPE_FLAG_VIPERRELOC) {
+        if (native_scope_flags & MP_SCOPE_FLAG_VIPERRELOC) {
             #if MICROPY_PERSISTENT_CODE_TRACK_RELOC_CODE
             // If native code needs relocations then it's not guaranteed that a pointer to
             // the head of `buf` (containing the machine code) will be retained for the GC
@@ -532,28 +360,39 @@ STATIC mp_raw_code_t *load_raw_code(mp_reader_t *reader, qstr_window_t *qw) {
         }
         #endif
 
+        if (kind == MP_CODE_NATIVE_PY) {
+            #if !MICROPY_EMIT_NATIVE_PRELUDE_SEPARATE_FROM_MACHINE_CODE
+            prelude_ptr = fun_data + prelude_offset;
+            #endif
+            if (n_children == 0) {
+                children = (void *)prelude_ptr;
+            } else {
+                children[n_children] = (void *)prelude_ptr;
+            }
+        }
+
         // Assign native code to raw code object
         mp_emit_glue_assign_native(rc, kind,
-            fun_data, fun_data_len, const_table,
+            fun_data, fun_data_len,
+            children,
             #if MICROPY_PERSISTENT_CODE_SAVE
+            n_children,
             prelude_offset,
-            n_obj, n_raw_code,
-            n_qstr_link, NULL,
             #endif
-            prelude.n_pos_args, prelude.scope_flags, type_sig);
+            native_scope_flags, native_n_pos_args, native_type_sig
+            );
     #endif
     }
     return rc;
 }
 
-mp_raw_code_t *mp_raw_code_load(mp_reader_t *reader) {
+mp_compiled_module_t mp_raw_code_load(mp_reader_t *reader, mp_module_context_t *context) {
     byte header[4];
     read_bytes(reader, header, sizeof(header));
     if (header[0] != 'M'
         || header[1] != MPY_VERSION
         || MPY_FEATURE_DECODE_FLAGS(header[2]) != MPY_FEATURE_FLAGS
-        || header[3] > mp_small_int_bits()
-        || read_uint(reader, NULL) > QSTR_WINDOW_SIZE) {
+        || header[3] > MP_SMALL_INT_BITS) {
         mp_raise_ValueError(MP_ERROR_TEXT("incompatible .mpy file"));
     }
     if (MPY_FEATURE_DECODE_ARCH(header[2]) != MP_NATIVE_ARCH_NONE) {
@@ -562,25 +401,49 @@ mp_raw_code_t *mp_raw_code_load(mp_reader_t *reader) {
             mp_raise_ValueError(MP_ERROR_TEXT("incompatible .mpy arch"));
         }
     }
-    qstr_window_t qw;
-    qw.idx = 0;
-    mp_raw_code_t *rc = load_raw_code(reader, &qw);
+
+    size_t n_qstr = read_uint(reader);
+    size_t n_obj = read_uint(reader);
+    mp_module_context_alloc_tables(context, n_qstr, n_obj);
+
+    // Load qstrs.
+    for (size_t i = 0; i < n_qstr; ++i) {
+        context->constants.qstr_table[i] = load_qstr(reader);
+    }
+
+    // Load constant objects.
+    for (size_t i = 0; i < n_obj; ++i) {
+        context->constants.obj_table[i] = load_obj(reader);
+    }
+
+    // Load top-level module.
+    mp_compiled_module_t cm2;
+    cm2.rc = load_raw_code(reader, context);
+    cm2.context = context;
+
+    #if MICROPY_PERSISTENT_CODE_SAVE
+    cm2.has_native = MPY_FEATURE_DECODE_ARCH(header[2]) != MP_NATIVE_ARCH_NONE;
+    cm2.n_qstr = n_qstr;
+    cm2.n_obj = n_obj;
+    #endif
+
     reader->close(reader->data);
-    return rc;
+
+    return cm2;
 }
 
-mp_raw_code_t *mp_raw_code_load_mem(const byte *buf, size_t len) {
+mp_compiled_module_t mp_raw_code_load_mem(const byte *buf, size_t len, mp_module_context_t *context) {
     mp_reader_t reader;
     mp_reader_new_mem(&reader, buf, len, 0);
-    return mp_raw_code_load(&reader);
+    return mp_raw_code_load(&reader, context);
 }
 
 #if MICROPY_HAS_FILE_READER
 
-mp_raw_code_t *mp_raw_code_load_file(const char *filename) {
+mp_compiled_module_t mp_raw_code_load_file(const char *filename, mp_module_context_t *context) {
     mp_reader_t reader;
     mp_reader_new_file(&reader, filename);
-    return mp_raw_code_load(&reader);
+    return mp_raw_code_load(&reader, context);
 }
 
 #endif // MICROPY_HAS_FILE_READER
@@ -607,54 +470,72 @@ STATIC void mp_print_uint(mp_print_t *print, size_t n) {
     print->print_strn(print->data, (char *)p, buf + sizeof(buf) - p);
 }
 
-STATIC void save_qstr(mp_print_t *print, qstr_window_t *qw, qstr qst) {
+STATIC void save_qstr(mp_print_t *print, qstr qst) {
     if (qst <= QSTR_LAST_STATIC) {
         // encode static qstr
-        byte buf[2] = {0, qst & 0xff};
-        mp_print_bytes(print, buf, 2);
-        return;
-    }
-    size_t idx = qstr_window_insert(qw, qst);
-    if (idx < QSTR_WINDOW_SIZE) {
-        // qstr found in window, encode index to it
-        mp_print_uint(print, idx << 1 | 1);
+        mp_print_uint(print, qst << 1 | 1);
         return;
     }
     size_t len;
     const byte *str = qstr_data(qst, &len);
     mp_print_uint(print, len << 1);
-    mp_print_bytes(print, str, len);
+    mp_print_bytes(print, str, len + 1); // +1 to store null terminator
 }
 
 STATIC void save_obj(mp_print_t *print, mp_obj_t o) {
+    #if MICROPY_EMIT_MACHINE_CODE
+    if (o == MP_OBJ_FROM_PTR(&mp_fun_table)) {
+        byte obj_type = MP_PERSISTENT_OBJ_FUN_TABLE;
+        mp_print_bytes(print, &obj_type, 1);
+    } else
+    #endif
     if (mp_obj_is_str_or_bytes(o)) {
         byte obj_type;
         if (mp_obj_is_str(o)) {
-            obj_type = 's';
+            obj_type = MP_PERSISTENT_OBJ_STR;
         } else {
-            obj_type = 'b';
+            obj_type = MP_PERSISTENT_OBJ_BYTES;
         }
         size_t len;
         const char *str = mp_obj_str_get_data(o, &len);
         mp_print_bytes(print, &obj_type, 1);
         mp_print_uint(print, len);
-        mp_print_bytes(print, (const byte *)str, len);
-    } else if (MP_OBJ_TO_PTR(o) == &mp_const_ellipsis_obj) {
-        byte obj_type = 'e';
+        mp_print_bytes(print, (const byte *)str, len + 1); // +1 to store null terminator
+    } else if (o == mp_const_none) {
+        byte obj_type = MP_PERSISTENT_OBJ_NONE;
         mp_print_bytes(print, &obj_type, 1);
+    } else if (o == mp_const_false) {
+        byte obj_type = MP_PERSISTENT_OBJ_FALSE;
+        mp_print_bytes(print, &obj_type, 1);
+    } else if (o == mp_const_true) {
+        byte obj_type = MP_PERSISTENT_OBJ_TRUE;
+        mp_print_bytes(print, &obj_type, 1);
+    } else if (MP_OBJ_TO_PTR(o) == &mp_const_ellipsis_obj) {
+        byte obj_type = MP_PERSISTENT_OBJ_ELLIPSIS;
+        mp_print_bytes(print, &obj_type, 1);
+    } else if (mp_obj_is_type(o, &mp_type_tuple)) {
+        size_t len;
+        mp_obj_t *items;
+        mp_obj_tuple_get(o, &len, &items);
+        byte obj_type = MP_PERSISTENT_OBJ_TUPLE;
+        mp_print_bytes(print, &obj_type, 1);
+        mp_print_uint(print, len);
+        for (size_t i = 0; i < len; ++i) {
+            save_obj(print, items[i]);
+        }
     } else {
         // we save numbers using a simplistic text representation
         // TODO could be improved
         byte obj_type;
-        if (mp_obj_is_type(o, &mp_type_int)) {
-            obj_type = 'i';
+        if (mp_obj_is_int(o)) {
+            obj_type = MP_PERSISTENT_OBJ_INT;
         #if MICROPY_PY_BUILTINS_COMPLEX
         } else if (mp_obj_is_type(o, &mp_type_complex)) {
-            obj_type = 'c';
+            obj_type = MP_PERSISTENT_OBJ_COMPLEX;
         #endif
         } else {
             assert(mp_obj_is_float(o));
-            obj_type = 'f';
+            obj_type = MP_PERSISTENT_OBJ_FLOAT;
         }
         vstr_t vstr;
         mp_print_t pr;
@@ -667,142 +548,41 @@ STATIC void save_obj(mp_print_t *print, mp_obj_t o) {
     }
 }
 
-STATIC void save_prelude_qstrs(mp_print_t *print, qstr_window_t *qw, const byte *ip) {
-    save_qstr(print, qw, ip[0] | (ip[1] << 8)); // simple_name
-    save_qstr(print, qw, ip[2] | (ip[3] << 8)); // source_file
-}
-
-STATIC void save_bytecode(mp_print_t *print, qstr_window_t *qw, const byte *ip, const byte *ip_top) {
-    while (ip < ip_top) {
-        size_t sz;
-        uint f = mp_opcode_format(ip, &sz, true);
-        if (f == MP_BC_FORMAT_QSTR) {
-            mp_print_bytes(print, ip, 1);
-            qstr qst = ip[1] | (ip[2] << 8);
-            save_qstr(print, qw, qst);
-            ip += 3;
-            sz -= 3;
-        }
-        mp_print_bytes(print, ip, sz);
-        ip += sz;
-    }
-}
-
-STATIC void save_raw_code(mp_print_t *print, mp_raw_code_t *rc, qstr_window_t *qstr_window) {
+STATIC void save_raw_code(mp_print_t *print, const mp_raw_code_t *rc) {
     // Save function kind and data length
-    mp_print_uint(print, (rc->fun_data_len << 2) | (rc->kind - MP_CODE_BYTECODE));
+    mp_print_uint(print, (rc->fun_data_len << 3) | ((rc->n_children != 0) << 2) | (rc->kind - MP_CODE_BYTECODE));
 
-    bytecode_prelude_t prelude;
+    // Save function code.
+    mp_print_bytes(print, rc->fun_data, rc->fun_data_len);
 
-    if (rc->kind == MP_CODE_BYTECODE) {
-        // Extract prelude
-        const byte *ip = rc->fun_data;
-        const byte *ip_info = extract_prelude(&ip, &prelude);
-
-        // Save prelude
-        mp_print_bytes(print, rc->fun_data, ip_info - (const byte *)rc->fun_data);
-        save_prelude_qstrs(print, qstr_window, ip_info);
-        ip_info += 4;
-        mp_print_bytes(print, ip_info, ip - ip_info);
-
-        // Save bytecode
-        const byte *ip_top = (const byte *)rc->fun_data + rc->fun_data_len;
-        save_bytecode(print, qstr_window, ip, ip_top);
     #if MICROPY_EMIT_MACHINE_CODE
-    } else {
-        // Save native code
-        mp_print_bytes(print, rc->fun_data, rc->fun_data_len);
-
-        if (rc->kind == MP_CODE_NATIVE_PY || rc->kind == MP_CODE_NATIVE_VIPER) {
-            // Save qstr link table for native code
-            mp_print_uint(print, rc->n_qstr);
-            for (size_t i = 0; i < rc->n_qstr; ++i) {
-                mp_print_uint(print, rc->qstr_link[i].off);
-                save_qstr(print, qstr_window, rc->qstr_link[i].qst);
-            }
-        }
-
-        if (rc->kind == MP_CODE_NATIVE_PY) {
-            // Save prelude size
-            mp_print_uint(print, rc->prelude_offset);
-
-            // Extract prelude and save qstrs in prelude
-            const byte *ip = (const byte *)rc->fun_data + rc->prelude_offset;
-            const byte *ip_info = extract_prelude(&ip, &prelude);
-            save_prelude_qstrs(print, qstr_window, ip_info);
-        } else {
-            // Save basic scope info for viper and asm
-            mp_print_uint(print, rc->scope_flags & MP_SCOPE_FLAG_ALL_SIG);
-            prelude.n_pos_args = 0;
-            prelude.n_kwonly_args = 0;
-            if (rc->kind == MP_CODE_NATIVE_ASM) {
-                mp_print_uint(print, rc->n_pos_args);
-                mp_print_uint(print, rc->type_sig);
-            }
+    if (rc->kind == MP_CODE_NATIVE_PY) {
+        // Save prelude offset within fun_data
+        mp_print_uint(print, rc->prelude_offset);
+    } else if (rc->kind == MP_CODE_NATIVE_VIPER || rc->kind == MP_CODE_NATIVE_ASM) {
+        // Save basic scope info for viper and asm
+        mp_print_uint(print, rc->scope_flags & MP_SCOPE_FLAG_ALL_SIG);
+        if (rc->kind == MP_CODE_NATIVE_ASM) {
+            mp_print_uint(print, rc->n_pos_args);
+            mp_print_uint(print, rc->type_sig);
         }
+    }
     #endif
-    }
 
-    if (rc->kind != MP_CODE_NATIVE_ASM) {
-        // Save constant table for bytecode, native and viper
-
-        // Number of entries in constant table
-        mp_print_uint(print, rc->n_obj);
-        mp_print_uint(print, rc->n_raw_code);
-
-        const mp_uint_t *const_table = rc->const_table;
-
-        // Save function argument names (initial entries in const_table)
-        // (viper has n_pos_args=n_kwonly_args=0 so doesn't save any qstrs here)
-        for (size_t i = 0; i < prelude.n_pos_args + prelude.n_kwonly_args; ++i) {
-            mp_obj_t o = (mp_obj_t)*const_table++;
-            save_qstr(print, qstr_window, MP_OBJ_QSTR_VALUE(o));
-        }
-
-        if (rc->kind != MP_CODE_BYTECODE) {
-            // Skip saving mp_fun_table entry
-            ++const_table;
-        }
-
-        // Save constant objects and raw code children
-        for (size_t i = 0; i < rc->n_obj; ++i) {
-            save_obj(print, (mp_obj_t)*const_table++);
-        }
-        for (size_t i = 0; i < rc->n_raw_code; ++i) {
-            save_raw_code(print, (mp_raw_code_t *)(uintptr_t)*const_table++, qstr_window);
+    if (rc->n_children) {
+        mp_print_uint(print, rc->n_children);
+        for (size_t i = 0; i < rc->n_children; ++i) {
+            save_raw_code(print, rc->children[i]);
         }
     }
 }
 
-STATIC bool mp_raw_code_has_native(mp_raw_code_t *rc) {
-    if (rc->kind != MP_CODE_BYTECODE) {
-        return true;
-    }
-
-    const byte *ip = rc->fun_data;
-    bytecode_prelude_t prelude;
-    extract_prelude(&ip, &prelude);
-
-    const mp_uint_t *const_table = rc->const_table
-        + prelude.n_pos_args + prelude.n_kwonly_args
-        + rc->n_obj;
-
-    for (size_t i = 0; i < rc->n_raw_code; ++i) {
-        if (mp_raw_code_has_native((mp_raw_code_t *)(uintptr_t)*const_table++)) {
-            return true;
-        }
-    }
-
-    return false;
-}
-
-void mp_raw_code_save(mp_raw_code_t *rc, mp_print_t *print) {
+void mp_raw_code_save(mp_compiled_module_t *cm, mp_print_t *print) {
     // header contains:
     //  byte  'M'
     //  byte  version
     //  byte  feature flags
     //  byte  number of bits in a small int
-    //  uint  size of qstr window
     byte header[4] = {
         'M',
         MPY_VERSION,
@@ -810,19 +590,30 @@ void mp_raw_code_save(mp_raw_code_t *rc, mp_print_t *print) {
         #if MICROPY_DYNAMIC_COMPILER
         mp_dynamic_compiler.small_int_bits,
         #else
-        mp_small_int_bits(),
+        MP_SMALL_INT_BITS,
         #endif
     };
-    if (mp_raw_code_has_native(rc)) {
+    if (cm->has_native) {
         header[2] |= MPY_FEATURE_ENCODE_ARCH(MPY_FEATURE_ARCH_DYNAMIC);
     }
     mp_print_bytes(print, header, sizeof(header));
-    mp_print_uint(print, QSTR_WINDOW_SIZE);
 
-    qstr_window_t qw;
-    qw.idx = 0;
-    memset(qw.window, 0, sizeof(qw.window));
-    save_raw_code(print, rc, &qw);
+    // Number of entries in the qstr table and in the constant object table.
+    mp_print_uint(print, cm->n_qstr);
+    mp_print_uint(print, cm->n_obj);
+
+    // Save qstrs.
+    for (size_t i = 0; i < cm->n_qstr; ++i) {
+        save_qstr(print, cm->context->constants.qstr_table[i]);
+    }
+
+    // Save constant objects.
+    for (size_t i = 0; i < cm->n_obj; ++i) {
+        save_obj(print, (mp_obj_t)cm->context->constants.obj_table[i]);
+    }
+
+    // Save outer raw code, which will save all its child raw codes.
+    save_raw_code(print, cm->rc);
 }
 
 #if MICROPY_PERSISTENT_CODE_SAVE_FILE
@@ -839,12 +630,12 @@ STATIC void fd_print_strn(void *env, const char *str, size_t len) {
     (void)ret;
 }
 
-void mp_raw_code_save_file(mp_raw_code_t *rc, const char *filename) {
+void mp_raw_code_save_file(mp_compiled_module_t *cm, const char *filename) {
     MP_THREAD_GIL_EXIT();
     int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, 0644);
     MP_THREAD_GIL_ENTER();
     mp_print_t fd_print = {(void *)(intptr_t)fd, fd_print_strn};
-    mp_raw_code_save(rc, &fd_print);
+    mp_raw_code_save(cm, &fd_print);
     MP_THREAD_GIL_EXIT();
     close(fd);
     MP_THREAD_GIL_ENTER();
diff --git a/python/src/py/persistentcode.h b/python/src/py/persistentcode.h
index 8769ef584..29ccce4a3 100644
--- a/python/src/py/persistentcode.h
+++ b/python/src/py/persistentcode.h
@@ -31,7 +31,7 @@
 #include "py/emitglue.h"
 
 // The current version of .mpy files
-#define MPY_VERSION 5
+#define MPY_VERSION 6
 
 // Macros to encode/decode flags to/from the feature byte
 #define MPY_FEATURE_ENCODE_FLAGS(flags) (flags)
@@ -41,17 +41,12 @@
 #define MPY_FEATURE_ENCODE_ARCH(arch) ((arch) << 2)
 #define MPY_FEATURE_DECODE_ARCH(feat) ((feat) >> 2)
 
-// The feature flag bits encode the compile-time config options that
-// affect the generate bytecode.
-#define MPY_FEATURE_FLAGS ( \
-    ((MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE) << 0) \
-    | ((MICROPY_PY_BUILTINS_STR_UNICODE) << 1) \
-    )
+// The feature flag bits encode the compile-time config options that affect
+// the generated bytecode. Note: no longer used.
+// (formerly MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE and MICROPY_PY_BUILTINS_STR_UNICODE).
+#define MPY_FEATURE_FLAGS (0)
 // This is a version of the flags that can be configured at runtime.
-#define MPY_FEATURE_FLAGS_DYNAMIC ( \
-    ((MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE_DYNAMIC) << 0) \
-    | ((MICROPY_PY_BUILTINS_STR_UNICODE_DYNAMIC) << 1) \
-    )
+#define MPY_FEATURE_FLAGS_DYNAMIC (0)
 
 // Define the host architecture
 #if MICROPY_EMIT_X86
@@ -68,7 +63,7 @@
             #define MPY_FEATURE_ARCH (MP_NATIVE_ARCH_ARMV7EM)
         #endif
     #else
-        #define MPY_FEATURE_ARCH (MP_NATIVE_ARCH_ARMV7M)
+        #define MPY_FEATURE_ARCH (MP_NATIVE_ARCH_ARMV6M)
     #endif
     #define MPY_FEATURE_ARCH_TEST(x) (MP_NATIVE_ARCH_ARMV6M <= (x) && (x) <= MPY_FEATURE_ARCH)
 #elif MICROPY_EMIT_ARM
@@ -103,12 +98,26 @@ enum {
     MP_NATIVE_ARCH_XTENSAWIN,
 };
 
-mp_raw_code_t *mp_raw_code_load(mp_reader_t *reader);
-mp_raw_code_t *mp_raw_code_load_mem(const byte *buf, size_t len);
-mp_raw_code_t *mp_raw_code_load_file(const char *filename);
+enum {
+    MP_PERSISTENT_OBJ_FUN_TABLE = 0,
+    MP_PERSISTENT_OBJ_NONE,
+    MP_PERSISTENT_OBJ_FALSE,
+    MP_PERSISTENT_OBJ_TRUE,
+    MP_PERSISTENT_OBJ_ELLIPSIS,
+    MP_PERSISTENT_OBJ_STR,
+    MP_PERSISTENT_OBJ_BYTES,
+    MP_PERSISTENT_OBJ_INT,
+    MP_PERSISTENT_OBJ_FLOAT,
+    MP_PERSISTENT_OBJ_COMPLEX,
+    MP_PERSISTENT_OBJ_TUPLE,
+};
 
-void mp_raw_code_save(mp_raw_code_t *rc, mp_print_t *print);
-void mp_raw_code_save_file(mp_raw_code_t *rc, const char *filename);
+mp_compiled_module_t mp_raw_code_load(mp_reader_t *reader, mp_module_context_t *ctx);
+mp_compiled_module_t mp_raw_code_load_mem(const byte *buf, size_t len, mp_module_context_t *ctx);
+mp_compiled_module_t mp_raw_code_load_file(const char *filename, mp_module_context_t *ctx);
+
+void mp_raw_code_save(mp_compiled_module_t *cm, mp_print_t *print);
+void mp_raw_code_save_file(mp_compiled_module_t *cm, const char *filename);
 
 void mp_native_relocate(void *reloc, uint8_t *text, uintptr_t reloc_text);
 
diff --git a/python/src/py/profile.c b/python/src/py/profile.c
index 054a0f9e6..4e23e9eac 100644
--- a/python/src/py/profile.c
+++ b/python/src/py/profile.c
@@ -27,14 +27,16 @@
 #include "py/profile.h"
 #include "py/bc0.h"
 #include "py/gc.h"
+#include "py/objfun.h"
 
 #if MICROPY_PY_SYS_SETTRACE
 
 #define prof_trace_cb MP_STATE_THREAD(prof_trace_callback)
+#define QSTR_MAP(context, idx) (context->constants.qstr_table[idx])
 
 STATIC uint mp_prof_bytecode_lineno(const mp_raw_code_t *rc, size_t bc) {
     const mp_bytecode_prelude_t *prelude = &rc->prelude;
-    return mp_bytecode_get_source_line(prelude->line_info, bc);
+    return mp_bytecode_get_source_line(prelude->line_info, prelude->line_info_top, bc);
 }
 
 void mp_prof_extract_prelude(const byte *bytecode, mp_bytecode_prelude_t *prelude) {
@@ -50,13 +52,14 @@ void mp_prof_extract_prelude(const byte *bytecode, mp_bytecode_prelude_t *prelud
 
     MP_BC_PRELUDE_SIZE_DECODE(ip);
 
-    prelude->line_info = ip + 4;
+    prelude->line_info_top = ip + n_info;
     prelude->opcodes = ip + n_info + n_cell;
 
-    qstr block_name = ip[0] | (ip[1] << 8);
-    qstr source_file = ip[2] | (ip[3] << 8);
-    prelude->qstr_block_name = block_name;
-    prelude->qstr_source_file = source_file;
+    prelude->qstr_block_name_idx = mp_decode_uint_value(ip);
+    for (size_t i = 0; i < 1 + n_pos_args + n_kwonly_args; ++i) {
+        ip = mp_decode_uint_skip(ip);
+    }
+    prelude->line_info = ip;
 }
 
 /******************************************************************************/
@@ -69,22 +72,19 @@ STATIC void code_print(const mp_print_t *print, mp_obj_t o_in, mp_print_kind_t k
     const mp_bytecode_prelude_t *prelude = &rc->prelude;
     mp_printf(print,
         "<code object %q at 0x%p, file \"%q\", line %d>",
-        prelude->qstr_block_name,
+        QSTR_MAP(o->context, prelude->qstr_block_name_idx),
         o,
-        prelude->qstr_source_file,
+        QSTR_MAP(o->context, 0),
         rc->line_of_definition
         );
 }
 
-STATIC mp_obj_tuple_t *code_consts(const mp_raw_code_t *rc) {
-    const mp_bytecode_prelude_t *prelude = &rc->prelude;
-    int start = prelude->n_pos_args + prelude->n_kwonly_args + rc->n_obj;
-    int stop = prelude->n_pos_args + prelude->n_kwonly_args + rc->n_obj + rc->n_raw_code;
-    mp_obj_tuple_t *consts = MP_OBJ_TO_PTR(mp_obj_new_tuple(stop - start + 1, NULL));
+STATIC mp_obj_tuple_t *code_consts(const mp_module_context_t *context, const mp_raw_code_t *rc) {
+    mp_obj_tuple_t *consts = MP_OBJ_TO_PTR(mp_obj_new_tuple(rc->n_children + 1, NULL));
 
     size_t const_no = 0;
-    for (int i = start; i < stop; ++i) {
-        mp_obj_t code = mp_obj_new_code((const mp_raw_code_t *)MP_OBJ_TO_PTR(rc->const_table[i]));
+    for (size_t i = 0; i < rc->n_children; ++i) {
+        mp_obj_t code = mp_obj_new_code(context, rc->children[i]);
         if (code == MP_OBJ_NULL) {
             m_malloc_fail(sizeof(mp_obj_code_t));
         }
@@ -149,16 +149,16 @@ STATIC void code_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) {
                 );
             break;
         case MP_QSTR_co_consts:
-            dest[0] = MP_OBJ_FROM_PTR(code_consts(rc));
+            dest[0] = MP_OBJ_FROM_PTR(code_consts(o->context, rc));
             break;
         case MP_QSTR_co_filename:
-            dest[0] = MP_OBJ_NEW_QSTR(prelude->qstr_source_file);
+            dest[0] = MP_OBJ_NEW_QSTR(QSTR_MAP(o->context, 0));
             break;
         case MP_QSTR_co_firstlineno:
             dest[0] = MP_OBJ_NEW_SMALL_INT(mp_prof_bytecode_lineno(rc, 0));
             break;
         case MP_QSTR_co_name:
-            dest[0] = MP_OBJ_NEW_QSTR(prelude->qstr_block_name);
+            dest[0] = MP_OBJ_NEW_QSTR(QSTR_MAP(o->context, prelude->qstr_block_name_idx));
             break;
         case MP_QSTR_co_names:
             dest[0] = MP_OBJ_FROM_PTR(o->dict_locals);
@@ -180,12 +180,13 @@ const mp_obj_type_t mp_type_settrace_codeobj = {
     .attr = code_attr,
 };
 
-mp_obj_t mp_obj_new_code(const mp_raw_code_t *rc) {
+mp_obj_t mp_obj_new_code(const mp_module_context_t *context, const mp_raw_code_t *rc) {
     mp_obj_code_t *o = m_new_obj_maybe(mp_obj_code_t);
     if (o == NULL) {
         return MP_OBJ_NULL;
     }
     o->base.type = &mp_type_settrace_codeobj;
+    o->context = context;
     o->rc = rc;
     o->dict_locals = mp_locals_get(); // this is a wrong! how to do this properly?
     o->lnotab = MP_OBJ_NULL;
@@ -204,9 +205,9 @@ STATIC void frame_print(const mp_print_t *print, mp_obj_t o_in, mp_print_kind_t
     mp_printf(print,
         "<frame at 0x%p, file '%q', line %d, code %q>",
         frame,
-        prelude->qstr_source_file,
+        QSTR_MAP(code->context, 0),
         frame->lineno,
-        prelude->qstr_block_name
+        QSTR_MAP(code->context, prelude->qstr_block_name_idx)
         );
 }
 
@@ -229,7 +230,7 @@ STATIC void frame_attr(mp_obj_t self_in, qstr attr, mp_obj_t *dest) {
             dest[0] = MP_OBJ_FROM_PTR(o->code);
             break;
         case MP_QSTR_f_globals:
-            dest[0] = MP_OBJ_FROM_PTR(o->code_state->fun_bc->globals);
+            dest[0] = MP_OBJ_FROM_PTR(o->code_state->fun_bc->context->module.globals);
             break;
         case MP_QSTR_f_lasti:
             dest[0] = MP_OBJ_NEW_SMALL_INT(o->lasti);
@@ -258,7 +259,7 @@ mp_obj_t mp_obj_new_frame(const mp_code_state_t *code_state) {
         return MP_OBJ_NULL;
     }
 
-    mp_obj_code_t *code = o->code = MP_OBJ_TO_PTR(mp_obj_new_code(code_state->fun_bc->rc));
+    mp_obj_code_t *code = o->code = MP_OBJ_TO_PTR(mp_obj_new_code(code_state->fun_bc->context, code_state->fun_bc->rc));
     if (code == NULL) {
         return MP_OBJ_NULL;
     }
@@ -540,9 +541,6 @@ STATIC const byte *mp_prof_opcode_decode(const byte *ip, const mp_uint_t *const_
             instruction->qstr_opname = MP_QSTR_LOAD_NAME;
             instruction->arg = qst;
             instruction->argobj = MP_OBJ_NEW_QSTR(qst);
-            if (MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE) {
-                instruction->argobjex_cache = MP_OBJ_NEW_SMALL_INT(*ip++);
-            }
             break;
 
         case MP_BC_LOAD_GLOBAL:
@@ -550,9 +548,6 @@ STATIC const byte *mp_prof_opcode_decode(const byte *ip, const mp_uint_t *const_
             instruction->qstr_opname = MP_QSTR_LOAD_GLOBAL;
             instruction->arg = qst;
             instruction->argobj = MP_OBJ_NEW_QSTR(qst);
-            if (MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE) {
-                instruction->argobjex_cache = MP_OBJ_NEW_SMALL_INT(*ip++);
-            }
             break;
 
         case MP_BC_LOAD_ATTR:
@@ -560,9 +555,6 @@ STATIC const byte *mp_prof_opcode_decode(const byte *ip, const mp_uint_t *const_
             instruction->qstr_opname = MP_QSTR_LOAD_ATTR;
             instruction->arg = qst;
             instruction->argobj = MP_OBJ_NEW_QSTR(qst);
-            if (MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE) {
-                instruction->argobjex_cache = MP_OBJ_NEW_SMALL_INT(*ip++);
-            }
             break;
 
         case MP_BC_LOAD_METHOD:
@@ -618,9 +610,6 @@ STATIC const byte *mp_prof_opcode_decode(const byte *ip, const mp_uint_t *const_
             instruction->qstr_opname = MP_QSTR_STORE_ATTR;
             instruction->arg = qst;
             instruction->argobj = MP_OBJ_NEW_QSTR(qst);
-            if (MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE) {
-                instruction->argobjex_cache = MP_OBJ_NEW_SMALL_INT(*ip++);
-            }
             break;
 
         case MP_BC_STORE_SUBSCR:
diff --git a/python/src/py/profile.h b/python/src/py/profile.h
index 64e207d04..7f3f91403 100644
--- a/python/src/py/profile.h
+++ b/python/src/py/profile.h
@@ -34,7 +34,9 @@
 #define mp_prof_is_executing MP_STATE_THREAD(prof_callback_is_executing)
 
 typedef struct _mp_obj_code_t {
+    // TODO: this struct was 4 words before the `context` field was added
     mp_obj_base_t base;
+    const mp_module_context_t *context;
     const mp_raw_code_t *rc;
     mp_obj_dict_t *dict_locals;
     mp_obj_t lnotab;
@@ -53,7 +55,7 @@ typedef struct _mp_obj_frame_t {
 
 void mp_prof_extract_prelude(const byte *bytecode, mp_bytecode_prelude_t *prelude);
 
-mp_obj_t mp_obj_new_code(const mp_raw_code_t *rc);
+mp_obj_t mp_obj_new_code(const mp_module_context_t *mc, const mp_raw_code_t *rc);
 mp_obj_t mp_obj_new_frame(const mp_code_state_t *code_state);
 
 // This is the implementation for the sys.settrace
diff --git a/python/src/py/py.mk b/python/src/py/py.mk
index 609ba6cae..57fdb6d9b 100644
--- a/python/src/py/py.mk
+++ b/python/src/py/py.mk
@@ -176,6 +176,7 @@ PY_EXTMOD_O_BASENAME = \
 	extmod/moduasyncio.o \
 	extmod/moductypes.o \
 	extmod/modujson.o \
+	extmod/moduos.o \
 	extmod/modure.o \
 	extmod/moduzlib.o \
 	extmod/moduheapq.o \
@@ -189,11 +190,13 @@ PY_EXTMOD_O_BASENAME = \
 	extmod/machine_pinbase.o \
 	extmod/machine_signal.o \
 	extmod/machine_pulse.o \
+	extmod/machine_pwm.o \
 	extmod/machine_i2c.o \
 	extmod/machine_spi.o \
 	extmod/modbluetooth.o \
 	extmod/modussl_axtls.o \
 	extmod/modussl_mbedtls.o \
+	extmod/moduplatform.o\
 	extmod/modurandom.o \
 	extmod/moduselect.o \
 	extmod/moduwebsocket.o \
@@ -225,16 +228,6 @@ ifneq ($(FROZEN_MANIFEST),)
 PY_O += $(BUILD)/$(BUILD)/frozen_content.o
 endif
 
-# object file for frozen files
-ifneq ($(FROZEN_DIR),)
-PY_O += $(BUILD)/$(BUILD)/frozen.o
-endif
-
-# object file for frozen bytecode (frozen .mpy files)
-ifneq ($(FROZEN_MPY_DIR),)
-PY_O += $(BUILD)/$(BUILD)/frozen_mpy.o
-endif
-
 # Sources that may contain qstrings
 SRC_QSTR_IGNORE = py/nlr%
 SRC_QSTR += $(SRC_MOD) $(filter-out $(SRC_QSTR_IGNORE),$(PY_CORE_O_BASENAME:.o=.c)) $(PY_EXTMOD_O_BASENAME:.o=.c)
@@ -266,9 +259,9 @@ $(HEADER_BUILD)/compressed.data.h: $(HEADER_BUILD)/compressed.collected
 	$(Q)$(PYTHON) $(PY_SRC)/makecompresseddata.py $< > $@
 
 # build a list of registered modules for py/objmodule.c.
-$(HEADER_BUILD)/moduledefs.h: $(SRC_QSTR) $(QSTR_GLOBAL_DEPENDENCIES) | $(HEADER_BUILD)/mpversion.h
+$(HEADER_BUILD)/moduledefs.h: $(HEADER_BUILD)/moduledefs.collected
 	@$(ECHO) "GEN $@"
-	$(Q)$(PYTHON) $(PY_SRC)/makemoduledefs.py --vpath="., $(TOP), $(USER_C_MODULES)" $(SRC_QSTR) > $@
+	$(Q)$(PYTHON) $(PY_SRC)/makemoduledefs.py $< > $@
 
 # Standard C functions like memset need to be compiled with special flags so
 # the compiler does not optimise these functions in terms of themselves.
diff --git a/python/src/py/qstr.c b/python/src/py/qstr.c
index c14ec5ae0..f9ca10683 100644
--- a/python/src/py/qstr.c
+++ b/python/src/py/qstr.c
@@ -35,7 +35,6 @@
 
 // NOTE: we are using linear arrays to store and search for qstr's (unique strings, interned strings)
 // ultimately we will replace this with a static hash table of some kind
-// also probably need to include the length in the string data, to allow null bytes in the string
 
 #if MICROPY_DEBUG_VERBOSE // print debugging info
 #define DEBUG_printf DEBUG_printf
@@ -44,34 +43,9 @@
 #endif
 
 // A qstr is an index into the qstr pool.
-// The data for a qstr contains (hash, length, data):
-//  - hash (configurable number of bytes)
-//  - length (configurable number of bytes)
-//  - data ("length" number of bytes)
-//  - \0 terminated (so they can be printed using printf)
+// The data for a qstr is \0 terminated (so it can be printed using printf)
 
-#if MICROPY_QSTR_BYTES_IN_HASH == 1
-    #define Q_HASH_MASK (0xff)
-    #define Q_GET_HASH(q) ((mp_uint_t)(q)[0])
-    #define Q_SET_HASH(q, hash) do { (q)[0] = (hash); } while (0)
-#elif MICROPY_QSTR_BYTES_IN_HASH == 2
-    #define Q_HASH_MASK (0xffff)
-    #define Q_GET_HASH(q) ((mp_uint_t)(q)[0] | ((mp_uint_t)(q)[1] << 8))
-    #define Q_SET_HASH(q, hash) do { (q)[0] = (hash); (q)[1] = (hash) >> 8; } while (0)
-#else
-    #error unimplemented qstr hash decoding
-#endif
-#define Q_GET_ALLOC(q)  (MICROPY_QSTR_BYTES_IN_HASH + MICROPY_QSTR_BYTES_IN_LEN + Q_GET_LENGTH(q) + 1)
-#define Q_GET_DATA(q)   ((q) + MICROPY_QSTR_BYTES_IN_HASH + MICROPY_QSTR_BYTES_IN_LEN)
-#if MICROPY_QSTR_BYTES_IN_LEN == 1
-    #define Q_GET_LENGTH(q) ((q)[MICROPY_QSTR_BYTES_IN_HASH])
-    #define Q_SET_LENGTH(q, len) do { (q)[MICROPY_QSTR_BYTES_IN_HASH] = (len); } while (0)
-#elif MICROPY_QSTR_BYTES_IN_LEN == 2
-    #define Q_GET_LENGTH(q) ((q)[MICROPY_QSTR_BYTES_IN_HASH] | ((q)[MICROPY_QSTR_BYTES_IN_HASH + 1] << 8))
-    #define Q_SET_LENGTH(q, len) do { (q)[MICROPY_QSTR_BYTES_IN_HASH] = (len); (q)[MICROPY_QSTR_BYTES_IN_HASH + 1] = (len) >> 8; } while (0)
-#else
-    #error unimplemented qstr length decoding
-#endif
+#define Q_HASH_MASK ((1 << (8 * MICROPY_QSTR_BYTES_IN_HASH)) - 1)
 
 #if MICROPY_PY_THREAD && !MICROPY_PY_THREAD_GIL
 #define QSTR_ENTER() mp_thread_mutex_lock(&MP_STATE_VM(qstr_mutex), 1)
@@ -100,14 +74,32 @@ mp_uint_t qstr_compute_hash(const byte *data, size_t len) {
     return hash;
 }
 
+const qstr_hash_t mp_qstr_const_hashes[] = {
+    #ifndef NO_QSTR
+#define QDEF(id, hash, len, str) hash,
+    #include "genhdr/qstrdefs.generated.h"
+#undef QDEF
+    #endif
+};
+
+const qstr_len_t mp_qstr_const_lengths[] = {
+    #ifndef NO_QSTR
+#define QDEF(id, hash, len, str) len,
+    #include "genhdr/qstrdefs.generated.h"
+#undef QDEF
+    #endif
+};
+
 const qstr_pool_t mp_qstr_const_pool = {
     NULL,               // no previous pool
     0,                  // no previous pool
     MICROPY_ALLOC_QSTR_ENTRIES_INIT,
     MP_QSTRnumber_of,   // corresponds to number of strings in array just below
+    (qstr_hash_t *)mp_qstr_const_hashes,
+    (qstr_len_t *)mp_qstr_const_lengths,
     {
         #ifndef NO_QSTR
-#define QDEF(id, str) str,
+#define QDEF(id, hash, len, str) str,
         #include "genhdr/qstrdefs.generated.h"
 #undef QDEF
         #endif
@@ -130,19 +122,21 @@ void qstr_init(void) {
     #endif
 }
 
-STATIC const byte *find_qstr(qstr q) {
+STATIC const qstr_pool_t *find_qstr(qstr *q) {
     // search pool for this qstr
     // total_prev_len==0 in the final pool, so the loop will always terminate
-    qstr_pool_t *pool = MP_STATE_VM(last_pool);
-    while (q < pool->total_prev_len) {
+    const qstr_pool_t *pool = MP_STATE_VM(last_pool);
+    while (*q < pool->total_prev_len) {
         pool = pool->prev;
     }
-    return pool->qstrs[q - pool->total_prev_len];
+    *q -= pool->total_prev_len;
+    assert(*q < pool->len);
+    return pool;
 }
 
 // qstr_mutex must be taken while in this function
-STATIC qstr qstr_add(const byte *q_ptr) {
-    DEBUG_printf("QSTR: add hash=%d len=%d data=%.*s\n", Q_GET_HASH(q_ptr), Q_GET_LENGTH(q_ptr), Q_GET_LENGTH(q_ptr), Q_GET_DATA(q_ptr));
+STATIC qstr qstr_add(mp_uint_t hash, mp_uint_t len, const char *q_ptr) {
+    DEBUG_printf("QSTR: add hash=%d len=%d data=%.*s\n", hash, len, len, q_ptr);
 
     // make sure we have room in the pool for a new qstr
     if (MP_STATE_VM(last_pool)->len >= MP_STATE_VM(last_pool)->alloc) {
@@ -151,11 +145,21 @@ STATIC qstr qstr_add(const byte *q_ptr) {
         // Put a lower bound on the allocation size in case the extra qstr pool has few entries
         new_alloc = MAX(MICROPY_ALLOC_QSTR_ENTRIES_INIT, new_alloc);
         #endif
-        qstr_pool_t *pool = m_new_obj_var_maybe(qstr_pool_t, const char *, new_alloc);
+        mp_uint_t pool_size = sizeof(qstr_pool_t)
+            + (sizeof(const char *) + sizeof(qstr_hash_t) + sizeof(qstr_len_t)) * new_alloc;
+        qstr_pool_t *pool = (qstr_pool_t *)m_malloc_maybe(pool_size);
         if (pool == NULL) {
+            // Keep qstr_last_chunk consistent with qstr_pool_t: qstr_last_chunk is not scanned
+            // at garbage collection since it's reachable from a qstr_pool_t.  And the caller of
+            // this function expects q_ptr to be stored in a qstr_pool_t so it can be reached
+            // by the collector.  If qstr_pool_t allocation failed, qstr_last_chunk needs to be
+            // NULL'd.  Otherwise it may become a dangling pointer at the next garbage collection.
+            MP_STATE_VM(qstr_last_chunk) = NULL;
             QSTR_EXIT();
             m_malloc_fail(new_alloc);
         }
+        pool->hashes = (qstr_hash_t *)(pool->qstrs + new_alloc);
+        pool->lengths = (qstr_len_t *)(pool->hashes + new_alloc);
         pool->prev = MP_STATE_VM(last_pool);
         pool->total_prev_len = MP_STATE_VM(last_pool)->total_prev_len + MP_STATE_VM(last_pool)->len;
         pool->alloc = new_alloc;
@@ -165,10 +169,14 @@ STATIC qstr qstr_add(const byte *q_ptr) {
     }
 
     // add the new qstr
-    MP_STATE_VM(last_pool)->qstrs[MP_STATE_VM(last_pool)->len++] = q_ptr;
+    mp_uint_t at = MP_STATE_VM(last_pool)->len;
+    MP_STATE_VM(last_pool)->hashes[at] = hash;
+    MP_STATE_VM(last_pool)->lengths[at] = len;
+    MP_STATE_VM(last_pool)->qstrs[at] = q_ptr;
+    MP_STATE_VM(last_pool)->len++;
 
     // return id for the newly-added qstr
-    return MP_STATE_VM(last_pool)->total_prev_len + MP_STATE_VM(last_pool)->len - 1;
+    return MP_STATE_VM(last_pool)->total_prev_len + at;
 }
 
 qstr qstr_find_strn(const char *str, size_t str_len) {
@@ -176,10 +184,11 @@ qstr qstr_find_strn(const char *str, size_t str_len) {
     mp_uint_t str_hash = qstr_compute_hash((const byte *)str, str_len);
 
     // search pools for the data
-    for (qstr_pool_t *pool = MP_STATE_VM(last_pool); pool != NULL; pool = pool->prev) {
-        for (const byte **q = pool->qstrs, **q_top = pool->qstrs + pool->len; q < q_top; q++) {
-            if (Q_GET_HASH(*q) == str_hash && Q_GET_LENGTH(*q) == str_len && memcmp(Q_GET_DATA(*q), str, str_len) == 0) {
-                return pool->total_prev_len + (q - pool->qstrs);
+    for (const qstr_pool_t *pool = MP_STATE_VM(last_pool); pool != NULL; pool = pool->prev) {
+        for (mp_uint_t at = 0, top = pool->len; at < top; at++) {
+            if (pool->hashes[at] == str_hash && pool->lengths[at] == str_len
+                && memcmp(pool->qstrs[at], str, str_len) == 0) {
+                return pool->total_prev_len + at;
             }
         }
     }
@@ -205,14 +214,14 @@ qstr qstr_from_strn(const char *str, size_t len) {
         }
 
         // compute number of bytes needed to intern this string
-        size_t n_bytes = MICROPY_QSTR_BYTES_IN_HASH + MICROPY_QSTR_BYTES_IN_LEN + len + 1;
+        size_t n_bytes = len + 1;
 
         if (MP_STATE_VM(qstr_last_chunk) != NULL && MP_STATE_VM(qstr_last_used) + n_bytes > MP_STATE_VM(qstr_last_alloc)) {
             // not enough room at end of previously interned string so try to grow
-            byte *new_p = m_renew_maybe(byte, MP_STATE_VM(qstr_last_chunk), MP_STATE_VM(qstr_last_alloc), MP_STATE_VM(qstr_last_alloc) + n_bytes, false);
+            char *new_p = m_renew_maybe(char, MP_STATE_VM(qstr_last_chunk), MP_STATE_VM(qstr_last_alloc), MP_STATE_VM(qstr_last_alloc) + n_bytes, false);
             if (new_p == NULL) {
                 // could not grow existing memory; shrink it to fit previous
-                (void)m_renew_maybe(byte, MP_STATE_VM(qstr_last_chunk), MP_STATE_VM(qstr_last_alloc), MP_STATE_VM(qstr_last_used), false);
+                (void)m_renew_maybe(char, MP_STATE_VM(qstr_last_chunk), MP_STATE_VM(qstr_last_alloc), MP_STATE_VM(qstr_last_used), false);
                 MP_STATE_VM(qstr_last_chunk) = NULL;
             } else {
                 // could grow existing memory
@@ -226,10 +235,10 @@ qstr qstr_from_strn(const char *str, size_t len) {
             if (al < MICROPY_ALLOC_QSTR_CHUNK_INIT) {
                 al = MICROPY_ALLOC_QSTR_CHUNK_INIT;
             }
-            MP_STATE_VM(qstr_last_chunk) = m_new_maybe(byte, al);
+            MP_STATE_VM(qstr_last_chunk) = m_new_maybe(char, al);
             if (MP_STATE_VM(qstr_last_chunk) == NULL) {
                 // failed to allocate a large chunk so try with exact size
-                MP_STATE_VM(qstr_last_chunk) = m_new_maybe(byte, n_bytes);
+                MP_STATE_VM(qstr_last_chunk) = m_new_maybe(char, n_bytes);
                 if (MP_STATE_VM(qstr_last_chunk) == NULL) {
                     QSTR_EXIT();
                     m_malloc_fail(n_bytes);
@@ -241,40 +250,38 @@ qstr qstr_from_strn(const char *str, size_t len) {
         }
 
         // allocate memory from the chunk for this new interned string's data
-        byte *q_ptr = MP_STATE_VM(qstr_last_chunk) + MP_STATE_VM(qstr_last_used);
+        char *q_ptr = MP_STATE_VM(qstr_last_chunk) + MP_STATE_VM(qstr_last_used);
         MP_STATE_VM(qstr_last_used) += n_bytes;
 
         // store the interned strings' data
         mp_uint_t hash = qstr_compute_hash((const byte *)str, len);
-        Q_SET_HASH(q_ptr, hash);
-        Q_SET_LENGTH(q_ptr, len);
-        memcpy(q_ptr + MICROPY_QSTR_BYTES_IN_HASH + MICROPY_QSTR_BYTES_IN_LEN, str, len);
-        q_ptr[MICROPY_QSTR_BYTES_IN_HASH + MICROPY_QSTR_BYTES_IN_LEN + len] = '\0';
-        q = qstr_add(q_ptr);
+        memcpy(q_ptr, str, len);
+        q_ptr[len] = '\0';
+        q = qstr_add(hash, len, q_ptr);
     }
     QSTR_EXIT();
     return q;
 }
 
 mp_uint_t qstr_hash(qstr q) {
-    const byte *qd = find_qstr(q);
-    return Q_GET_HASH(qd);
+    const qstr_pool_t *pool = find_qstr(&q);
+    return pool->hashes[q];
 }
 
 size_t qstr_len(qstr q) {
-    const byte *qd = find_qstr(q);
-    return Q_GET_LENGTH(qd);
+    const qstr_pool_t *pool = find_qstr(&q);
+    return pool->lengths[q];
 }
 
 const char *qstr_str(qstr q) {
-    const byte *qd = find_qstr(q);
-    return (const char *)Q_GET_DATA(qd);
+    const qstr_pool_t *pool = find_qstr(&q);
+    return pool->qstrs[q];
 }
 
 const byte *qstr_data(qstr q, size_t *len) {
-    const byte *qd = find_qstr(q);
-    *len = Q_GET_LENGTH(qd);
-    return Q_GET_DATA(qd);
+    const qstr_pool_t *pool = find_qstr(&q);
+    *len = pool->lengths[q];
+    return (byte *)pool->qstrs[q];
 }
 
 void qstr_pool_info(size_t *n_pool, size_t *n_qstr, size_t *n_str_data_bytes, size_t *n_total_bytes) {
@@ -283,16 +290,17 @@ void qstr_pool_info(size_t *n_pool, size_t *n_qstr, size_t *n_str_data_bytes, si
     *n_qstr = 0;
     *n_str_data_bytes = 0;
     *n_total_bytes = 0;
-    for (qstr_pool_t *pool = MP_STATE_VM(last_pool); pool != NULL && pool != &CONST_POOL; pool = pool->prev) {
+    for (const qstr_pool_t *pool = MP_STATE_VM(last_pool); pool != NULL && pool != &CONST_POOL; pool = pool->prev) {
         *n_pool += 1;
         *n_qstr += pool->len;
-        for (const byte **q = pool->qstrs, **q_top = pool->qstrs + pool->len; q < q_top; q++) {
-            *n_str_data_bytes += Q_GET_ALLOC(*q);
+        for (qstr_len_t *l = pool->lengths, *l_top = pool->lengths + pool->len; l < l_top; l++) {
+            *n_str_data_bytes += *l + 1;
         }
         #if MICROPY_ENABLE_GC
         *n_total_bytes += gc_nbytes(pool); // this counts actual bytes used in heap
         #else
-        *n_total_bytes += sizeof(qstr_pool_t) + sizeof(qstr) * pool->alloc;
+        *n_total_bytes += sizeof(qstr_pool_t)
+            + (sizeof(const char *) + sizeof(qstr_hash_t) + sizeof(qstr_len_t)) * pool->alloc;
         #endif
     }
     *n_total_bytes += *n_str_data_bytes;
@@ -302,9 +310,9 @@ void qstr_pool_info(size_t *n_pool, size_t *n_qstr, size_t *n_str_data_bytes, si
 #if MICROPY_PY_MICROPYTHON_MEM_INFO
 void qstr_dump_data(void) {
     QSTR_ENTER();
-    for (qstr_pool_t *pool = MP_STATE_VM(last_pool); pool != NULL && pool != &CONST_POOL; pool = pool->prev) {
-        for (const byte **q = pool->qstrs, **q_top = pool->qstrs + pool->len; q < q_top; q++) {
-            mp_printf(&mp_plat_print, "Q(%s)\n", Q_GET_DATA(*q));
+    for (const qstr_pool_t *pool = MP_STATE_VM(last_pool); pool != NULL && pool != &CONST_POOL; pool = pool->prev) {
+        for (const char *const *q = pool->qstrs, *const *q_top = pool->qstrs + pool->len; q < q_top; q++) {
+            mp_printf(&mp_plat_print, "Q(%s)\n", *q);
         }
     }
     QSTR_EXIT();
diff --git a/python/src/py/qstr.h b/python/src/py/qstr.h
index 0b6fb12b0..fa634f90b 100644
--- a/python/src/py/qstr.h
+++ b/python/src/py/qstr.h
@@ -38,7 +38,7 @@
 // first entry in enum will be MP_QSTRnull=0, which indicates invalid/no qstr
 enum {
     #ifndef NO_QSTR
-#define QDEF(id, str) id,
+#define QDEF(id, hash, len, str) id,
     #include "genhdr/qstrdefs.generated.h"
 #undef QDEF
     #endif
@@ -46,13 +46,32 @@ enum {
 };
 
 typedef size_t qstr;
+typedef uint16_t qstr_short_t;
+
+#if MICROPY_QSTR_BYTES_IN_HASH == 1
+typedef uint8_t qstr_hash_t;
+#elif MICROPY_QSTR_BYTES_IN_HASH == 2
+typedef uint16_t qstr_hash_t;
+#else
+#error unimplemented qstr hash decoding
+#endif
+
+#if MICROPY_QSTR_BYTES_IN_LEN == 1
+typedef uint8_t qstr_len_t;
+#elif MICROPY_QSTR_BYTES_IN_LEN == 2
+typedef uint16_t qstr_len_t;
+#else
+#error unimplemented qstr length decoding
+#endif
 
 typedef struct _qstr_pool_t {
-    struct _qstr_pool_t *prev;
+    const struct _qstr_pool_t *prev;
     size_t total_prev_len;
     size_t alloc;
     size_t len;
-    const byte *qstrs[];
+    qstr_hash_t *hashes;
+    qstr_len_t *lengths;
+    const char *qstrs[];
 } qstr_pool_t;
 
 #define QSTR_TOTAL() (MP_STATE_VM(last_pool)->total_prev_len + MP_STATE_VM(last_pool)->len)
diff --git a/python/src/py/qstrdefs.h b/python/src/py/qstrdefs.h
index 5b4e0dc48..5003636df 100644
--- a/python/src/py/qstrdefs.h
+++ b/python/src/py/qstrdefs.h
@@ -39,6 +39,10 @@ Q()
 Q(*)
 Q(_)
 Q(/)
+#if MICROPY_PY_SYS_PS1_PS2
+Q(>>> )
+Q(... )
+#endif
 #if MICROPY_PY_BUILTINS_STR_OP_MODULO
 Q(%#o)
 Q(%#x)
@@ -60,6 +64,10 @@ Q(<string>)
 Q(<stdin>)
 Q(utf-8)
 
+#if MICROPY_MODULE_FROZEN
+Q(.frozen)
+#endif
+
 #if MICROPY_ENABLE_PYSTACK
 Q(pystack exhausted)
 #endif
diff --git a/python/src/py/repl.c b/python/src/py/repl.c
index 822e385ab..0369b0219 100644
--- a/python/src/py/repl.c
+++ b/python/src/py/repl.c
@@ -33,6 +33,16 @@
 
 #if MICROPY_HELPER_REPL
 
+#if MICROPY_PY_SYS_PS1_PS2
+const char *mp_repl_get_psx(unsigned int entry) {
+    if (mp_obj_is_str(MP_STATE_VM(sys_mutable)[entry])) {
+        return mp_obj_str_get_str(MP_STATE_VM(sys_mutable)[entry]);
+    } else {
+        return "";
+    }
+}
+#endif
+
 STATIC bool str_startswith_word(const char *str, const char *head) {
     size_t i;
     for (i = 0; str[i] && head[i]; i++) {
@@ -303,10 +313,7 @@ size_t mp_repl_autocomplete(const char *str, size_t len, const mp_print_t *print
                 return sizeof(import_str) - 1 - s_len;
             }
         }
-        if (q_first == 0) {
-            *compl_str = "    ";
-            return s_len ? 0 : 4;
-        }
+        return 0;
     }
 
     // 1 match found, or multiple matches with a common prefix
diff --git a/python/src/py/repl.h b/python/src/py/repl.h
index a7a4136ca..9e8f7f1dd 100644
--- a/python/src/py/repl.h
+++ b/python/src/py/repl.h
@@ -31,8 +31,34 @@
 #include "py/mpprint.h"
 
 #if MICROPY_HELPER_REPL
+
+#if MICROPY_PY_SYS_PS1_PS2
+
+const char *mp_repl_get_psx(unsigned int entry);
+
+static inline const char *mp_repl_get_ps1(void) {
+    return mp_repl_get_psx(MP_SYS_MUTABLE_PS1);
+}
+
+static inline const char *mp_repl_get_ps2(void) {
+    return mp_repl_get_psx(MP_SYS_MUTABLE_PS2);
+}
+
+#else
+
+static inline const char *mp_repl_get_ps1(void) {
+    return ">>> ";
+}
+
+static inline const char *mp_repl_get_ps2(void) {
+    return "... ";
+}
+
+#endif
+
 bool mp_repl_continue_with_input(const char *input);
 size_t mp_repl_autocomplete(const char *str, size_t len, const mp_print_t *print, const char **compl_str);
+
 #endif
 
 #endif // MICROPY_INCLUDED_PY_REPL_H
diff --git a/python/src/py/runtime.c b/python/src/py/runtime.c
index 27e5bc13e..e6d8c6807 100644
--- a/python/src/py/runtime.c
+++ b/python/src/py/runtime.c
@@ -25,10 +25,11 @@
  * THE SOFTWARE.
  */
 
+#include <assert.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <string.h>
-#include <assert.h>
+#include <unistd.h>
 
 #include "py/parsenum.h"
 #include "py/compile.h"
@@ -58,13 +59,23 @@ const mp_obj_module_t mp_module___main__ = {
     .globals = (mp_obj_dict_t *)&MP_STATE_VM(dict_main),
 };
 
+MP_REGISTER_MODULE(MP_QSTR___main__, mp_module___main__);
+
 void mp_init(void) {
     qstr_init();
 
     // no pending exceptions to start with
     MP_STATE_THREAD(mp_pending_exception) = MP_OBJ_NULL;
     #if MICROPY_ENABLE_SCHEDULER
-    MP_STATE_VM(sched_state) = MP_SCHED_IDLE;
+    #if MICROPY_SCHEDULER_STATIC_NODES
+    if (MP_STATE_VM(sched_head) == NULL) {
+        // no pending callbacks to start with
+        MP_STATE_VM(sched_state) = MP_SCHED_IDLE;
+    } else {
+        // pending callbacks are on the list, eg from before a soft reset
+        MP_STATE_VM(sched_state) = MP_SCHED_PENDING;
+    }
+    #endif
     MP_STATE_VM(sched_idx) = 0;
     MP_STATE_VM(sched_len) = 0;
     #endif
@@ -122,16 +133,34 @@ void mp_init(void) {
     MP_STATE_VM(vfs_mount_table) = NULL;
     #endif
 
+    #if MICROPY_PY_SYS_PATH_ARGV_DEFAULTS
+    mp_obj_list_init(MP_OBJ_TO_PTR(mp_sys_path), 0);
+    mp_obj_list_append(mp_sys_path, MP_OBJ_NEW_QSTR(MP_QSTR_)); // current dir (or base dir of the script)
+    #if MICROPY_MODULE_FROZEN
+    mp_obj_list_append(mp_sys_path, MP_OBJ_NEW_QSTR(MP_QSTR__dot_frozen));
+    #endif
+    mp_obj_list_init(MP_OBJ_TO_PTR(mp_sys_argv), 0);
+    #endif
+
     #if MICROPY_PY_SYS_ATEXIT
     MP_STATE_VM(sys_exitfunc) = mp_const_none;
     #endif
 
+    #if MICROPY_PY_SYS_PS1_PS2
+    MP_STATE_VM(sys_mutable[MP_SYS_MUTABLE_PS1]) = MP_OBJ_NEW_QSTR(MP_QSTR__gt__gt__gt__space_);
+    MP_STATE_VM(sys_mutable[MP_SYS_MUTABLE_PS2]) = MP_OBJ_NEW_QSTR(MP_QSTR__dot__dot__dot__space_);
+    #endif
+
     #if MICROPY_PY_SYS_SETTRACE
     MP_STATE_THREAD(prof_trace_callback) = MP_OBJ_NULL;
     MP_STATE_THREAD(prof_callback_is_executing) = false;
     MP_STATE_THREAD(current_code_state) = NULL;
     #endif
 
+    #if MICROPY_PY_SYS_TRACEBACKLIMIT
+    MP_STATE_VM(sys_mutable[MP_SYS_MUTABLE_TRACEBACKLIMIT]) = MP_OBJ_NEW_SMALL_INT(1000);
+    #endif
+
     #if MICROPY_PY_BLUETOOTH
     MP_STATE_VM(bluetooth) = MP_OBJ_NULL;
     #endif
@@ -157,7 +186,7 @@ void mp_deinit(void) {
     #endif
 }
 
-mp_obj_t mp_load_name(qstr qst) {
+mp_obj_t MICROPY_WRAP_MP_LOAD_NAME(mp_load_name)(qstr qst) {
     // logic: search locals, globals, builtins
     DEBUG_OP_printf("load name %s\n", qstr_str(qst));
     // If we're at the outer scope (locals == globals), dispatch to load_global right away
@@ -170,7 +199,7 @@ mp_obj_t mp_load_name(qstr qst) {
     return mp_load_global(qst);
 }
 
-mp_obj_t mp_load_global(qstr qst) {
+mp_obj_t MICROPY_WRAP_MP_LOAD_GLOBAL(mp_load_global)(qstr qst) {
     // logic: search globals, builtins
     DEBUG_OP_printf("load global %s\n", qstr_str(qst));
     mp_map_elem_t *elem = mp_map_lookup(&mp_globals_get()->map, MP_OBJ_NEW_QSTR(qst), MP_MAP_LOOKUP);
@@ -311,7 +340,7 @@ mp_obj_t mp_unary_op(mp_unary_op_t op, mp_obj_t arg) {
     }
 }
 
-mp_obj_t mp_binary_op(mp_binary_op_t op, mp_obj_t lhs, mp_obj_t rhs) {
+mp_obj_t MICROPY_WRAP_MP_BINARY_OP(mp_binary_op)(mp_binary_op_t op, mp_obj_t lhs, mp_obj_t rhs) {
     DEBUG_OP_printf("binary " UINT_FMT " %q %p %p\n", op, mp_binary_op_method_name[op], lhs, rhs);
 
     // TODO correctly distinguish inplace operators for mutable objects
@@ -681,12 +710,11 @@ void mp_call_prepare_args_n_kw_var(bool have_self, size_t n_args_n_kw, const mp_
     if (have_self) {
         self = *args++; // may be MP_OBJ_NULL
     }
-    uint n_args = n_args_n_kw & 0xff;
-    uint n_kw = (n_args_n_kw >> 8) & 0xff;
-    mp_obj_t pos_seq = args[n_args + 2 * n_kw]; // may be MP_OBJ_NULL
-    mp_obj_t kw_dict = args[n_args + 2 * n_kw + 1]; // may be MP_OBJ_NULL
+    size_t n_args = n_args_n_kw & 0xff;
+    size_t n_kw = (n_args_n_kw >> 8) & 0xff;
+    mp_uint_t star_args = MP_OBJ_SMALL_INT_VALUE(args[n_args + 2 * n_kw]);
 
-    DEBUG_OP_printf("call method var (fun=%p, self=%p, n_args=%u, n_kw=%u, args=%p, seq=%p, dict=%p)\n", fun, self, n_args, n_kw, args, pos_seq, kw_dict);
+    DEBUG_OP_printf("call method var (fun=%p, self=%p, n_args=%u, n_kw=%u, args=%p, map=%u)\n", fun, self, n_args, n_kw, args, star_args);
 
     // We need to create the following array of objects:
     //     args[0 .. n_args]  unpacked(pos_seq)  args[n_args .. n_args + 2 * n_kw]  unpacked(kw_dict)
@@ -694,19 +722,40 @@ void mp_call_prepare_args_n_kw_var(bool have_self, size_t n_args_n_kw, const mp_
 
     // The new args array
     mp_obj_t *args2;
-    uint args2_alloc;
-    uint args2_len = 0;
+    size_t args2_alloc;
+    size_t args2_len = 0;
+
+    // Try to get a hint for the total length of the unpacked *args
+    ssize_t list_len = 0;
+
+    if (star_args != 0) {
+        for (size_t i = 0; i < n_args; i++) {
+            if ((star_args >> i) & 1) {
+                mp_obj_t len = mp_obj_len_maybe(args[i]);
+                if (len != MP_OBJ_NULL) {
+                    // -1 accounts for 1 of n_args occupied by this arg
+                    list_len += mp_obj_get_int(len) - 1;
+                }
+            }
+        }
+    }
 
     // Try to get a hint for the size of the kw_dict
-    uint kw_dict_len = 0;
-    if (kw_dict != MP_OBJ_NULL && mp_obj_is_type(kw_dict, &mp_type_dict)) {
-        kw_dict_len = mp_obj_dict_len(kw_dict);
+    ssize_t kw_dict_len = 0;
+
+    for (size_t i = 0; i < n_kw; i++) {
+        mp_obj_t key = args[n_args + i * 2];
+        mp_obj_t value = args[n_args + i * 2 + 1];
+        if (key == MP_OBJ_NULL && value != MP_OBJ_NULL && mp_obj_is_type(value, &mp_type_dict)) {
+            // -1 accounts for 1 of n_kw occupied by this arg
+            kw_dict_len += mp_obj_dict_len(value) - 1;
+        }
     }
 
     // Extract the pos_seq sequence to the new args array.
     // Note that it can be arbitrary iterator.
-    if (pos_seq == MP_OBJ_NULL) {
-        // no sequence
+    if (star_args == 0) {
+        // no star args to unpack
 
         // allocate memory for the new array of args
         args2_alloc = 1 + n_args + 2 * (n_kw + kw_dict_len);
@@ -720,33 +769,11 @@ void mp_call_prepare_args_n_kw_var(bool have_self, size_t n_args_n_kw, const mp_
         // copy the fixed pos args
         mp_seq_copy(args2 + args2_len, args, n_args, mp_obj_t);
         args2_len += n_args;
-
-    } else if (mp_obj_is_type(pos_seq, &mp_type_tuple) || mp_obj_is_type(pos_seq, &mp_type_list)) {
-        // optimise the case of a tuple and list
-
-        // get the items
-        size_t len;
-        mp_obj_t *items;
-        mp_obj_get_array(pos_seq, &len, &items);
-
-        // allocate memory for the new array of args
-        args2_alloc = 1 + n_args + len + 2 * (n_kw + kw_dict_len);
-        args2 = mp_nonlocal_alloc(args2_alloc * sizeof(mp_obj_t));
-
-        // copy the self
-        if (self != MP_OBJ_NULL) {
-            args2[args2_len++] = self;
-        }
-
-        // copy the fixed and variable position args
-        mp_seq_cat(args2 + args2_len, args, n_args, items, len, mp_obj_t);
-        args2_len += n_args + len;
-
     } else {
-        // generic iterator
+        // at least one star arg to unpack
 
         // allocate memory for the new array of args
-        args2_alloc = 1 + n_args + 2 * (n_kw + kw_dict_len) + 3;
+        args2_alloc = 1 + n_args + list_len + 2 * (n_kw + kw_dict_len);
         args2 = mp_nonlocal_alloc(args2_alloc * sizeof(mp_obj_t));
 
         // copy the self
@@ -754,84 +781,118 @@ void mp_call_prepare_args_n_kw_var(bool have_self, size_t n_args_n_kw, const mp_
             args2[args2_len++] = self;
         }
 
-        // copy the fixed position args
-        mp_seq_copy(args2 + args2_len, args, n_args, mp_obj_t);
-        args2_len += n_args;
+        for (size_t i = 0; i < n_args; i++) {
+            mp_obj_t arg = args[i];
+            if ((star_args >> i) & 1) {
+                // star arg
+                if (mp_obj_is_type(arg, &mp_type_tuple) || mp_obj_is_type(arg, &mp_type_list)) {
+                    // optimise the case of a tuple and list
 
-        // extract the variable position args from the iterator
-        mp_obj_iter_buf_t iter_buf;
-        mp_obj_t iterable = mp_getiter(pos_seq, &iter_buf);
-        mp_obj_t item;
-        while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
-            if (args2_len >= args2_alloc) {
-                args2 = mp_nonlocal_realloc(args2, args2_alloc * sizeof(mp_obj_t), args2_alloc * 2 * sizeof(mp_obj_t));
-                args2_alloc *= 2;
+                    // get the items
+                    size_t len;
+                    mp_obj_t *items;
+                    mp_obj_get_array(arg, &len, &items);
+
+                    // copy the items
+                    assert(args2_len + len <= args2_alloc);
+                    mp_seq_copy(args2 + args2_len, items, len, mp_obj_t);
+                    args2_len += len;
+                } else {
+                    // generic iterator
+
+                    // extract the variable position args from the iterator
+                    mp_obj_iter_buf_t iter_buf;
+                    mp_obj_t iterable = mp_getiter(arg, &iter_buf);
+                    mp_obj_t item;
+                    while ((item = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
+                        if (args2_len >= args2_alloc) {
+                            args2 = mp_nonlocal_realloc(args2, args2_alloc * sizeof(mp_obj_t),
+                                args2_alloc * 2 * sizeof(mp_obj_t));
+                            args2_alloc *= 2;
+                        }
+                        args2[args2_len++] = item;
+                    }
+                }
+            } else {
+                // normal argument
+                assert(args2_len < args2_alloc);
+                args2[args2_len++] = arg;
             }
-            args2[args2_len++] = item;
         }
     }
 
     // The size of the args2 array now is the number of positional args.
-    uint pos_args_len = args2_len;
+    size_t pos_args_len = args2_len;
 
-    // Copy the fixed kw args.
-    mp_seq_copy(args2 + args2_len, args + n_args, 2 * n_kw, mp_obj_t);
-    args2_len += 2 * n_kw;
+    // ensure there is still enough room for kw args
+    if (args2_len + 2 * (n_kw + kw_dict_len) > args2_alloc) {
+        size_t new_alloc = args2_len + 2 * (n_kw + kw_dict_len);
+        args2 = mp_nonlocal_realloc(args2, args2_alloc * sizeof(mp_obj_t),
+            new_alloc * sizeof(mp_obj_t));
+        args2_alloc = new_alloc;
+    }
 
-    // Extract (key,value) pairs from kw_dict dictionary and append to args2.
-    // Note that it can be arbitrary iterator.
-    if (kw_dict == MP_OBJ_NULL) {
-        // pass
-    } else if (mp_obj_is_type(kw_dict, &mp_type_dict)) {
-        // dictionary
-        mp_map_t *map = mp_obj_dict_get_map(kw_dict);
-        assert(args2_len + 2 * map->used <= args2_alloc); // should have enough, since kw_dict_len is in this case hinted correctly above
-        for (size_t i = 0; i < map->alloc; i++) {
-            if (mp_map_slot_is_filled(map, i)) {
-                // the key must be a qstr, so intern it if it's a string
-                mp_obj_t key = map->table[i].key;
-                if (!mp_obj_is_qstr(key)) {
-                    key = mp_obj_str_intern_checked(key);
+    // Copy the kw args.
+    for (size_t i = 0; i < n_kw; i++) {
+        mp_obj_t kw_key = args[n_args + i * 2];
+        mp_obj_t kw_value = args[n_args + i * 2 + 1];
+        if (kw_key == MP_OBJ_NULL) {
+            // double-star args
+            if (mp_obj_is_type(kw_value, &mp_type_dict)) {
+                // dictionary
+                mp_map_t *map = mp_obj_dict_get_map(kw_value);
+                // should have enough, since kw_dict_len is in this case hinted correctly above
+                assert(args2_len + 2 * map->used <= args2_alloc);
+                for (size_t j = 0; j < map->alloc; j++) {
+                    if (mp_map_slot_is_filled(map, j)) {
+                        // the key must be a qstr, so intern it if it's a string
+                        mp_obj_t key = map->table[j].key;
+                        if (!mp_obj_is_qstr(key)) {
+                            key = mp_obj_str_intern_checked(key);
+                        }
+                        args2[args2_len++] = key;
+                        args2[args2_len++] = map->table[j].value;
+                    }
                 }
-                args2[args2_len++] = key;
-                args2[args2_len++] = map->table[i].value;
-            }
-        }
-    } else {
-        // generic mapping:
-        // - call keys() to get an iterable of all keys in the mapping
-        // - call __getitem__ for each key to get the corresponding value
+            } else {
+                // generic mapping:
+                // - call keys() to get an iterable of all keys in the mapping
+                // - call __getitem__ for each key to get the corresponding value
 
-        // get the keys iterable
-        mp_obj_t dest[3];
-        mp_load_method(kw_dict, MP_QSTR_keys, dest);
-        mp_obj_t iterable = mp_getiter(mp_call_method_n_kw(0, 0, dest), NULL);
+                // get the keys iterable
+                mp_obj_t dest[3];
+                mp_load_method(kw_value, MP_QSTR_keys, dest);
+                mp_obj_t iterable = mp_getiter(mp_call_method_n_kw(0, 0, dest), NULL);
 
-        mp_obj_t key;
-        while ((key = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
-            // expand size of args array if needed
-            if (args2_len + 1 >= args2_alloc) {
-                uint new_alloc = args2_alloc * 2;
-                if (new_alloc < 4) {
-                    new_alloc = 4;
+                mp_obj_t key;
+                while ((key = mp_iternext(iterable)) != MP_OBJ_STOP_ITERATION) {
+                    // expand size of args array if needed
+                    if (args2_len + 1 >= args2_alloc) {
+                        size_t new_alloc = args2_alloc * 2;
+                        args2 = mp_nonlocal_realloc(args2, args2_alloc * sizeof(mp_obj_t), new_alloc * sizeof(mp_obj_t));
+                        args2_alloc = new_alloc;
+                    }
+
+                    // the key must be a qstr, so intern it if it's a string
+                    if (!mp_obj_is_qstr(key)) {
+                        key = mp_obj_str_intern_checked(key);
+                    }
+
+                    // get the value corresponding to the key
+                    mp_load_method(kw_value, MP_QSTR___getitem__, dest);
+                    dest[2] = key;
+                    mp_obj_t value = mp_call_method_n_kw(1, 0, dest);
+
+                    // store the key/value pair in the argument array
+                    args2[args2_len++] = key;
+                    args2[args2_len++] = value;
                 }
-                args2 = mp_nonlocal_realloc(args2, args2_alloc * sizeof(mp_obj_t), new_alloc * sizeof(mp_obj_t));
-                args2_alloc = new_alloc;
             }
-
-            // the key must be a qstr, so intern it if it's a string
-            if (!mp_obj_is_qstr(key)) {
-                key = mp_obj_str_intern_checked(key);
-            }
-
-            // get the value corresponding to the key
-            mp_load_method(kw_dict, MP_QSTR___getitem__, dest);
-            dest[2] = key;
-            mp_obj_t value = mp_call_method_n_kw(1, 0, dest);
-
-            // store the key/value pair in the argument array
-            args2[args2_len++] = key;
-            args2[args2_len++] = value;
+        } else {
+            // normal kwarg
+            assert(args2_len + 2 <= args2_alloc);
+            args2[args2_len++] = kw_key;
+            args2[args2_len++] = kw_value;
         }
     }
 
@@ -1008,8 +1069,7 @@ STATIC const mp_obj_type_t mp_type_checked_fun = {
 };
 
 STATIC mp_obj_t mp_obj_new_checked_fun(const mp_obj_type_t *type, mp_obj_t fun) {
-    mp_obj_checked_fun_t *o = m_new_obj(mp_obj_checked_fun_t);
-    o->base.type = &mp_type_checked_fun;
+    mp_obj_checked_fun_t *o = mp_obj_malloc(mp_obj_checked_fun_t, &mp_type_checked_fun);
     o->type = type;
     o->fun = fun;
     return MP_OBJ_FROM_PTR(o);
@@ -1081,6 +1141,10 @@ void mp_load_method_maybe(mp_obj_t obj, qstr attr, mp_obj_t *dest) {
     dest[0] = MP_OBJ_NULL;
     dest[1] = MP_OBJ_NULL;
 
+    // Note: the specific case of obj being an instance type is fast-path'ed in the VM
+    // for the MP_BC_LOAD_ATTR opcode. Instance types handle type->attr and look up directly
+    // in their members' map.
+
     // get the type
     const mp_obj_type_t *type = mp_obj_get_type(obj);
 
@@ -1096,12 +1160,20 @@ void mp_load_method_maybe(mp_obj_t obj, qstr attr, mp_obj_t *dest) {
     if (attr == MP_QSTR___next__ && type->iternext != NULL) {
         dest[0] = MP_OBJ_FROM_PTR(&mp_builtin_next_obj);
         dest[1] = obj;
-
-    } else if (type->attr != NULL) {
+        return;
+    }
+    if (type->attr != NULL) {
         // this type can do its own load, so call it
         type->attr(obj, attr, dest);
-
-    } else if (type->locals_dict != NULL) {
+        // If type->attr has set dest[1] = MP_OBJ_SENTINEL, we should proceed
+        // with lookups below (i.e. in locals_dict). If not, return right away.
+        if (dest[1] != MP_OBJ_SENTINEL) {
+            return;
+        }
+        // Clear the fail flag set by type->attr so it's like it never ran.
+        dest[1] = MP_OBJ_NULL;
+    }
+    if (type->locals_dict != NULL) {
         // generic method lookup
         // this is a lookup in the object (ie not class or type)
         assert(type->locals_dict->base.type == &mp_type_dict); // MicroPython restriction, for now
@@ -1110,6 +1182,7 @@ void mp_load_method_maybe(mp_obj_t obj, qstr attr, mp_obj_t *dest) {
         if (elem != NULL) {
             mp_convert_member_lookup(obj, type, elem->value, dest);
         }
+        return;
     }
 }
 
@@ -1368,8 +1441,10 @@ mp_obj_t mp_make_raise_obj(mp_obj_t o) {
         // create and return a new exception instance by calling o
         // TODO could have an option to disable traceback, then builtin exceptions (eg TypeError)
         // could have const instances in ROM which we return here instead
-        return mp_call_function_n_kw(o, 0, 0, NULL);
-    } else if (mp_obj_is_exception_instance(o)) {
+        o = mp_call_function_n_kw(o, 0, 0, NULL);
+    }
+
+    if (mp_obj_is_exception_instance(o)) {
         // o is an instance of an exception, so use it as the exception
         return o;
     } else {
diff --git a/python/src/py/runtime.h b/python/src/py/runtime.h
index f0d41f38d..4393fbfa8 100644
--- a/python/src/py/runtime.h
+++ b/python/src/py/runtime.h
@@ -57,6 +57,15 @@ typedef struct _mp_arg_t {
     mp_arg_val_t defval;
 } mp_arg_t;
 
+struct _mp_sched_node_t;
+
+typedef void (*mp_sched_callback_t)(struct _mp_sched_node_t *);
+
+typedef struct _mp_sched_node_t {
+    mp_sched_callback_t callback;
+    struct _mp_sched_node_t *next;
+} mp_sched_node_t;
+
 // Tables mapping operator enums to qstrs, defined in objtype.c
 extern const byte mp_unary_op_method_name[];
 extern const byte mp_binary_op_method_name[];
@@ -74,6 +83,7 @@ void mp_sched_lock(void);
 void mp_sched_unlock(void);
 #define mp_sched_num_pending() (MP_STATE_VM(sched_len))
 bool mp_sched_schedule(mp_obj_t function, mp_obj_t arg);
+bool mp_sched_schedule_node(mp_sched_node_t *node, mp_sched_callback_t callback);
 #endif
 
 // extra printing method specifically for mp_obj_t's which are integral type
diff --git a/python/src/py/scheduler.c b/python/src/py/scheduler.c
index bd0bbf207..3966da297 100644
--- a/python/src/py/scheduler.c
+++ b/python/src/py/scheduler.c
@@ -90,6 +90,24 @@ void mp_handle_pending(bool raise_exc) {
 // or by the VM's inlined version of that function.
 void mp_handle_pending_tail(mp_uint_t atomic_state) {
     MP_STATE_VM(sched_state) = MP_SCHED_LOCKED;
+
+    #if MICROPY_SCHEDULER_STATIC_NODES
+    // Run all pending C callbacks.
+    while (MP_STATE_VM(sched_head) != NULL) {
+        mp_sched_node_t *node = MP_STATE_VM(sched_head);
+        MP_STATE_VM(sched_head) = node->next;
+        if (MP_STATE_VM(sched_head) == NULL) {
+            MP_STATE_VM(sched_tail) = NULL;
+        }
+        mp_sched_callback_t callback = node->callback;
+        node->callback = NULL;
+        MICROPY_END_ATOMIC_SECTION(atomic_state);
+        callback(node);
+        atomic_state = MICROPY_BEGIN_ATOMIC_SECTION();
+    }
+    #endif
+
+    // Run at most one pending Python callback.
     if (!mp_sched_empty()) {
         mp_sched_item_t item = MP_STATE_VM(sched_queue)[MP_STATE_VM(sched_idx)];
         MP_STATE_VM(sched_idx) = IDX_MASK(MP_STATE_VM(sched_idx) + 1);
@@ -99,6 +117,7 @@ void mp_handle_pending_tail(mp_uint_t atomic_state) {
     } else {
         MICROPY_END_ATOMIC_SECTION(atomic_state);
     }
+
     mp_sched_unlock();
 }
 
@@ -117,7 +136,11 @@ void mp_sched_unlock(void) {
     assert(MP_STATE_VM(sched_state) < 0);
     if (++MP_STATE_VM(sched_state) == 0) {
         // vm became unlocked
-        if (MP_STATE_THREAD(mp_pending_exception) != MP_OBJ_NULL || mp_sched_num_pending()) {
+        if (MP_STATE_THREAD(mp_pending_exception) != MP_OBJ_NULL
+            #if MICROPY_SCHEDULER_STATIC_NODES
+            || MP_STATE_VM(sched_head) != NULL
+            #endif
+            || mp_sched_num_pending()) {
             MP_STATE_VM(sched_state) = MP_SCHED_PENDING;
         } else {
             MP_STATE_VM(sched_state) = MP_SCHED_IDLE;
@@ -146,6 +169,33 @@ bool MICROPY_WRAP_MP_SCHED_SCHEDULE(mp_sched_schedule)(mp_obj_t function, mp_obj
     return ret;
 }
 
+#if MICROPY_SCHEDULER_STATIC_NODES
+bool mp_sched_schedule_node(mp_sched_node_t *node, mp_sched_callback_t callback) {
+    mp_uint_t atomic_state = MICROPY_BEGIN_ATOMIC_SECTION();
+    bool ret;
+    if (node->callback == NULL) {
+        if (MP_STATE_VM(sched_state) == MP_SCHED_IDLE) {
+            MP_STATE_VM(sched_state) = MP_SCHED_PENDING;
+        }
+        node->callback = callback;
+        node->next = NULL;
+        if (MP_STATE_VM(sched_tail) == NULL) {
+            MP_STATE_VM(sched_head) = node;
+        } else {
+            MP_STATE_VM(sched_tail)->next = node;
+        }
+        MP_STATE_VM(sched_tail) = node;
+        MICROPY_SCHED_HOOK_SCHEDULED;
+        ret = true;
+    } else {
+        // already scheduled
+        ret = false;
+    }
+    MICROPY_END_ATOMIC_SECTION(atomic_state);
+    return ret;
+}
+#endif
+
 #else // MICROPY_ENABLE_SCHEDULER
 
 // A variant of this is inlined in the VM at the pending exception check
diff --git a/python/src/py/scope.c b/python/src/py/scope.c
index 98e02fb53..8fc094328 100644
--- a/python/src/py/scope.c
+++ b/python/src/py/scope.c
@@ -40,7 +40,7 @@ STATIC const uint8_t scope_simple_name_table[] = {
     [SCOPE_GEN_EXPR] = MP_QSTR__lt_genexpr_gt_,
 };
 
-scope_t *scope_new(scope_kind_t kind, mp_parse_node_t pn, qstr source_file, mp_uint_t emit_options) {
+scope_t *scope_new(scope_kind_t kind, mp_parse_node_t pn, mp_uint_t emit_options) {
     // Make sure those qstrs indeed fit in an uint8_t.
     MP_STATIC_ASSERT(MP_QSTR__lt_module_gt_ <= UINT8_MAX);
     MP_STATIC_ASSERT(MP_QSTR__lt_lambda_gt_ <= UINT8_MAX);
@@ -52,7 +52,6 @@ scope_t *scope_new(scope_kind_t kind, mp_parse_node_t pn, qstr source_file, mp_u
     scope_t *scope = m_new0(scope_t, 1);
     scope->kind = kind;
     scope->pn = pn;
-    scope->source_file = source_file;
     if (kind == SCOPE_FUNCTION || kind == SCOPE_CLASS) {
         assert(MP_PARSE_NODE_IS_STRUCT(pn));
         scope->simple_name = MP_PARSE_NODE_LEAF_ARG(((mp_parse_node_struct_t *)pn)->nodes[0]);
diff --git a/python/src/py/scope.h b/python/src/py/scope.h
index edf164c4a..b781dde42 100644
--- a/python/src/py/scope.h
+++ b/python/src/py/scope.h
@@ -32,6 +32,7 @@
 typedef enum {
     ID_INFO_KIND_UNDECIDED,
     ID_INFO_KIND_GLOBAL_IMPLICIT,
+    ID_INFO_KIND_GLOBAL_IMPLICIT_ASSIGNED,
     ID_INFO_KIND_GLOBAL_EXPLICIT,
     ID_INFO_KIND_LOCAL, // in a function f, written and only referenced by f
     ID_INFO_KIND_CELL,  // in a function f, read/written by children of f
@@ -75,7 +76,6 @@ typedef struct _scope_t {
     struct _scope_t *next;
     mp_parse_node_t pn;
     mp_raw_code_t *raw_code;
-    uint16_t source_file; // a qstr
     uint16_t simple_name; // a qstr
     uint16_t scope_flags;  // see runtime0.h
     uint16_t emit_options; // see emitglue.h
@@ -90,7 +90,7 @@ typedef struct _scope_t {
     id_info_t *id_info;
 } scope_t;
 
-scope_t *scope_new(scope_kind_t kind, mp_parse_node_t pn, qstr source_file, mp_uint_t emit_options);
+scope_t *scope_new(scope_kind_t kind, mp_parse_node_t pn, mp_uint_t emit_options);
 void scope_free(scope_t *scope);
 id_info_t *scope_find_or_add_id(scope_t *scope, qstr qstr, id_info_kind_t kind);
 id_info_t *scope_find(scope_t *scope, qstr qstr);
diff --git a/python/src/py/showbc.c b/python/src/py/showbc.c
index cb81b8835..f9c334b93 100644
--- a/python/src/py/showbc.c
+++ b/python/src/py/showbc.c
@@ -28,7 +28,7 @@
 #include <assert.h>
 
 #include "py/bc0.h"
-#include "py/bc.h"
+#include "py/emitglue.h"
 
 #if MICROPY_DEBUG_PRINTERS
 
@@ -38,80 +38,90 @@
             unum = (unum << 7) + (*ip & 0x7f); \
         } while ((*ip++ & 0x80) != 0); \
 }
-#define DECODE_ULABEL do { unum = (ip[0] | (ip[1] << 8)); ip += 2; } while (0)
-#define DECODE_SLABEL do { unum = (ip[0] | (ip[1] << 8)) - 0x8000; ip += 2; } while (0)
 
-#if MICROPY_PERSISTENT_CODE
+#define DECODE_ULABEL \
+    do { \
+        if (ip[0] & 0x80) { \
+            unum = ((ip[0] & 0x7f) | (ip[1] << 7)); \
+            ip += 2; \
+        } else { \
+            unum = ip[0]; \
+            ip += 1; \
+        } \
+    } while (0)
+
+#define DECODE_SLABEL \
+    do { \
+        if (ip[0] & 0x80) { \
+            unum = ((ip[0] & 0x7f) | (ip[1] << 7)) - 0x4000; \
+            ip += 2; \
+        } else { \
+            unum = ip[0] - 0x40; \
+            ip += 1; \
+        } \
+    } while (0)
+
+#if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
 
 #define DECODE_QSTR \
-    qst = ip[0] | ip[1] << 8; \
-    ip += 2;
-#define DECODE_PTR \
     DECODE_UINT; \
-    unum = mp_showbc_const_table[unum]
-#define DECODE_OBJ \
-    DECODE_UINT; \
-    unum = mp_showbc_const_table[unum]
+    qst = qstr_table[unum]
 
 #else
 
-#define DECODE_QSTR { \
-        qst = 0; \
-        do { \
-            qst = (qst << 7) + (*ip & 0x7f); \
-        } while ((*ip++ & 0x80) != 0); \
-}
-#define DECODE_PTR do { \
-        ip = (byte *)MP_ALIGN(ip, sizeof(void *)); \
-        unum = (uintptr_t)*(void **)ip; \
-        ip += sizeof(void *); \
-} while (0)
-#define DECODE_OBJ do { \
-        ip = (byte *)MP_ALIGN(ip, sizeof(mp_obj_t)); \
-        unum = (mp_uint_t)*(mp_obj_t *)ip; \
-        ip += sizeof(mp_obj_t); \
-} while (0)
+#define DECODE_QSTR \
+    DECODE_UINT; \
+    qst = unum;
 
 #endif
 
-const byte *mp_showbc_code_start;
-const mp_uint_t *mp_showbc_const_table;
+#define DECODE_PTR \
+    DECODE_UINT; \
+    unum = (mp_uint_t)(uintptr_t)child_table[unum]
 
-void mp_bytecode_print(const mp_print_t *print, const void *descr, const byte *ip, mp_uint_t len, const mp_uint_t *const_table) {
-    mp_showbc_code_start = ip;
+#define DECODE_OBJ \
+    DECODE_UINT; \
+    unum = (mp_uint_t)obj_table[unum]
+
+void mp_bytecode_print(const mp_print_t *print, const mp_raw_code_t *rc, const mp_module_constants_t *cm) {
+    const byte *ip_start = rc->fun_data;
+    const byte *ip = rc->fun_data;
 
     // Decode prelude
     MP_BC_PRELUDE_SIG_DECODE(ip);
     MP_BC_PRELUDE_SIZE_DECODE(ip);
     const byte *code_info = ip;
 
-    #if MICROPY_PERSISTENT_CODE
-    qstr block_name = code_info[0] | (code_info[1] << 8);
-    qstr source_file = code_info[2] | (code_info[3] << 8);
-    code_info += 4;
-    #else
     qstr block_name = mp_decode_uint(&code_info);
-    qstr source_file = mp_decode_uint(&code_info);
+    #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+    block_name = cm->qstr_table[block_name];
+    qstr source_file = cm->qstr_table[0];
+    #else
+    qstr source_file = cm->source_file;
     #endif
-    mp_printf(print, "File %s, code block '%s' (descriptor: %p, bytecode @%p " UINT_FMT " bytes)\n",
-        qstr_str(source_file), qstr_str(block_name), descr, mp_showbc_code_start, len);
+    mp_printf(print, "File %s, code block '%s' (descriptor: %p, bytecode @%p %u bytes)\n",
+        qstr_str(source_file), qstr_str(block_name), rc, ip_start, (unsigned)rc->fun_data_len);
 
     // raw bytecode dump
-    size_t prelude_size = ip - mp_showbc_code_start + n_info + n_cell;
-    mp_printf(print, "Raw bytecode (code_info_size=" UINT_FMT ", bytecode_size=" UINT_FMT "):\n",
-        prelude_size, len - prelude_size);
-    for (mp_uint_t i = 0; i < len; i++) {
+    size_t prelude_size = ip - ip_start + n_info + n_cell;
+    mp_printf(print, "Raw bytecode (code_info_size=%u, bytecode_size=%u):\n",
+        (unsigned)prelude_size, (unsigned)(rc->fun_data_len - prelude_size));
+    for (size_t i = 0; i < rc->fun_data_len; i++) {
         if (i > 0 && i % 16 == 0) {
             mp_printf(print, "\n");
         }
-        mp_printf(print, " %02x", mp_showbc_code_start[i]);
+        mp_printf(print, " %02x", ip_start[i]);
     }
     mp_printf(print, "\n");
 
     // bytecode prelude: arg names (as qstr objects)
     mp_printf(print, "arg names:");
     for (mp_uint_t i = 0; i < n_pos_args + n_kwonly_args; i++) {
-        mp_printf(print, " %s", qstr_str(MP_OBJ_QSTR_VALUE(const_table[i])));
+        qstr qst = mp_decode_uint(&code_info);
+        #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+        qst = cm->qstr_table[qst];
+        #endif
+        mp_printf(print, " %s", qstr_str(qst));
     }
     mp_printf(print, "\n");
 
@@ -120,6 +130,7 @@ void mp_bytecode_print(const mp_print_t *print, const void *descr, const byte *i
 
     // skip over code_info
     ip += n_info;
+    const byte *line_info_top = ip;
 
     // bytecode prelude: initialise closed over variables
     for (size_t i = 0; i < n_cell; ++i) {
@@ -132,7 +143,7 @@ void mp_bytecode_print(const mp_print_t *print, const void *descr, const byte *i
         mp_int_t bc = 0;
         mp_uint_t source_line = 1;
         mp_printf(print, "  bc=" INT_FMT " line=" UINT_FMT "\n", bc, source_line);
-        for (const byte *ci = code_info; *ci;) {
+        for (const byte *ci = code_info; ci < line_info_top;) {
             if ((ci[0] & 0x80) == 0) {
                 // 0b0LLBBBBB encoding
                 bc += ci[0] & 0x1f;
@@ -147,10 +158,14 @@ void mp_bytecode_print(const mp_print_t *print, const void *descr, const byte *i
             mp_printf(print, "  bc=" INT_FMT " line=" UINT_FMT "\n", bc, source_line);
         }
     }
-    mp_bytecode_print2(print, ip, len - prelude_size, const_table);
+    mp_bytecode_print2(print, ip, rc->fun_data_len - prelude_size, rc->children, cm);
 }
 
-const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip) {
+const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip_start, const byte *ip, mp_raw_code_t *const *child_table, const mp_module_constants_t *cm) {
+    #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+    const qstr_short_t *qstr_table = cm->qstr_table;
+    #endif
+    const mp_obj_t *obj_table = cm->obj_table;
     mp_uint_t unum;
     qstr qst;
 
@@ -208,25 +223,16 @@ const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip) {
         case MP_BC_LOAD_NAME:
             DECODE_QSTR;
             mp_printf(print, "LOAD_NAME %s", qstr_str(qst));
-            if (MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE) {
-                mp_printf(print, " (cache=%u)", *ip++);
-            }
             break;
 
         case MP_BC_LOAD_GLOBAL:
             DECODE_QSTR;
             mp_printf(print, "LOAD_GLOBAL %s", qstr_str(qst));
-            if (MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE) {
-                mp_printf(print, " (cache=%u)", *ip++);
-            }
             break;
 
         case MP_BC_LOAD_ATTR:
             DECODE_QSTR;
             mp_printf(print, "LOAD_ATTR %s", qstr_str(qst));
-            if (MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE) {
-                mp_printf(print, " (cache=%u)", *ip++);
-            }
             break;
 
         case MP_BC_LOAD_METHOD:
@@ -270,9 +276,6 @@ const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip) {
         case MP_BC_STORE_ATTR:
             DECODE_QSTR;
             mp_printf(print, "STORE_ATTR %s", qstr_str(qst));
-            if (MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE) {
-                mp_printf(print, " (cache=%u)", *ip++);
-            }
             break;
 
         case MP_BC_STORE_SUBSCR:
@@ -321,32 +324,32 @@ const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip) {
 
         case MP_BC_JUMP:
             DECODE_SLABEL;
-            mp_printf(print, "JUMP " UINT_FMT, (mp_uint_t)(ip + unum - mp_showbc_code_start));
+            mp_printf(print, "JUMP " UINT_FMT, (mp_uint_t)(ip + unum - ip_start));
             break;
 
         case MP_BC_POP_JUMP_IF_TRUE:
             DECODE_SLABEL;
-            mp_printf(print, "POP_JUMP_IF_TRUE " UINT_FMT, (mp_uint_t)(ip + unum - mp_showbc_code_start));
+            mp_printf(print, "POP_JUMP_IF_TRUE " UINT_FMT, (mp_uint_t)(ip + unum - ip_start));
             break;
 
         case MP_BC_POP_JUMP_IF_FALSE:
             DECODE_SLABEL;
-            mp_printf(print, "POP_JUMP_IF_FALSE " UINT_FMT, (mp_uint_t)(ip + unum - mp_showbc_code_start));
+            mp_printf(print, "POP_JUMP_IF_FALSE " UINT_FMT, (mp_uint_t)(ip + unum - ip_start));
             break;
 
         case MP_BC_JUMP_IF_TRUE_OR_POP:
-            DECODE_SLABEL;
-            mp_printf(print, "JUMP_IF_TRUE_OR_POP " UINT_FMT, (mp_uint_t)(ip + unum - mp_showbc_code_start));
+            DECODE_ULABEL;
+            mp_printf(print, "JUMP_IF_TRUE_OR_POP " UINT_FMT, (mp_uint_t)(ip + unum - ip_start));
             break;
 
         case MP_BC_JUMP_IF_FALSE_OR_POP:
-            DECODE_SLABEL;
-            mp_printf(print, "JUMP_IF_FALSE_OR_POP " UINT_FMT, (mp_uint_t)(ip + unum - mp_showbc_code_start));
+            DECODE_ULABEL;
+            mp_printf(print, "JUMP_IF_FALSE_OR_POP " UINT_FMT, (mp_uint_t)(ip + unum - ip_start));
             break;
 
         case MP_BC_SETUP_WITH:
             DECODE_ULABEL; // loop-like labels are always forward
-            mp_printf(print, "SETUP_WITH " UINT_FMT, (mp_uint_t)(ip + unum - mp_showbc_code_start));
+            mp_printf(print, "SETUP_WITH " UINT_FMT, (mp_uint_t)(ip + unum - ip_start));
             break;
 
         case MP_BC_WITH_CLEANUP:
@@ -355,18 +358,18 @@ const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip) {
 
         case MP_BC_UNWIND_JUMP:
             DECODE_SLABEL;
-            mp_printf(print, "UNWIND_JUMP " UINT_FMT " %d", (mp_uint_t)(ip + unum - mp_showbc_code_start), *ip);
+            mp_printf(print, "UNWIND_JUMP " UINT_FMT " %d", (mp_uint_t)(ip + unum - ip_start), *ip);
             ip += 1;
             break;
 
         case MP_BC_SETUP_EXCEPT:
             DECODE_ULABEL; // except labels are always forward
-            mp_printf(print, "SETUP_EXCEPT " UINT_FMT, (mp_uint_t)(ip + unum - mp_showbc_code_start));
+            mp_printf(print, "SETUP_EXCEPT " UINT_FMT, (mp_uint_t)(ip + unum - ip_start));
             break;
 
         case MP_BC_SETUP_FINALLY:
             DECODE_ULABEL; // except labels are always forward
-            mp_printf(print, "SETUP_FINALLY " UINT_FMT, (mp_uint_t)(ip + unum - mp_showbc_code_start));
+            mp_printf(print, "SETUP_FINALLY " UINT_FMT, (mp_uint_t)(ip + unum - ip_start));
             break;
 
         case MP_BC_END_FINALLY:
@@ -387,12 +390,12 @@ const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip) {
 
         case MP_BC_FOR_ITER:
             DECODE_ULABEL; // the jump offset if iteration finishes; for labels are always forward
-            mp_printf(print, "FOR_ITER " UINT_FMT, (mp_uint_t)(ip + unum - mp_showbc_code_start));
+            mp_printf(print, "FOR_ITER " UINT_FMT, (mp_uint_t)(ip + unum - ip_start));
             break;
 
         case MP_BC_POP_EXCEPT_JUMP:
             DECODE_ULABEL; // these labels are always forward
-            mp_printf(print, "POP_EXCEPT_JUMP " UINT_FMT, (mp_uint_t)(ip + unum - mp_showbc_code_start));
+            mp_printf(print, "POP_EXCEPT_JUMP " UINT_FMT, (mp_uint_t)(ip + unum - ip_start));
             break;
 
         case MP_BC_BUILD_TUPLE:
@@ -531,7 +534,8 @@ const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip) {
             } else if (ip[-1] < MP_BC_STORE_FAST_MULTI + 16) {
                 mp_printf(print, "STORE_FAST " UINT_FMT, (mp_uint_t)ip[-1] - MP_BC_STORE_FAST_MULTI);
             } else if (ip[-1] < MP_BC_UNARY_OP_MULTI + MP_UNARY_OP_NUM_BYTECODE) {
-                mp_printf(print, "UNARY_OP " UINT_FMT, (mp_uint_t)ip[-1] - MP_BC_UNARY_OP_MULTI);
+                mp_uint_t op = ip[-1] - MP_BC_UNARY_OP_MULTI;
+                mp_printf(print, "UNARY_OP " UINT_FMT " %s", op, qstr_str(mp_unary_op_method_name[op]));
             } else if (ip[-1] < MP_BC_BINARY_OP_MULTI + MP_BINARY_OP_NUM_BYTECODE) {
                 mp_uint_t op = ip[-1] - MP_BC_BINARY_OP_MULTI;
                 mp_printf(print, "BINARY_OP " UINT_FMT " %s", op, qstr_str(mp_binary_op_method_name[op]));
@@ -546,12 +550,11 @@ const byte *mp_bytecode_print_str(const mp_print_t *print, const byte *ip) {
     return ip;
 }
 
-void mp_bytecode_print2(const mp_print_t *print, const byte *ip, size_t len, const mp_uint_t *const_table) {
-    mp_showbc_code_start = ip;
-    mp_showbc_const_table = const_table;
-    while (ip < len + mp_showbc_code_start) {
-        mp_printf(print, "%02u ", (uint)(ip - mp_showbc_code_start));
-        ip = mp_bytecode_print_str(print, ip);
+void mp_bytecode_print2(const mp_print_t *print, const byte *ip, size_t len, mp_raw_code_t *const *child_table, const mp_module_constants_t *cm) {
+    const byte *ip_start = ip;
+    while (ip < ip_start + len) {
+        mp_printf(print, "%02u ", (uint)(ip - ip_start));
+        ip = mp_bytecode_print_str(print, ip_start, ip, child_table, cm);
         mp_printf(print, "\n");
     }
 }
diff --git a/python/src/py/smallint.h b/python/src/py/smallint.h
index 67daf9b9f..584e0018d 100644
--- a/python/src/py/smallint.h
+++ b/python/src/py/smallint.h
@@ -61,6 +61,13 @@
 
 #define MP_SMALL_INT_MAX ((mp_int_t)(~(MP_SMALL_INT_MIN)))
 
+// https://stackoverflow.com/a/4589384/1976323
+// Number of bits in inttype_MAX, or in any (1<<k)-1 where 0 <= k < 2040
+#define MP_IMAX_BITS(m) ((m) / ((m) % 255 + 1) / 255 % 255 * 8 + 7 - 86 / ((m) % 255 + 12))
+
+// The number of bits in a MP_SMALL_INT including the sign bit.
+#define MP_SMALL_INT_BITS (MP_IMAX_BITS(MP_SMALL_INT_MAX) + 1)
+
 bool mp_small_int_mul_overflow(mp_int_t x, mp_int_t y);
 mp_int_t mp_small_int_modulo(mp_int_t dividend, mp_int_t divisor);
 mp_int_t mp_small_int_floor_divide(mp_int_t num, mp_int_t denom);
diff --git a/python/src/py/vm.c b/python/src/py/vm.c
index bbfc9914e..02f8bc88c 100644
--- a/python/src/py/vm.c
+++ b/python/src/py/vm.c
@@ -31,9 +31,9 @@
 
 #include "py/emitglue.h"
 #include "py/objtype.h"
+#include "py/objfun.h"
 #include "py/runtime.h"
 #include "py/bc0.h"
-#include "py/bc.h"
 #include "py/profile.h"
 
 // *FORMAT-OFF*
@@ -44,7 +44,7 @@
 #else
 #define TRACE_PREFIX mp_printf(&mp_plat_print, "sp=%d ", (int)(sp - &code_state->state[0] + 1))
 #endif
-#define TRACE(ip) TRACE_PREFIX; mp_bytecode_print2(&mp_plat_print, ip, 1, code_state->fun_bc->const_table);
+#define TRACE(ip) TRACE_PREFIX; mp_bytecode_print2(&mp_plat_print, ip, 1, code_state->fun_bc->child_table, &code_state->fun_bc->context->constants);
 #else
 #define TRACE(ip)
 #endif
@@ -61,38 +61,53 @@
     do { \
         unum = (unum << 7) + (*ip & 0x7f); \
     } while ((*ip++ & 0x80) != 0)
-#define DECODE_ULABEL size_t ulab = (ip[0] | (ip[1] << 8)); ip += 2
-#define DECODE_SLABEL size_t slab = (ip[0] | (ip[1] << 8)) - 0x8000; ip += 2
 
-#if MICROPY_PERSISTENT_CODE
+#define DECODE_ULABEL \
+    size_t ulab; \
+    do { \
+        if (ip[0] & 0x80) { \
+            ulab = ((ip[0] & 0x7f) | (ip[1] << 7)); \
+            ip += 2; \
+        } else { \
+            ulab = ip[0]; \
+            ip += 1; \
+        } \
+    } while (0)
+
+#define DECODE_SLABEL \
+    size_t slab; \
+    do { \
+        if (ip[0] & 0x80) { \
+            slab = ((ip[0] & 0x7f) | (ip[1] << 7)) - 0x4000; \
+            ip += 2; \
+        } else { \
+            slab = ip[0] - 0x40; \
+            ip += 1; \
+        } \
+    } while (0)
+
+#if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
 
 #define DECODE_QSTR \
-    qstr qst = ip[0] | ip[1] << 8; \
-    ip += 2;
-#define DECODE_PTR \
     DECODE_UINT; \
-    void *ptr = (void*)(uintptr_t)code_state->fun_bc->const_table[unum]
-#define DECODE_OBJ \
-    DECODE_UINT; \
-    mp_obj_t obj = (mp_obj_t)code_state->fun_bc->const_table[unum]
+    qstr qst = qstr_table[unum]
 
 #else
 
-#define DECODE_QSTR qstr qst = 0; \
-    do { \
-        qst = (qst << 7) + (*ip & 0x7f); \
-    } while ((*ip++ & 0x80) != 0)
-#define DECODE_PTR \
-    ip = (byte*)MP_ALIGN(ip, sizeof(void*)); \
-    void *ptr = *(void**)ip; \
-    ip += sizeof(void*)
-#define DECODE_OBJ \
-    ip = (byte*)MP_ALIGN(ip, sizeof(mp_obj_t)); \
-    mp_obj_t obj = *(mp_obj_t*)ip; \
-    ip += sizeof(mp_obj_t)
+#define DECODE_QSTR \
+    DECODE_UINT; \
+    qstr qst = unum;
 
 #endif
 
+#define DECODE_PTR \
+    DECODE_UINT; \
+    void *ptr = (void *)(uintptr_t)code_state->fun_bc->child_table[unum]
+
+#define DECODE_OBJ \
+    DECODE_UINT; \
+    mp_obj_t obj = (mp_obj_t)code_state->fun_bc->context->constants.obj_table[unum]
+
 #define PUSH(val) *++sp = (val)
 #define POP() (*sp--)
 #define TOP() (*sp)
@@ -180,30 +195,13 @@
 #define TRACE_TICK(current_ip, current_sp, is_exception)
 #endif // MICROPY_PY_SYS_SETTRACE
 
-#if MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE
-static inline mp_map_elem_t *mp_map_cached_lookup(mp_map_t *map, qstr qst, uint8_t *idx_cache) {
-    size_t idx = *idx_cache;
-    mp_obj_t key = MP_OBJ_NEW_QSTR(qst);
-    mp_map_elem_t *elem = NULL;
-    if (idx < map->alloc && map->table[idx].key == key) {
-        elem = &map->table[idx];
-    } else {
-        elem = mp_map_lookup(map, key, MP_MAP_LOOKUP);
-        if (elem != NULL) {
-            *idx_cache = (elem - &map->table[0]) & 0xff;
-        }
-    }
-    return elem;
-}
-#endif
-
 // fastn has items in reverse order (fastn[0] is local[0], fastn[-1] is local[1], etc)
 // sp points to bottom of stack which grows up
 // returns:
 //  MP_VM_RETURN_NORMAL, sp valid, return value in *sp
 //  MP_VM_RETURN_YIELD, ip, sp valid, yielded value in *sp
 //  MP_VM_RETURN_EXCEPTION, exception in state[0]
-mp_vm_return_kind_t mp_execute_bytecode(mp_code_state_t *code_state, volatile mp_obj_t inject_exc) {
+mp_vm_return_kind_t MICROPY_WRAP_MP_EXECUTE_BYTECODE(mp_execute_bytecode)(mp_code_state_t *code_state, volatile mp_obj_t inject_exc) {
 #define SELECTIVE_EXC_IP (0)
 #if SELECTIVE_EXC_IP
 #define MARK_EXC_IP_SELECTIVE() { code_state->ip = ip; } /* stores ip 1 byte past last opcode */
@@ -272,6 +270,9 @@ outer_dispatch_loop:
             // local variables that are not visible to the exception handler
             const byte *ip = code_state->ip;
             mp_obj_t *sp = code_state->sp;
+            #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+            const qstr_short_t *qstr_table = code_state->fun_bc->context->constants.qstr_table;
+            #endif
             mp_obj_t obj_shared;
             MICROPY_VM_HOOK_INIT
 
@@ -361,84 +362,46 @@ dispatch_loop:
                     goto load_check;
                 }
 
-                #if !MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE
                 ENTRY(MP_BC_LOAD_NAME): {
                     MARK_EXC_IP_SELECTIVE();
                     DECODE_QSTR;
                     PUSH(mp_load_name(qst));
                     DISPATCH();
                 }
-                #else
-                ENTRY(MP_BC_LOAD_NAME): {
-                    MARK_EXC_IP_SELECTIVE();
-                    DECODE_QSTR;
-                    mp_map_elem_t *elem = mp_map_cached_lookup(&mp_locals_get()->map, qst, (uint8_t*)ip);
-                    mp_obj_t obj;
-                    if (elem != NULL) {
-                        obj = elem->value;
-                    } else {
-                        obj = mp_load_name(qst);
-                    }
-                    PUSH(obj);
-                    ip++;
-                    DISPATCH();
-                }
-                #endif
 
-                #if !MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE
                 ENTRY(MP_BC_LOAD_GLOBAL): {
                     MARK_EXC_IP_SELECTIVE();
                     DECODE_QSTR;
                     PUSH(mp_load_global(qst));
                     DISPATCH();
                 }
-                #else
-                ENTRY(MP_BC_LOAD_GLOBAL): {
-                    MARK_EXC_IP_SELECTIVE();
-                    DECODE_QSTR;
-                    mp_map_elem_t *elem = mp_map_cached_lookup(&mp_globals_get()->map, qst, (uint8_t*)ip);
-                    mp_obj_t obj;
-                    if (elem != NULL) {
-                        obj = elem->value;
-                    } else {
-                        obj = mp_load_global(qst);
-                    }
-                    PUSH(obj);
-                    ip++;
-                    DISPATCH();
-                }
-                #endif
 
-                #if !MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE
-                ENTRY(MP_BC_LOAD_ATTR): {
-                    FRAME_UPDATE();
-                    MARK_EXC_IP_SELECTIVE();
-                    DECODE_QSTR;
-                    SET_TOP(mp_load_attr(TOP(), qst));
-                    DISPATCH();
-                }
-                #else
                 ENTRY(MP_BC_LOAD_ATTR): {
                     FRAME_UPDATE();
                     MARK_EXC_IP_SELECTIVE();
                     DECODE_QSTR;
                     mp_obj_t top = TOP();
+                    mp_obj_t obj;
+                    #if MICROPY_OPT_LOAD_ATTR_FAST_PATH
+                    // For the specific case of an instance type, it implements .attr
+                    // and forwards to its members map. Attribute lookups on instance
+                    // types are extremely common, so avoid all the other checks and
+                    // calls that normally happen first.
                     mp_map_elem_t *elem = NULL;
                     if (mp_obj_is_instance_type(mp_obj_get_type(top))) {
                         mp_obj_instance_t *self = MP_OBJ_TO_PTR(top);
-                        elem = mp_map_cached_lookup(&self->members, qst, (uint8_t*)ip);
+                        elem = mp_map_lookup(&self->members, MP_OBJ_NEW_QSTR(qst), MP_MAP_LOOKUP);
                     }
-                    mp_obj_t obj;
-                    if (elem != NULL) {
+                    if (elem) {
                         obj = elem->value;
-                    } else {
+                    } else
+                    #endif
+                    {
                         obj = mp_load_attr(top, qst);
                     }
                     SET_TOP(obj);
-                    ip++;
                     DISPATCH();
                 }
-                #endif
 
                 ENTRY(MP_BC_LOAD_METHOD): {
                     MARK_EXC_IP_SELECTIVE();
@@ -494,7 +457,6 @@ dispatch_loop:
                     DISPATCH();
                 }
 
-                #if !MICROPY_OPT_CACHE_MAP_LOOKUP_IN_BYTECODE
                 ENTRY(MP_BC_STORE_ATTR): {
                     FRAME_UPDATE();
                     MARK_EXC_IP_SELECTIVE();
@@ -503,32 +465,6 @@ dispatch_loop:
                     sp -= 2;
                     DISPATCH();
                 }
-                #else
-                // This caching code works with MICROPY_PY_BUILTINS_PROPERTY and/or
-                // MICROPY_PY_DESCRIPTORS enabled because if the attr exists in
-                // self->members then it can't be a property or have descriptors.  A
-                // consequence of this is that we can't use MP_MAP_LOOKUP_ADD_IF_NOT_FOUND
-                // in the fast-path below, because that store could override a property.
-                ENTRY(MP_BC_STORE_ATTR): {
-                    FRAME_UPDATE();
-                    MARK_EXC_IP_SELECTIVE();
-                    DECODE_QSTR;
-                    mp_map_elem_t *elem = NULL;
-                    mp_obj_t top = TOP();
-                    if (mp_obj_is_instance_type(mp_obj_get_type(top)) && sp[-1] != MP_OBJ_NULL) {
-                        mp_obj_instance_t *self = MP_OBJ_TO_PTR(top);
-                        elem = mp_map_cached_lookup(&self->members, qst, (uint8_t*)ip);
-                    }
-                    if (elem != NULL) {
-                        elem->value = sp[-1];
-                    } else {
-                        mp_store_attr(sp[0], qst, sp[-1]);
-                    }
-                    sp -= 2;
-                    ip++;
-                    DISPATCH();
-                }
-                #endif
 
                 ENTRY(MP_BC_STORE_SUBSCR):
                     MARK_EXC_IP_SELECTIVE();
@@ -624,9 +560,9 @@ dispatch_loop:
                 }
 
                 ENTRY(MP_BC_JUMP_IF_TRUE_OR_POP): {
-                    DECODE_SLABEL;
+                    DECODE_ULABEL;
                     if (mp_obj_is_true(TOP())) {
-                        ip += slab;
+                        ip += ulab;
                     } else {
                         sp--;
                     }
@@ -634,11 +570,11 @@ dispatch_loop:
                 }
 
                 ENTRY(MP_BC_JUMP_IF_FALSE_OR_POP): {
-                    DECODE_SLABEL;
+                    DECODE_ULABEL;
                     if (mp_obj_is_true(TOP())) {
                         sp--;
                     } else {
-                        ip += slab;
+                        ip += ulab;
                     }
                     DISPATCH_WITH_PEND_EXC_CHECK();
                 }
@@ -809,8 +745,8 @@ unwind_jump:;
                     obj = mp_getiter(obj, iter_buf);
                     if (obj != MP_OBJ_FROM_PTR(iter_buf)) {
                         // Iterator didn't use the stack so indicate that with MP_OBJ_NULL.
-                        sp[-MP_OBJ_ITER_BUF_NSLOTS + 1] = MP_OBJ_NULL;
-                        sp[-MP_OBJ_ITER_BUF_NSLOTS + 2] = obj;
+                        *(sp - MP_OBJ_ITER_BUF_NSLOTS + 1) = MP_OBJ_NULL;
+                        *(sp - MP_OBJ_ITER_BUF_NSLOTS + 2) = obj;
                     }
                     DISPATCH();
                 }
@@ -821,8 +757,8 @@ unwind_jump:;
                     DECODE_ULABEL; // the jump offset if iteration finishes; for labels are always forward
                     code_state->sp = sp;
                     mp_obj_t obj;
-                    if (sp[-MP_OBJ_ITER_BUF_NSLOTS + 1] == MP_OBJ_NULL) {
-                        obj = sp[-MP_OBJ_ITER_BUF_NSLOTS + 2];
+                    if (*(sp - MP_OBJ_ITER_BUF_NSLOTS + 1) == MP_OBJ_NULL) {
+                        obj = *(sp - MP_OBJ_ITER_BUF_NSLOTS + 2);
                     } else {
                         obj = MP_OBJ_FROM_PTR(&sp[-MP_OBJ_ITER_BUF_NSLOTS + 1]);
                     }
@@ -941,15 +877,15 @@ unwind_jump:;
 
                 ENTRY(MP_BC_MAKE_FUNCTION): {
                     DECODE_PTR;
-                    PUSH(mp_make_function_from_raw_code(ptr, MP_OBJ_NULL, MP_OBJ_NULL));
+                    PUSH(mp_make_function_from_raw_code(ptr, code_state->fun_bc->context, NULL));
                     DISPATCH();
                 }
 
                 ENTRY(MP_BC_MAKE_FUNCTION_DEFARGS): {
                     DECODE_PTR;
                     // Stack layout: def_tuple def_dict <- TOS
-                    mp_obj_t def_dict = POP();
-                    SET_TOP(mp_make_function_from_raw_code(ptr, TOP(), def_dict));
+                    sp -= 1;
+                    SET_TOP(mp_make_function_from_raw_code(ptr, code_state->fun_bc->context, sp));
                     DISPATCH();
                 }
 
@@ -958,7 +894,7 @@ unwind_jump:;
                     size_t n_closed_over = *ip++;
                     // Stack layout: closed_overs <- TOS
                     sp -= n_closed_over - 1;
-                    SET_TOP(mp_make_closure_from_raw_code(ptr, n_closed_over, sp));
+                    SET_TOP(mp_make_closure_from_raw_code(ptr, code_state->fun_bc->context, n_closed_over, sp));
                     DISPATCH();
                 }
 
@@ -967,7 +903,7 @@ unwind_jump:;
                     size_t n_closed_over = *ip++;
                     // Stack layout: def_tuple def_dict closed_overs <- TOS
                     sp -= 2 + n_closed_over - 1;
-                    SET_TOP(mp_make_closure_from_raw_code(ptr, 0x100 | n_closed_over, sp));
+                    SET_TOP(mp_make_closure_from_raw_code(ptr, code_state->fun_bc->context, 0x100 | n_closed_over, sp));
                     DISPATCH();
                 }
 
@@ -1013,8 +949,8 @@ unwind_jump:;
                     // unum & 0xff == n_positional
                     // (unum >> 8) & 0xff == n_keyword
                     // We have following stack layout here:
-                    // fun arg0 arg1 ... kw0 val0 kw1 val1 ... seq dict <- TOS
-                    sp -= (unum & 0xff) + ((unum >> 7) & 0x1fe) + 2;
+                    // fun arg0 arg1 ... kw0 val0 kw1 val1 ... bitmap <- TOS
+                    sp -= (unum & 0xff) + ((unum >> 7) & 0x1fe) + 1;
                     #if MICROPY_STACKLESS
                     if (mp_obj_get_type(*sp) == &mp_type_fun_bc) {
                         code_state->ip = ip;
@@ -1098,8 +1034,8 @@ unwind_jump:;
                     // unum & 0xff == n_positional
                     // (unum >> 8) & 0xff == n_keyword
                     // We have following stack layout here:
-                    // fun self arg0 arg1 ... kw0 val0 kw1 val1 ... seq dict <- TOS
-                    sp -= (unum & 0xff) + ((unum >> 7) & 0x1fe) + 3;
+                    // fun self arg0 arg1 ... kw0 val0 kw1 val1 ... bitmap <- TOS
+                    sp -= (unum & 0xff) + ((unum >> 7) & 0x1fe) + 2;
                     #if MICROPY_STACKLESS
                     if (mp_obj_get_type(*sp) == &mp_type_fun_bc) {
                         code_state->ip = ip;
@@ -1466,23 +1402,20 @@ unwind_loop:
                 const byte *ip = code_state->fun_bc->bytecode;
                 MP_BC_PRELUDE_SIG_DECODE(ip);
                 MP_BC_PRELUDE_SIZE_DECODE(ip);
+                const byte *line_info_top = ip + n_info;
                 const byte *bytecode_start = ip + n_info + n_cell;
-                #if !MICROPY_PERSISTENT_CODE
-                // so bytecode is aligned
-                bytecode_start = MP_ALIGN(bytecode_start, sizeof(mp_uint_t));
-                #endif
                 size_t bc = code_state->ip - bytecode_start;
-                #if MICROPY_PERSISTENT_CODE
-                qstr block_name = ip[0] | (ip[1] << 8);
-                qstr source_file = ip[2] | (ip[3] << 8);
-                ip += 4;
-                #else
                 qstr block_name = mp_decode_uint_value(ip);
-                ip = mp_decode_uint_skip(ip);
-                qstr source_file = mp_decode_uint_value(ip);
-                ip = mp_decode_uint_skip(ip);
+                for (size_t i = 0; i < 1 + n_pos_args + n_kwonly_args; ++i) {
+                    ip = mp_decode_uint_skip(ip);
+                }
+                #if MICROPY_EMIT_BYTECODE_USES_QSTR_TABLE
+                block_name = code_state->fun_bc->context->constants.qstr_table[block_name];
+                qstr source_file = code_state->fun_bc->context->constants.qstr_table[0];
+                #else
+                qstr source_file = code_state->fun_bc->context->constants.source_file;
                 #endif
-                size_t source_line = mp_bytecode_get_source_line(ip, bc);
+                size_t source_line = mp_bytecode_get_source_line(ip, line_info_top, bc);
                 mp_obj_exception_add_traceback(MP_OBJ_FROM_PTR(nlr.ret_val), source_file, source_line, block_name);
             }
 
diff --git a/python/test/ulab.cpp b/python/test/ulab.cpp
new file mode 100644
index 000000000..9246c1d3e
--- /dev/null
+++ b/python/test/ulab.cpp
@@ -0,0 +1,52 @@
+#include <quiz.h>
+#include "execution_environment.h"
+
+QUIZ_CASE(python_ulab) { // Checks that ulab imports work and that a few numpy/scipy calls succeed or fail as expected
+  TestExecutionEnvironment env = init_environnement();
+  // Import the ulab module and its submodules, including the aliases (np, spy) used by the commands below
+  assert_command_execution_succeeds(env, "import ulab");
+  assert_command_execution_succeeds(env, "import ulab as ul");
+  assert_command_execution_succeeds(env, "from ulab import *");
+  assert_command_execution_succeeds(env, "from ulab import numpy");
+  assert_command_execution_succeeds(env, "from ulab import numpy as np");
+  assert_command_execution_succeeds(env, "from ulab import scipy");
+  assert_command_execution_succeeds(env, "from ulab import scipy as sp");
+  assert_command_execution_succeeds(env, "from ulab import scipy as spy");
+  // NumPy tests: build an array from a list literal
+  assert_command_execution_succeeds(env, "np.array([1, 2, 3])");
+  // Store a 3-element array in a variable and index it: 0..2 must succeed, 3 must fail
+  assert_command_execution_succeeds(env, "a = np.array([1, 2, 3])");
+  assert_command_execution_succeeds(env, "a[0]");
+  assert_command_execution_succeeds(env, "a[1]");
+  assert_command_execution_succeeds(env, "a[2]");
+  assert_command_execution_fails(env, "a[3]"); // index 3 is past the end of the 3-element array
+  // Test np.all on a plain list argument
+  assert_command_execution_succeeds(env, "np.all([1, 2, 3])");
+  // SciPy tests (all commands below go through the "spy" alias imported above)
+  // Test ulab.scipy.linalg: one call expected to succeed, one expected to fail
+  assert_command_execution_succeeds(env, "spy.linalg.solve_triangular(np.array([[1, 2], [3, 4]]), np.array([5, 6]))");
+  assert_command_execution_fails(env, "spy.linalg.solve_triangular([[1, 2], [3, 4]], [1, 2, 3])");
+  // Test ulab.scipy.optimize: calls with maxiter=0 are expected to fail, the others to succeed
+  assert_command_execution_succeeds(env, "spy.optimize.fmin(lambda x: x**2, 1)");
+  assert_command_execution_fails(env, "spy.optimize.fmin(lambda x: x**2, 1, maxiter=0)");
+  assert_command_execution_succeeds(env, "spy.optimize.fmin(lambda x: x**2, 1, maxiter=1)");
+  assert_command_execution_fails(env, "spy.optimize.bisect(lambda x: x**2, 1, 2, maxiter=0)");
+  assert_command_execution_succeeds(env, "spy.optimize.newton(lambda x: x**2, 1)");
+  assert_command_execution_fails(env, "spy.optimize.newton(lambda x: x**2, 1, maxiter=0)");
+  assert_command_execution_succeeds(env, "spy.optimize.newton(lambda x: x**2, 1, maxiter=1)");
+  // Test ulab.scipy.signal: only a failing spectrogram call is checked for now
+  // TODO: Find a way to test sosfilt (disabled below), maybe in a future ulab release?
+  // assert_command_execution_succeeds(env, "spy.signal.sosfilt(np.array([1, 2, 3]), np.array([7, 8, 9]))");
+  assert_command_execution_fails(env, "spy.signal.spectrogram(np.array([1, 2, 3]), np.array([7, 8, 9]))");
+  // Test ulab.scipy.special: each function must accept one argument and reject two
+  assert_command_execution_succeeds(env, "spy.special.erf(1)");
+  assert_command_execution_fails(env, "spy.special.erf(1, 2)");
+  assert_command_execution_succeeds(env, "spy.special.erfc(1)");
+  assert_command_execution_fails(env, "spy.special.erfc(1, 2)");
+  assert_command_execution_succeeds(env, "spy.special.gamma(1)");
+  assert_command_execution_fails(env, "spy.special.gamma(1, 2)");
+  assert_command_execution_succeeds(env, "spy.special.gammaln(1)");
+  assert_command_execution_fails(env, "spy.special.gammaln(1, 2)");
+  deinit_environment();
+}
+
diff --git a/python/upgrade.md b/python/upgrade.md
index 80cb7c377..12cb3568b 100644
--- a/python/upgrade.md
+++ b/python/upgrade.md
@@ -1,14 +1,14 @@
-Steps to upgrade MicroPython:
-- Clone the micropython project and checkout the current version
-- Find the current patches and save them or make sure they were integrated to the next micropython version
-        git diff Path/to/epsilon/py Path/to/micropython/py
-- Checkout the new version in the micropython project
-- Copy the micropython py files in epsilon py folder
-- Update epsilon/python/Makefile and epsilon/python/port/genhdr/qstrdefs.in.h following the instructions in the files
-- Update other epsilon/python/port/genhdr/ files :
+# Steps to upgrade MicroPython
+
+- Clone the MicroPython project and checkout the current version
+- Find the current patches and save them or make sure they were integrated into the next MicroPython version
+        git diff Path/to/Upsilon/python/src/py Path/to/MicroPython/py
+- Checkout the new version in the MicroPython project
+- Copy the MicroPython py files into the Upsilon python/src/py folder
+- Update Upsilon/python/Makefile, Upsilon/python/port/genhdr/qstrdefs.in.h and Upsilon/python/port/genhdr/moduledefs.h following the instructions in the files
+- Update other Upsilon/python/port/genhdr/ files:
         Get a clean copy of MicroPython
         Copy our mpconfigport.h over the "bare-arm" port of MicroPython
         "make" the bare-arm port of MicroPython (don't worry if it doesn't finish)
-        Copy the wanted build/genhdr files to epsilon/python/port/genhdr/
+        Copy the wanted build/genhdr files to Upsilon/python/port/genhdr/
 - Put back the patches from the first step if needed
-