From 3cc05549d00e7f3207b9227bf4cc85f30c9b5012 Mon Sep 17 00:00:00 2001
From: Giovanna Lazzari Miotto <giovanna.lazzari.miotto@cern.ch>
Date: Thu, 20 Mar 2025 13:07:58 +0100
Subject: [PATCH] nmc,cmake: Include micron NMC headers

---
 CMakeLists.txt             |    2 +
 src/micron/nmc.h           |  246 ++++
 src/micron/nmc.hpp         | 1023 +++++++++++++
 src/micron/nmc_errno.h     |   54 +
 src/micron/nmc_sync.h      |   61 +
 src/micron/nmc_sync.hpp    |   55 +
 src/micron/nmc_te.hpp      | 1692 +++++++++++++++++++++
 src/micron/nmc_te_intrin.h | 2856 ++++++++++++++++++++++++++++++++++++
 src/micron/nmc_types.h     |  166 +++
 9 files changed, 6155 insertions(+)
 create mode 100644 src/micron/nmc.h
 create mode 100644 src/micron/nmc.hpp
 create mode 100644 src/micron/nmc_errno.h
 create mode 100644 src/micron/nmc_sync.h
 create mode 100644 src/micron/nmc_sync.hpp
 create mode 100644 src/micron/nmc_te.hpp
 create mode 100644 src/micron/nmc_te_intrin.h
 create mode 100644 src/micron/nmc_types.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6a253119..532175f3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -12,6 +12,8 @@ set(SCDAQ_INCL_DIRS
         ${SCDAQ_SOURCE_DIR}/json
         ${SCDAQ_SOURCE_DIR}/wzdma)
 
+#set(MICRON_INCL_DIRS /opt/micron/include)
+
 set(SCDAQ_CMAKE_MODS_DIR ${CMAKE_SOURCE_DIR}/cmake)
 
 set(CMAKE_CXX_STANDARD 17)
diff --git a/src/micron/nmc.h b/src/micron/nmc.h
new file mode 100644
index 00000000..d29ac595
--- /dev/null
+++ b/src/micron/nmc.h
@@ -0,0 +1,246 @@
+/*
+ * Copyright (C) 2024 Micron Technology, Inc.
+ *
+ * This file is the confidential and proprietary property of
+ * Micron Technology, Inc.
+ */
+
+#pragma once
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "nmc_errno.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _WIN32
+#pragma warning(disable:4099) // Ignore warning where type is declared as class in one place and struct in another
+#endif
+
+#include "nmc_types.h"		// pickup common types
+#include "nmc_sync.h"
+
+extern nmc_remote_host_t nmc_attach_remote_host(char *hostname);
+extern void nmc_detach_remote_host(nmc_remote_host_t host);
+
+extern nmc_attach_props_t nmc_attach_props_construct(void);
+extern void nmc_attach_props_destruct(nmc_attach_props_t props);
+
+extern nmc_remote_host_t nmc_ap_get_remote_host(nmc_attach_props_t props);
+extern void nmc_ap_set_remote_host(nmc_attach_props_t props, nmc_remote_host_t);
+
+extern uint32_t nmc_ap_get_device_id(nmc_attach_props_t props);
+extern void nmc_ap_set_device_id(nmc_attach_props_t props, uint32_t id);
+
+uint32_t nmc_ap_get_se_count(nmc_attach_props_t props);
+void nmc_ap_set_se_count(nmc_attach_props_t props, uint32_t count);
+
+uint32_t nmc_ap_get_te_count(nmc_attach_props_t props);
+void nmc_ap_set_te_count(nmc_attach_props_t props, uint32_t count);
+
+extern uint32_t nmc_ap_get_command_queue_count(nmc_attach_props_t props);
+extern void nmc_ap_set_command_queue_count(nmc_attach_props_t props, uint32_t count);
+
+extern uint32_t nmc_ap_get_te_total_thread_count(nmc_attach_props_t props);
+extern void nmc_ap_set_te_total_thread_count(nmc_attach_props_t props, uint32_t count);
+
+extern uint32_t nmc_ap_get_te_master_stack_size(nmc_attach_props_t props);
+extern void nmc_ap_set_te_master_stack_size(nmc_attach_props_t props, uint32_t size);
+
+extern uint32_t nmc_ap_get_te_fiber_stack_size(nmc_attach_props_t props);
+extern void nmc_ap_set_te_fiber_stack_size(nmc_attach_props_t props, uint32_t size);
+
+extern uint32_t nmc_ap_get_te_master_thread_count(nmc_attach_props_t props);
+extern void nmc_ap_set_te_master_thread_count(nmc_attach_props_t props, uint32_t count);
+
+extern nmc_stack_check_mode_t nmc_ap_get_te_stack_check_mode(nmc_attach_props_t props);
+extern void nmc_ap_set_te_stack_check_mode(nmc_attach_props_t props, EStackCheckMode mode);
+
+size_t nmc_ap_get_memory_bytes(nmc_attach_props_t props);
+void nmc_ap_set_memory_bytes(nmc_attach_props_t props, size_t size);
+
+/*
+ * Resource management
+ */
+
+#define NMC_ATTACH_PROPS_DEFAULT    ((nmc_attach_props_t)0)
+
+extern nmc_t nmc_construct(nmc_attach_props_t props, nmc_status_t *status);
+extern void nmc_destruct(nmc_t nmc);
+
+
+/*
+ * Query routines
+ */
+extern double nmc_attached_time_ns(nmc_t nmc);
+
+
+/*
+ * Memory management
+ */
+
+extern void *nmc_malloc(nmc_t nmc, size_t size);
+extern void *nmc_calloc(nmc_t nmc, size_t nmemb, size_t size);
+extern void *nmc_realloc(nmc_t nmc, void *ptr, size_t size);
+extern void *nmc_aligned_alloc(nmc_t nmc, size_t alignment, size_t size);
+extern void nmc_free(nmc_t nmc, void *ptr);
+extern void *nmc_mmap(nmc_t nmc, void *addr, size_t length, int prot, int flags);
+extern int nmc_munmap(nmc_t nmc, void *addr, size_t length);
+extern void nmc_alloc_break(size_t seqid);
+
+/*
+* Host User Command Properties
+*/
+extern nmc_cmd_props_t nmc_cmd_props_construct(uint16_t cmd_id);
+extern void nmc_cmd_props_destruct(nmc_cmd_props_t cmd_props);
+extern void nmc_cmd_props_set_cmd_id(nmc_cmd_props_t cmd_props, uint16_t cmd_id);
+extern void nmc_cmd_props_set_huq_id(nmc_cmd_props_t cmd_props, uint8_t huq_id);
+extern void nmc_cmd_props_set_cmd_atomic(nmc_cmd_props_t cmd_props, bool atomic);
+extern void nmc_cmd_props_set_arg1_fp(nmc_cmd_props_t cmd_props, bool arg_fp);
+extern void nmc_cmd_props_set_arg2_fp(nmc_cmd_props_t cmd_props, bool arg_fp);
+extern void nmc_cmd_props_set_arg3_fp(nmc_cmd_props_t cmd_props, bool arg_fp);
+extern void nmc_cmd_props_set_arg4_fp(nmc_cmd_props_t cmd_props, bool arg_fp);
+
+/*
+ * Memory Operations
+ */
+extern nmc_status_t nmc_mem_load(nmc_t nmc, nmc_cmd_props_t cmd_props,
+    void *pAddr, size_t size);
+extern nmc_status_t nmc_mem_store(nmc_t nmc, nmc_cmd_props_t cmd_props,
+    void *pAddr, uint64_t data, size_t size);
+
+extern nmc_status_t nmc_mem_copy(nmc_t nmc, nmc_cmd_props_t cmd_props, void *pDst,
+        void *pSrc, size_t size);
+
+extern nmc_status_t nmc_mem_set(nmc_t nmc, nmc_cmd_props_t cmd_props, void *pDst,
+        uint64_t elemData, size_t elemSize, size_t elemCnt);
+
+extern nmc_status_t nmc_mem_gather_stride(nmc_t nmc, nmc_cmd_props_t cmd_props, void *pDst,
+        void *pSrcBase, size_t elemSize, size_t elemStride, size_t elemCnt);
+
+extern nmc_status_t nmc_mem_gather_address(nmc_t nmc, nmc_cmd_props_t cmd_props, void *pDst,
+        void *pSrcAddr, size_t elemSize, size_t elemCnt);
+
+extern nmc_status_t nmc_mem_gather_offset(nmc_t nmc, nmc_cmd_props_t cmd_props,
+	void *pDst, void *pSrcOffset, void *pSrcBase, size_t elemSize,
+	size_t elemCnt);
+
+extern nmc_status_t nmc_mem_scatter_stride(nmc_t nmc, nmc_cmd_props_t cmd_props, void *pDstBase,
+        void *pSrc, size_t elemSize, size_t elemStride, size_t elemCnt);
+
+extern nmc_status_t nmc_mem_scatter_address(nmc_t nmc, nmc_cmd_props_t cmd_props, void *pDstAddr,
+        void *pSrc, size_t elemSize, size_t elemCnt);
+
+extern nmc_status_t nmc_mem_scatter_offset(nmc_t nmc, nmc_cmd_props_t cmd_props, void *pDstBase,
+        void *DstOffset, void *pSrc, size_t elemSize, size_t elemCnt);
+
+
+/*
+ * Image and symbol management
+ */
+
+extern nmc_status_t nmc_te_load(const char *pathname);
+extern void *nmc_te_lookup(const char *symname);
+
+extern nmc_status_t nmc_se_load(nmc_t nmc, const char *pathname);
+extern void *nmc_se_lookup(nmc_t nmc, const char *symname);
+
+/*
+* Command response
+*/
+extern nmc_response_t nmc_response_construct(void); /* (void): a true prototype when compiled as C */
+extern void nmc_response_destruct(nmc_response_t response);
+
+extern nmc_status_t nmc_get_response(nmc_t nmc, nmc_response_t response);
+extern nmc_status_t nmc_peek_response(nmc_t nmc, nmc_response_t response);
+extern nmc_status_t nmc_pop_response(nmc_t nmc);
+extern nmc_cmd_t nmc_response_get_cmd(nmc_response_t response);
+extern nmc_cid_t nmc_response_get_cmd_id(nmc_response_t response);
+extern nmc_status_t nmc_response_get_status(nmc_response_t response);
+
+extern void nmc_response_join(nmc_response_t response, nmc_cid_t *cid, uint64_t *arg1, uint64_t *arg2);
+extern void nmc_response_atomic(nmc_response_t response, nmc_cid_t *cid, uint64_t *arg);
+extern void nmc_response_mem_load(nmc_response_t response, nmc_cid_t *cid, uint64_t *data);
+extern void nmc_response_event_receive(nmc_response_t response, nmc_cid_t *cid, uint64_t *ev_data);
+extern void nmc_response_event_destination(nmc_response_t response, nmc_cid_t *cid, uint64_t *ev_dest);
+
+
+/*
+ * Thread creation
+ */
+
+extern nmc_status_t nmc_thread_create(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        void *entry, void *target, uint64_t arg1, uint64_t arg2, uint64_t arg3, uint64_t arg4);
+
+
+/*
+ * Events
+ */
+
+extern nmc_event_t nmc_event_allocate(nmc_t nmc, bool bData);
+extern nmc_status_t nmc_event_free(nmc_t nmc, nmc_event_t event);
+
+extern nmc_status_t nmc_event_destination(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        nmc_event_t evNum);
+extern nmc_status_t nmc_event_simple_mode(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        nmc_event_t evNum);
+extern nmc_status_t nmc_event_broadcast_mode(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        nmc_event_t evNum, uint16_t evChan);
+extern nmc_status_t nmc_event_collect_simple_mode(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        nmc_event_t evNum, uint16_t evCnt);
+extern nmc_status_t nmc_event_collect_reduce_mode(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        nmc_event_reduce_op_type_t evOpType, nmc_event_reduce_op_size_t evOpSize, 
+        nmc_event_t evNum, uint16_t evCnt, uint64_t data);
+extern nmc_status_t nmc_event_collect_cascade_mode(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        uint32_t deviceId, uint32_t cascadeQueueId, nmc_event_t evNum, uint16_t evCnt);
+extern nmc_status_t nmc_event_send(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        uint64_t evDest);
+extern nmc_status_t nmc_event_send_data(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        uint64_t evDest, uint64_t data);
+extern nmc_status_t nmc_event_broadcast(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        nmc_event_t evNum, uint16_t evChan);
+extern nmc_status_t nmc_event_broadcast_data(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        nmc_event_t evNum, uint16_t evChan, uint64_t data);
+extern nmc_status_t nmc_event_receive(nmc_t nmc, nmc_cmd_props_t cmd_props,
+        nmc_event_t evNum);
+
+/*
+ * Atomic operations
+ */
+
+extern nmc_status_t nmc_atomic_add(nmc_t nmc, nmc_cmd_props_t cmd_props,
+    void *pAddr, uint64_t data, size_t size);
+extern nmc_status_t nmc_atomic_xor(nmc_t nmc, nmc_cmd_props_t cmd_props,
+    void *pAddr, uint64_t data, size_t size);
+extern nmc_status_t nmc_atomic_and(nmc_t nmc, nmc_cmd_props_t cmd_props,
+    void *pAddr, uint64_t data, size_t size);
+extern nmc_status_t nmc_atomic_or(nmc_t nmc, nmc_cmd_props_t cmd_props,
+    void *pAddr, uint64_t data, size_t size);
+extern nmc_status_t nmc_atomic_min(nmc_t nmc, nmc_cmd_props_t cmd_props,
+    void *pAddr, uint64_t data, size_t size);
+extern nmc_status_t nmc_atomic_minu(nmc_t nmc, nmc_cmd_props_t cmd_props,
+    void *pAddr, uint64_t data, size_t size);
+extern nmc_status_t nmc_atomic_max(nmc_t nmc, nmc_cmd_props_t cmd_props,
+    void *pAddr, uint64_t data, size_t size);
+extern nmc_status_t nmc_atomic_maxu(nmc_t nmc, nmc_cmd_props_t cmd_props,
+    void *pAddr, uint64_t data, size_t size);
+extern nmc_status_t nmc_atomic_fadd32(nmc_t nmc, nmc_cmd_props_t cmd_props,
+	void *pAddr, float data);
+extern nmc_status_t nmc_atomic_fmin32(nmc_t nmc, nmc_cmd_props_t cmd_props,
+	void *pAddr, float data);
+extern nmc_status_t nmc_atomic_fmax32(nmc_t nmc, nmc_cmd_props_t cmd_props,
+	void *pAddr, float data);
+extern nmc_status_t nmc_atomic_fadd64(nmc_t nmc, nmc_cmd_props_t cmd_props,
+	void *pAddr, double data);
+extern nmc_status_t nmc_atomic_fmin64(nmc_t nmc, nmc_cmd_props_t cmd_props,
+	void *pAddr, double data);
+extern nmc_status_t nmc_atomic_fmax64(nmc_t nmc, nmc_cmd_props_t cmd_props,
+	void *pAddr, double data);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/micron/nmc.hpp b/src/micron/nmc.hpp
new file mode 100644
index 00000000..b6e5c99f
--- /dev/null
+++ b/src/micron/nmc.hpp
@@ -0,0 +1,1023 @@
+//
+// Copyright (C) 2024 Micron Technology, Inc.
+//
+// This file is the confidential and proprietary property of
+// Micron Technology, Inc.
+//
+
+#pragma once
+
+#include <cstring>
+#include <type_traits>
+#include "nmc.h"
+#include "nmc_sync.hpp"
+
+class NmcAttachProps {
+    friend class Nmc;
+public:
+    /// @brief Construct an empty attach properties object
+    ///
+    /// Used to specify hardware resources in a Near Memory Compute Device
+    /// attach domain. The object is initialized to the device defaults and the
+    /// programmer can update only the desired properties.
+    ///
+    /// The attach property object is also used to query the Near Memory Compute Device
+    /// properties after a successful attach. During attach, these fields are updated
+    /// to match what was allocated.
+    ///
+    /// @see Example:
+    ///  To query default Transformation Engine thread count
+    ///  @code
+    ///    NmcAttachProps nmcProps;
+    ///    nmc_status_t status = NMC_NO_MEM;
+    ///    Nmc *pNmc = new Nmc(nmcProps, &status);
+    ///
+    ///    uint32_t teThreadCount = nmcProps.getTeTotalThreadCount();
+    ///  @endcode
+    NmcAttachProps() : m_opaque(nmc_attach_props_construct()) {}
+
+    /// @brief Not copyable: a copy would share m_opaque and destruct it twice
+    NmcAttachProps(NmcAttachProps const &) = delete;
+    NmcAttachProps &operator=(NmcAttachProps const &) = delete;
+
+    /// @brief Destroy an attach properties object
+    ~NmcAttachProps() { nmc_attach_props_destruct(m_opaque); }
+
+    /// @brief Return device ID
+    ///
+    /// @return Device ID
+    uint32_t getDeviceId() const {
+        return nmc_ap_get_device_id(m_opaque);
+    }
+
+    /// @brief Set device ID
+    ///
+    /// @param[in] id ID of device to request
+    void setDeviceId(uint32_t id) {
+        nmc_ap_set_device_id(m_opaque, id);
+    }
+
+    /// @brief Return number of SE compute elements
+    ///
+    /// @return Count of SE elements
+    uint32_t getSeCount() const {
+        return nmc_ap_get_se_count(m_opaque);
+    }
+
+    /// @brief Set number of SE compute elements
+    ///
+    /// @param[in] seCnt Number of SE compute elements
+    void setSeCount(uint32_t seCnt) {
+        nmc_ap_set_se_count(m_opaque, seCnt);
+    }
+
+    /// @brief Return number of TE compute elements
+    ///
+    /// @return Count of TE elements
+    uint32_t getTeCount() const {
+        return nmc_ap_get_te_count(m_opaque);
+    }
+
+    /// @brief Set number of TE compute elements
+    ///
+    /// @param[in] teCnt Number of TE compute elements
+    void setTeCount(uint32_t teCnt) {
+        nmc_ap_set_te_count(m_opaque, teCnt);
+    }
+
+    /// @brief Return number of TE threads (masters and fibers)
+    ///
+    /// @return Count of total threads
+    uint32_t getTeTotalThreadCount() const {
+        return nmc_ap_get_te_total_thread_count(m_opaque);
+    }
+
+    /// @brief Set number of TE threads (masters and fibers)
+    ///
+    /// @param[in] threadCnt Number of total threads
+    void setTeTotalThreadCount(uint32_t threadCnt) {
+        nmc_ap_set_te_total_thread_count(m_opaque, threadCnt);
+    }
+
+    /// @brief Return count of command queues in attach parameters
+    ///
+    /// @return Command queue count
+    uint32_t getCommandQueueCount() const {
+        return nmc_ap_get_command_queue_count(m_opaque);
+    }
+
+    /// @brief Set number of command queues to request for an Nmc::attach()
+    ///
+    /// @param[in] commandQueueCount Command queue count
+    void setCommandQueueCount(uint32_t commandQueueCount) {
+        nmc_ap_set_command_queue_count(m_opaque, commandQueueCount);
+    }
+
+    /// @brief Return size of stack for TE master threads
+    ///
+    /// @return Stack size in bytes
+    uint32_t getTeMasterStackSize() const {
+        return nmc_ap_get_te_master_stack_size(m_opaque);
+    }
+
+    /// @brief Set size of TE master thread stack for an Nmc::attach()
+    ///
+    /// @param[in] teMasterStackSize Size in bytes
+    void setTeMasterStackSize(uint32_t teMasterStackSize) {
+        nmc_ap_set_te_master_stack_size(m_opaque, teMasterStackSize);
+    }
+
+    /// @brief Return size of stack for TE fibers
+    ///
+    /// @return  Stack size in bytes
+    uint32_t getTeFiberStackSize() const {
+        return nmc_ap_get_te_fiber_stack_size(m_opaque);
+    }
+
+    /// @brief Set size of TE fiber stack for an Nmc::attach()
+    ///
+    /// @param[in] teFiberStackSize Size in bytes
+    void setTeFiberStackSize(uint32_t teFiberStackSize) {
+        nmc_ap_set_te_fiber_stack_size(m_opaque, teFiberStackSize);
+    }
+
+    /// @brief Return maximum number of TE master threads
+    ///
+    /// @return Count of available master threads
+    uint32_t getTeMasterThreadCount() const {
+        return nmc_ap_get_te_master_thread_count(m_opaque);
+    }
+
+    /// @brief Set number of master TE threads to request for an Nmc::attach()
+    ///
+    /// @param[in] teMasterThreadCount Number of threads
+    void setTeMasterThreadCount(uint32_t teMasterThreadCount) {
+        nmc_ap_set_te_master_thread_count(m_opaque, teMasterThreadCount);
+    }
+
+    /// @brief Return checking mode for TE stack access
+    ///
+    /// @return Stack access mode
+    EStackCheckMode getTeStackCheckMode() const {
+        return nmc_ap_get_te_stack_check_mode(m_opaque);
+    }
+
+    /// @brief Set access mode for TE stacks
+    ///
+    /// @param[in] mode Stack access mode
+    void setTeStackCheckMode(EStackCheckMode mode) {
+        nmc_ap_set_te_stack_check_mode(m_opaque, mode);
+    }
+
+    /// @brief Return reserved memory amount
+    ///
+    /// @return Amount of reserved memory in bytes
+    size_t getMemoryBytes() const {
+        return nmc_ap_get_memory_bytes(m_opaque);
+    }
+
+    /// @brief Set amount of memory to reserve
+    ///
+    /// @param[in] size Amount of memory to reserve in bytes
+    void setMemoryBytes(size_t size) {
+        nmc_ap_set_memory_bytes(m_opaque, size);
+    }
+
+private:
+    nmc_attach_props_t m_opaque;
+};
+
+/// @brief Owns an nmc_response_t handle and unpacks the values reported through it
+class NmcResponse {
+    friend class Nmc;
+public:
+    NmcResponse() : m_opaque(nmc_response_construct()) {}
+    ~NmcResponse() { nmc_response_destruct(m_opaque); }
+
+    /// Not copyable: a copy would share m_opaque and hand it to nmc_response_destruct() twice
+    NmcResponse(NmcResponse const &) = delete;
+    NmcResponse &operator=(NmcResponse const &) = delete;
+
+    ENmcCmd getCmd() const { return nmc_response_get_cmd(m_opaque); }
+    nmc_cid_t getCmdId() const { return nmc_response_get_cmd_id(m_opaque); }
+    nmc_status_t getStatus() const { return nmc_response_get_status(m_opaque); }
+
+    /// @brief Unpack a join response; each non-null output receives sizeof(T) bytes of its 64-bit value
+    template<typename T1 = uint64_t, typename T2 = uint64_t>
+    void threadJoin(nmc_cid_t* pCmdId, T1* _arg1 = nullptr, T2* _arg2 = nullptr) {
+        static_assert(sizeof(T1) <= sizeof(uint64_t) && sizeof(T2) <= sizeof(uint64_t),
+            "threadJoin outputs must be 64 bits or less");  // memcpy below reads sizeof(T) bytes of a uint64_t
+        uint64_t arg1 = 0, arg2 = 0;  // defined contents even if the call leaves them untouched
+        nmc_response_join(m_opaque, pCmdId, &arg1, &arg2);
+        if (_arg1) memcpy(_arg1, &arg1, sizeof(T1));
+        if (_arg2) memcpy(_arg2, &arg2, sizeof(T2));
+    }
+
+    /// @brief Unpack an atomic response; a non-null _arg receives sizeof(T) bytes of the 64-bit value
+    template<typename T>
+    void atomic(nmc_cid_t* pCmdId, T* _arg) {
+        static_assert(sizeof(T) <= sizeof(uint64_t), "atomic output must be 64 bits or less");
+        uint64_t arg = 0;
+        nmc_response_atomic(m_opaque, pCmdId, &arg);
+        if (_arg) memcpy(_arg, &arg, sizeof(T));
+    }
+
+    /// @brief Unpack a memory-load response; a non-null pData receives sizeof(T) bytes of the 64-bit value
+    template<typename T>
+    void memLoad(nmc_cid_t* pCmdId, T* pData) {
+        static_assert(sizeof(T) <= sizeof(uint64_t), "memLoad output must be 64 bits or less");
+        uint64_t data = 0;
+        nmc_response_mem_load(m_opaque, pCmdId, &data);
+        if (pData) memcpy(pData, &data, sizeof(T));
+    }
+
+    void eventReceive(nmc_cid_t* pCmdId, uint64_t* pEvData) {
+        nmc_response_event_receive(m_opaque, pCmdId, pEvData);
+    }
+
+    void eventDestination(nmc_cid_t* pCmdId, uint64_t* pEvDest) {
+        nmc_response_event_destination(m_opaque, pCmdId, pEvDest);
+    }
+
+private:
+    nmc_response_t m_opaque;
+};
+
+#if 0
+///
+/// Near Memory Compute Mutex
+///
+class NmcMutex {
+    NmcMutex();
+    ~NmcMutex();
+
+    int trylock();
+    int lock();
+    int unlock();
+};
+
+
+class NmcBarrier {
+    friend class Nmc;
+
+private:
+    NmcBarrier(Nmc& nmc) : m_opaque(nmc.m_opaque) {
+        m_barrier = nmc_barrier_create(m_opaque);
+    }
+
+    nmc_t   m_opaque;
+    nmc_barrier_t m_barrier;
+
+public:
+    ~NmcBarrier() {
+        nmc_barrier_destroy(m_opaque, m_barrier);
+    }
+    nmc_status_t add(uint32_t threads) {
+        return nmc_barrier_add(m_opaque, m_barrier, threads);
+    }
+};
+
+#endif
+
+class NmcCmdProps {
+    friend class Nmc;
+public:
+    /// @brief Construct a command properties object
+    ///
+    /// NmcCmdProps is used to customize commands on the NMC
+    /// It is currently used for:
+    ///  - specifying the CmdId for matching commands on the response side
+    ///  - Configuring Atomic and Fencing operations
+    ///  - transferring type information into type commands
+    ///
+    NmcCmdProps(uint16_t cmdId = 0) : m_opaque(nmc_cmd_props_construct(cmdId)) {}
+
+    /// @brief Not copyable: a copy would share m_opaque and destruct it twice
+    NmcCmdProps(NmcCmdProps const &) = delete;
+    NmcCmdProps &operator=(NmcCmdProps const &) = delete;
+
+    /// @brief Destroy a command properties object
+    ~NmcCmdProps() { nmc_cmd_props_destruct(m_opaque); }
+
+    void setCmdId(uint16_t cmdId) {
+        nmc_cmd_props_set_cmd_id(m_opaque, cmdId);
+    }
+    void setHuqId(uint8_t huqId) {
+        nmc_cmd_props_set_huq_id(m_opaque, huqId);
+    }
+    void setCmdAtomic(bool bAtomic) {
+        nmc_cmd_props_set_cmd_atomic(m_opaque, bAtomic);
+    }
+
+    /// @brief Set argument 1 as a floating point value
+    ///
+    /// @param[in] argFp Whether argument 1 is a floating point value
+    void setArg1Fp(bool argFp) {
+        nmc_cmd_props_set_arg1_fp(m_opaque, argFp);
+    }
+
+    /// @brief Set argument 2 as a floating point value
+    ///
+    /// @param[in] argFp Whether argument 2 is a floating point value
+    void setArg2Fp(bool argFp) {
+        nmc_cmd_props_set_arg2_fp(m_opaque, argFp);
+    }
+
+    /// @brief Set argument 3 as a floating point value
+    ///
+    /// @param[in] argFp Whether argument 3 is a floating point value
+    void setArg3Fp(bool argFp) {
+        nmc_cmd_props_set_arg3_fp(m_opaque, argFp);
+    }
+
+    /// @brief Set argument 4 as a floating point value
+    ///
+    /// @param[in] argFp Whether argument 4 is a floating point value
+    void setArg4Fp(bool argFp) {
+        nmc_cmd_props_set_arg4_fp(m_opaque, argFp);
+    }
+
+private:
+    nmc_cmd_props_t m_opaque;
+};
+
+class NmcCmdId : public NmcCmdProps {
+public:
+    /// @brief Construct a command properties object that carries only a command id
+    ///
+    /// The NmcCmdProps base is default-constructed, then setCmdId(cmdId) is applied
+    ///
+    NmcCmdId(uint16_t cmdId) {
+        setCmdId(cmdId);
+    }
+};
+
+class NmcHuqId : public NmcCmdProps {
+public:
+    /// @brief Construct a command properties object that carries only a huq id
+    ///
+    /// NmcHuqId is an NmcCmdProps object that only sets the huqId (via setHuqId())
+    ///
+    NmcHuqId(uint8_t huqId) {
+        setHuqId(huqId);
+    }
+};
+
+class Nmc {
+
+public:
+    /// @brief Constructs an instance of the Nmc class.
+    ///
+    /// Upon creation, will attempt to reserve the Near Memory Compute
+    /// resources specified in attachProps (or those of a default-constructed
+    /// NmcAttachProps if it is omitted). The returned allocation will include
+    /// memory and compute elements from a single NMC device even if
+    /// multiple devices are present.
+    ///
+    /// @param[out] pStatus Returns zero on success, or a non-zero error code
+    ///     otherwise. Callers should verify success prior to invoking any
+    ///     other methods of the class (except ~Nmc(), which is always permitted).
+    ///
+    /// @param[in] attachProps Optional argument specifying the requested resources
+    ///     (see class NmcAttachProps). If omitted, a default-constructed NmcAttachProps is
+    ///     used, requesting a default allocation consisting of an entire NMC device.
+    Nmc(nmc_status_t *pStatus = nullptr) : Nmc(NmcAttachProps{}, pStatus) {}
+
+    Nmc(NmcAttachProps const &attachProps, nmc_status_t *pStatus = nullptr) {
+        m_opaque = nmc_construct(attachProps.m_opaque, pStatus);
+    }
+
+    /// @brief Release Near Memory Compute resources.
+    ///
+    /// Releases any reserved Near Memory Compute resources and memory. Any
+    /// threads executing within the NMC attach domain will be terminated.
+    ///
+    ~Nmc() {
+        nmc_destruct(m_opaque);
+    }
+
+    /// @brief Return time since attach.
+    ///
+    /// @return Time in nanoseconds since successful allocation of a Near
+    /// Memory Compute attach domain.
+    double getAttachedTimeNs() {
+        return nmc_attached_time_ns(m_opaque);
+    }
+
+    /// @brief Allocates memory on a device within the NMC attach domain
+    ///
+    /// @param[in] size in bytes of the memory to be allocated
+    ///
+    /// @return pointer to the newly allocated memory or nullptr on failure
+    void* malloc(size_t size) {
+        return nmc_malloc(m_opaque, size);
+    }
+
+    /// @brief Allocates memory on a device within the NMC attach domain
+    ///
+    /// Allocates a block of memory for an array of nmemb elements, each of them size bytes long,
+    /// and initializes all its bits to zero.
+    ///
+    /// @param[in] nmemb  number of elements to allocate.
+    /// @param[in] size  size of each element in bytes
+    /// @return  pointer to the newly allocated memory or nullptr on failure
+    void* calloc(size_t nmemb, size_t size) {
+        return nmc_calloc(m_opaque, nmemb, size);
+    }
+
+    ///
+    /// @brief resize a memory allocation
+    ///
+    /// Resizes a memory allocation pointed to by \p ptr to \p size bytes.
+    /// If \p ptr is nullptr, this function behaves like malloc.
+    /// If \p ptr is not nullptr, the returned allocation will contain the data in
+    /// the original allocation up to the lesser of the original allocation size and \p size.
+    /// If \p size is larger than the original allocation, the excess bytes are uninitialized.
+    ///
+    /// @param[in] ptr pointer to the memory to be reallocated
+    /// @param[in] size size of the new array
+    /// @return pointer to newly reallocated memory or nullptr on failure
+    void* realloc(void* ptr, size_t size) {
+        return nmc_realloc(m_opaque, ptr, size);
+    }
+
+    ///
+    /// @brief allocate an aligned memory block
+    ///
+    /// Allocates \p size bytes of memory that are aligned to at least \p alignment
+    ///
+    /// @param[in] alignment the alignment granularity
+    /// @param[in] size the number of bytes to allocate
+    /// @return a pointer to the new allocation or nullptr if the memory cannot be allocated
+    void* alignedAlloc(size_t alignment, size_t size) {
+        return nmc_aligned_alloc(m_opaque, alignment, size);
+    }
+
+    /// @brief free allocated memory
+    ///
+    /// Frees the memory pointed to by \p ptr, which must have been returned
+    /// by a previous call of Nmc::malloc(), Nmc::calloc(), or
+    /// Nmc::alignedAlloc(). Otherwise, or if Nmc::free(ptr) has already been
+    /// called, undefined behavior occurs. If \p ptr is a nullptr, no operation
+    /// is performed.
+    ///
+    /// @param[in] ptr Address of memory to free
+    void free(void* ptr) {
+        nmc_free(m_opaque, ptr);
+    }
+
+    /// @brief map NMC device memory into address space
+    ///
+    /// Creates a new mapping in the virtual address space of the calling
+    /// process. The starting address for the new mapping is specified in
+    /// \p addr. The \p length argument specifies the length of the mapping.
+    ///
+    /// The \p prot argument is as defined for mmap(2), with the exception
+    /// that PROT_EXEC will be ignored.
+    ///
+    /// The \p flags argument is as defined for mmap(2), with the exception
+    /// that MAP_PRIVATE and MAP_ANONYMOUS are assumed for all mappings.
+    ///
+    /// @param[in] addr Starting address for mapping; must be a multiple of the page size
+    /// @param[in] length Length for the mapping
+    /// @param[in] prot Memory protection mode; see mmap(2)
+    /// @param[in] flags Access flags; see mmap(2)
+    /// @return Pointer to mapped area or MAP_FAILED; errno set on failure
+    void* mmap(void* addr, size_t length, int prot, int flags) {
+        return nmc_mmap(m_opaque, addr, length, prot, flags);
+    }
+
+    /// @brief delete mappings for specified address range
+    ///
+    /// Deletes the mappings for a specified address range and causes any further references
+    /// to addresses within the range to generate invalid memory references.
+    ///
+    /// @param[in] addr Starting address of mapping; must have been returned from a prior call to Nmc::mmap()
+    /// @param[in] length Length of the mapping
+    /// @return Zero on success; returns -1 and sets errno on failure
+    int munmap(void* addr, size_t length) {
+        return nmc_munmap(m_opaque, addr, length);
+    }
+
+    ///
+    /// @brief break on allocated memory block
+    ///
+    /// Identifies a memory block to break on when allocated to identify memory leak source
+    ///
+    /// @param[in] seqId memory block identifier obtained from previous run of application
+    static void allocBreak(size_t seqId) {
+        nmc_alloc_break(seqId);
+    }
+
+    /// @brief Issue load request
+    ///
+    /// @tparam T Size of data to load
+    /// @param[in] cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param[in] pAddr Address to load from
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure
+    template <typename T>
+    nmc_status_t memLoad(NmcCmdProps const &cmdProps, T* pAddr) {
+        return nmc_mem_load(m_opaque, cmdProps.m_opaque, pAddr, sizeof(T));
+    }
+
+    /// @brief Pack argument into 64 bit payload
+    ///
+    /// Helper function used in threadCreate for taking an input argument
+    /// and packing it into a 64 bit payload to send to the device.
+    /// Only works for arithmetic and pointer types.
+    ///
+    template<typename T>
+    uint64_t extendArg(T arg)
+    {
+        static_assert((std::is_arithmetic<T>::value || std::is_pointer<T>::value) &&
+            sizeof(T) <= sizeof(uint64_t), "extendArg expects numeric/pointer inputs 64 bits or less");
+        if (!std::is_floating_point<T>::value)
+            return std::is_signed<T>::value ? (int64_t)arg : (uint64_t)arg;  // integers and pointers: plain cast
+        // Floating point: copy the raw bytes into a zeroed 64-bit payload
+        uint64_t payload = 0;
+        memcpy(&payload, &arg, sizeof(T));
+        return payload;
+    }
+
+    /// @brief Store data in NMC memory
+    ///
+    /// @tparam T Size of data to store
+    /// @param[in] cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param[in] pAddr Address to store to
+    /// @param[in] data Value to store
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure
+    template <typename T>
+    nmc_status_t memStore(NmcCmdProps const& cmdProps, T* pAddr, T _data) {
+        // extendArg() packs the value into the 64-bit payload that nmc_mem_store() takes
+        return nmc_mem_store(m_opaque, cmdProps.m_opaque, pAddr, extendArg(_data), sizeof(T));
+    }
+
+    /// @brief Copy data
+    ///
+    /// Copies data between host memory and NMC memory.
+    ///
+    /// @param[in] cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param[in] pDst Destination address; may be either host or NMC memory.
+    /// @param[in] pSrc Source address; may be either host or NMC memory.
+    /// @param[in] size in bytes (1, 2, 4, or 8)
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure
+    nmc_status_t memCopy(NmcCmdProps const &cmdProps, void* pDst, void* pSrc, size_t size) {
+        return nmc_mem_copy(m_opaque, cmdProps.m_opaque, pDst, pSrc, size);
+    }
+
+    /// @brief Set NMC memory
+    ///
+    /// @tparam T Element type; sizeof(T) is passed as the element size
+    /// @param[in] cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param[in] pDst Start address of memory region to operate on
+    /// @param[in] elemData Value to store; packed with extendArg() so floating point keeps its bit pattern
+    /// @param[in] elemSize Unused; the element size is always sizeof(T)
+    /// @param[in] elemCnt Number of elements of size T
+    /// @return NMC_SUCCESS or error number indicating reason for failure
+    template <typename T>
+    nmc_status_t memSet(NmcCmdProps const &cmdProps, void* pDst,
+        T elemData, size_t elemSize, size_t elemCnt) {
+        return nmc_mem_set(m_opaque, cmdProps.m_opaque, pDst, extendArg(elemData), sizeof(T), elemCnt);
+    }
+
+    /// @brief Perform stride-based gather operation
+    ///
+    /// @param[in] cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param[in] pDst Destination address
+    /// @param[in] pSrcBase Base source address; assumed to be NMC memory
+    /// @param[in] elemSize Size of each element in bytes
+    /// @param[in] elemStride Distance between consecutive elements (NOTE(review): unit, bytes or elements, is not stated here - confirm)
+    /// @param[in] elemCnt Number of elements to gather
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure (the status of nmc_mem_gather_stride() is returned as-is)
+    nmc_status_t memGather(NmcCmdProps const &cmdProps, void* pDst, void* pSrcBase,
+        size_t elemSize, size_t elemStride, size_t elemCnt) {
+        return nmc_mem_gather_stride(m_opaque, cmdProps.m_opaque, pDst, pSrcBase, elemSize, elemStride, elemCnt);
+    }
+
+    /// @brief Perform address-based gather operation
+    ///
+    /// @param[in] cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param[in] pDst Destination address
+    /// @param[in] pSrcAddr Source address argument handed to nmc_mem_gather_address() (NOTE(review): was documented under the name pSrcBase as "Base source address; assumed to be NMC memory"; presumably a list of element addresses - confirm)
+    /// @param[in] elemSize Size of each element in bytes
+    /// @param[in] elemCnt Number of elements to gather
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure (the status of nmc_mem_gather_address() is returned as-is)
+    nmc_status_t memGather(NmcCmdProps const &cmdProps, void* pDst, void* pSrcAddr,
+        size_t elemSize, size_t elemCnt) {
+        return nmc_mem_gather_address(m_opaque, cmdProps.m_opaque, pDst, pSrcAddr, elemSize, elemCnt);
+    }
+
+    /// @brief Perform an offset-based gather operation
+    ///
+    /// @param[in] cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param[in] pDst Destination address
+    /// @param[in] pSrcBase Base source address; assumed to be NMC memory
+    /// @param[in] pSrcOffset Offset argument handed to nmc_mem_gather_offset() (NOTE(review): presumably an array of per-element offsets from pSrcBase; element type and units are not visible here - confirm)
+    /// @param[in] elemSize Size of each element in bytes
+    /// @param[in] elemCnt Number of elements to gather
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure (the status of nmc_mem_gather_offset() is returned as-is)
+    nmc_status_t memGather(NmcCmdProps const &cmdProps, void* pDst, void* pSrcBase,
+        void* pSrcOffset, size_t elemSize, size_t elemCnt) {
+        return nmc_mem_gather_offset(m_opaque, cmdProps.m_opaque, pDst, pSrcBase, pSrcOffset, elemSize, elemCnt);
+    }
+
+    /// @brief Perform stride-based scatter operation
+    ///
+    /// @param[in] cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param[in] pDstBase Base destination address (NOTE(review): presumably NMC memory, mirroring the gather calls - confirm)
+    /// @param[in] pSrc Source address
+    /// @param[in] elemSize Size of each element in bytes
+    /// @param[in] elemStride Distance between consecutive elements (NOTE(review): unit, bytes or elements, is not stated here - confirm)
+    /// @param[in] elemCnt Number of elements to scatter
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure (the status of nmc_mem_scatter_stride() is returned as-is)
+    nmc_status_t memScatter(NmcCmdProps const &cmdProps, void* pDstBase, void* pSrc,
+        size_t elemSize, size_t elemStride, size_t elemCnt) {
+        return nmc_mem_scatter_stride(m_opaque, cmdProps.m_opaque, pDstBase, pSrc, elemSize, elemStride, elemCnt);
+    }
+
+    /// @brief Perform address-based scatter operation
+    ///
+    /// @param[in] cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param[in] pDstAddr Destination address argument handed to nmc_mem_scatter_address() (NOTE(review): presumably a list of element addresses - confirm)
+    /// @param[in] pSrc Source address
+    /// @param[in] elemSize Size of each element in bytes
+    /// @param[in] elemCnt Number of elements to scatter
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure (the status of nmc_mem_scatter_address() is returned as-is)
+    nmc_status_t memScatter(NmcCmdProps const &cmdProps, void* pDstAddr, void* pSrc,
+        size_t elemSize, size_t elemCnt) {
+        return nmc_mem_scatter_address(m_opaque, cmdProps.m_opaque, pDstAddr, pSrc, elemSize, elemCnt);
+    }
+
+    /// @brief Perform an offset-based scatter operation
+    ///
+    /// @param[in] cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param[in] pDstBase Base destination address (NOTE(review): presumably NMC memory, mirroring the gather calls - confirm)
+    /// @param[in] pDstOffset Offset argument handed to nmc_mem_scatter_offset() (NOTE(review): presumably an array of per-element offsets from pDstBase - confirm)
+    /// @param[in] pSrc Source address
+    /// @param[in] elemSize Size of each element in bytes
+    /// @param[in] elemCnt Number of elements to scatter
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure (the status of nmc_mem_scatter_offset() is returned as-is)
+    nmc_status_t memScatter(NmcCmdProps const &cmdProps, void* pDstBase, void* pDstOffset,
+        void* pSrc, size_t elemSize, size_t elemCnt) {
+        return nmc_mem_scatter_offset(m_opaque, cmdProps.m_opaque, pDstBase, pDstOffset, pSrc, elemSize, elemCnt);
+    }
+
+    /// @brief Load an image into all attached TEs.
+    ///
+    /// If you are allocating TEs, you must call teLoad before allocating
+    /// an Nmc object; it is static because nmc_te_load() takes no Nmc handle. Example:
+    ///
+    ///		status = Nmc::teLoad("foobar.r5");
+    ///
+    /// @param[in] pathname Absolute or relative path to image file
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure
+    static nmc_status_t teLoad(const char* pathname) {
+        return nmc_te_load(pathname);
+    }
+
+    /// @brief Look up the address of a TE entry point
+    ///
+    /// @param[in] symname Name of the symbol to look up in the image
+    ///
+    /// @return A nullptr on failure, or the address of the entry point for the named symbol (the result of nmc_te_lookup(), unchanged).
+    static void* teLookup(const char* symname) {
+        return nmc_te_lookup(symname);
+    }
+
+    /// @brief Load an image into all attached SEs
+    ///
+    /// @param[in] pathname Absolute or relative path to image file
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure (the status of nmc_se_load(), called with this object's handle)
+    nmc_status_t seLoad(const char* pathname) {
+        return nmc_se_load(m_opaque, pathname);
+    }
+
+    /// @brief Look up the address of an SE entry point
+    ///
+    /// @param[in] symname Symbolic name of entry
+    ///
+    /// @return A nullptr on failure, or the address of the entry point for the named symbol (the result of nmc_se_lookup(), called with this object's handle).
+    void* seLookup(const char* symname) {
+        return nmc_se_lookup(m_opaque, symname);
+    }
+
+    /// @brief Create a thread on a NMC processing element
+    ///
+    /// Starts a new thread of execution on an NMC device in the current attach domain, on the attached
+    /// device that hosts the memory given by pTarget. Up to four arguments may be passed; each one is
+    /// widened to 64 bits with extendArg() before being handed to nmc_thread_create().
+    ///
+    /// @tparam T1 Type of _arg1 (8, 16, 32, or 64-bit integer; signed or unsigned); whether it is floating point is recorded via setArg1Fp()
+    /// @tparam T2 Type of _arg2 (same rules as T1; recorded via setArg2Fp())
+    /// @tparam T3 Type of _arg3 (same rules as T1; recorded via setArg3Fp())
+    /// @tparam T4 Type of _arg4 (same rules as T1; recorded via setArg4Fp())
+    /// @param[in,out] cmdProps Command Properties; used to specify id, atomic, and fencing operations. Its four floating-point argument flags are overwritten here.
+    /// @param[in] pFunc Entry point for new thread
+    /// @param[in] pTarget Address of data to operate on
+    /// @param[in] _arg1 Optional argument to pFunc
+    /// @param[in] _arg2 Optional argument to pFunc
+    /// @param[in] _arg3 Optional argument to pFunc
+    /// @param[in] _arg4 Optional argument to pFunc
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure
+    template<typename T1 = uint64_t,
+        typename T2 = uint64_t,
+        typename T3 = uint64_t,
+        typename T4 = uint64_t>
+        nmc_status_t threadCreate(NmcCmdProps& cmdProps,
+            void *pFunc, void *pTarget,
+            T1 _arg1 = 0, T2 _arg2 = 0, T3 _arg3 = 0, T4 _arg4 = 0)
+    {
+        // For each argument: flag whether its type is floating point, then widen it to a 64-bit word.
+        cmdProps.setArg1Fp(std::is_floating_point<T1>::value);
+        uint64_t wide1 = extendArg(_arg1);
+        cmdProps.setArg2Fp(std::is_floating_point<T2>::value);
+        uint64_t wide2 = extendArg(_arg2);
+        cmdProps.setArg3Fp(std::is_floating_point<T3>::value);
+        uint64_t wide3 = extendArg(_arg3);
+        cmdProps.setArg4Fp(std::is_floating_point<T4>::value);
+        uint64_t wide4 = extendArg(_arg4);
+        return nmc_thread_create(m_opaque, cmdProps.m_opaque, pFunc, pTarget, wide1, wide2, wide3, wide4);
+    }
+
+    ///
+    /// @brief Overload threadCreate for Rvalues
+    ///
+    /// This overload enables calling threadCreate with a temporary NmcCmdProps object
+    /// (e.g. nmc.threadCreate(NmcCmdId(1), ...)). This is needed because the thread create
+    /// call will update the command properties to mark which arguments are floating point,
+    /// so the base function can no longer take them by const reference.
+    ///
+    template <typename T1 = uint64_t,
+        typename T2 = uint64_t,
+        typename T3 = uint64_t,
+        typename T4 = uint64_t>
+        nmc_status_t threadCreate(NmcCmdProps&& cmdProps,
+            void* pFunc, void* pTarget,
+            T1 _arg1 = 0, T2 _arg2 = 0, T3 _arg3 = 0, T4 _arg4 = 0) {
+        return threadCreate(cmdProps, pFunc, pTarget, _arg1, _arg2, _arg3, _arg4); // cmdProps is a named variable (an lvalue) here, so this selects the NmcCmdProps& overload
+    }
+
+    /// @brief Read one completion response from the response queue
+    ///
+    /// @param pResp pointer to the NmcResponse object to receive into; must not be null (it is dereferenced unconditionally)
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure (the status of nmc_get_response() is returned as-is)
+    nmc_status_t recvResponse(NmcResponse* pResp) {
+        return nmc_get_response(m_opaque, pResp->m_opaque);
+    }
+
+    /// @brief Peek at next completion response from the response queue
+    ///
+    /// @param pResp pointer to the NmcResponse object to receive into; must not be null (it is dereferenced unconditionally)
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure (the status of nmc_peek_response() is returned as-is)
+    nmc_status_t peekResponse(NmcResponse* pResp) {
+        return nmc_peek_response(m_opaque, pResp->m_opaque);
+    }
+
+    /// @brief Pop one completion response from the response queue
+    ///
+    /// @return NMC_SUCCESS or error number indicating reason for failure (the status of nmc_pop_response() is returned as-is)
+    nmc_status_t popResponse() {
+        return nmc_pop_response(m_opaque);
+    }
+
+    // Events
+
+    ///
+    /// @brief Allocate a new event
+    ///
+    /// Reserves an event within the nmc attach domain (forwards to nmc_event_allocate() with this object's handle)
+    ///
+    /// @param bData Whether the event includes data, which must be a uint64_t
+    /// @return an nmc_event_t, regarded as an opaque event container.
+    ///         Use isEventValid to ensure that a valid event was allocated
+    nmc_event_t eventAllocate(bool bData) {
+        return nmc_event_allocate(m_opaque, bData);
+    }
+
+    ///
+    /// @brief Free an event
+    ///
+    /// Releases the event reservation to the nmc attach domain (forwards to nmc_event_free() with this object's handle)
+    ///
+    /// @param event the event to free
+    /// @return NMC_SUCCESS or error number indicating reason for failure
+    nmc_status_t eventFree(nmc_event_t event) {
+        return nmc_event_free(m_opaque, event);
+    }
+
+    ///
+    /// @brief Send a request for the event destination
+    ///
+    /// Events are delivered to specific channels identified by destination address in
+    /// the attach domain. This function is used to request the destination address.
+    /// The address can be retrieved from the response using NmcResponse::eventDestination()
+    ///
+    /// @param cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param evNum the nmc_event_t opaque pointer
+    /// @return NMC_SUCCESS or error number indicating reason for failure (the status of nmc_event_destination() is returned as-is)
+    nmc_status_t eventDestination(NmcCmdProps const &cmdProps, nmc_event_t evNum) {
+        return nmc_event_destination(m_opaque, cmdProps.m_opaque, evNum);
+    }
+
+    ///
+    /// @brief Sets the event delivery mode to simple (1-to-1)
+    ///
+    /// @param cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param evNum the event number
+    /// @return Status from nmc_event_simple_mode() (NOTE(review): presumably NMC_SUCCESS or an error number, as for the sibling calls - confirm)
+    nmc_status_t eventSimpleMode(NmcCmdProps const &cmdProps, nmc_event_t evNum) {
+        return nmc_event_simple_mode(m_opaque, cmdProps.m_opaque, evNum);
+    }
+
+    ///
+    /// @brief Sets the event delivery mode to broadcast (1-to-all)
+    ///
+    /// @param cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param evNum the event number
+    /// @param evChan The channel on which to set the mode.
+    /// @return Status from nmc_event_broadcast_mode() (NOTE(review): presumably NMC_SUCCESS or an error number, as for the sibling calls - confirm)
+    nmc_status_t eventBroadcastMode(NmcCmdProps const &cmdProps, nmc_event_t evNum, uint16_t evChan) {
+        return nmc_event_broadcast_mode(m_opaque, cmdProps.m_opaque, evNum, evChan);
+    }
+
+    ///
+    /// @brief Sets the event delivery mode to Simple collective (waits for a specified number of events to be received, similar to a barrier)
+    ///
+    /// @param cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param evNum the event number
+    /// @param evCnt The number of events before trigger
+    /// @return Status from nmc_event_collect_simple_mode() (NOTE(review): presumably NMC_SUCCESS or an error number, as for the sibling calls - confirm)
+    nmc_status_t eventCollectSimpleMode(NmcCmdProps const &cmdProps, nmc_event_t evNum, uint16_t evCnt) {
+        return nmc_event_collect_simple_mode(m_opaque, cmdProps.m_opaque, evNum, evCnt);
+    }
+
+    /// @brief Sets the event delivery mode to Reduce collective (waits for a specified number of events to be received, similar to a barrier)
+    ///        and perform reduction operation on message data
+    /// @param cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param opType The reduction operation type
+    /// @param opSize The reduction operation size (4/8 bytes)
+    /// @param evNum the event number
+    /// @param evCnt The number of events before trigger
+    /// @param evData Passed through unchanged to nmc_event_collect_reduce_mode() (NOTE(review): not documented here; presumably the data value used by the reduction - confirm)
+    /// @return Status from nmc_event_collect_reduce_mode() (NOTE(review): presumably NMC_SUCCESS or an error number, as for the sibling calls - confirm)
+    nmc_status_t eventCollectReduceMode(NmcCmdProps const &cmdProps, ENmcEventReduceOpType opType, ENmcEventReduceOpSize opSize, nmc_event_t evNum, uint16_t evCnt, uint64_t evData) {
+        return nmc_event_collect_reduce_mode(m_opaque, cmdProps.m_opaque, opType, opSize, evNum, evCnt, evData);
+    }
+
+    ///
+    /// @brief Sets the event delivery mode to Cascade collective (waits for a specified number of events to be received, similar to a barrier)
+    ///        and when all events have been received, send a Cascade event to a target device
+    ///
+    /// @param cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param deviceId The target device ID
+    /// @param cascadeQueueId The queue ID on the targeted device
+    /// @param evNum the event number
+    /// @param evCnt The number of events before trigger
+    /// @return Status from nmc_event_collect_cascade_mode() (NOTE(review): presumably NMC_SUCCESS or an error number, as for the sibling calls - confirm)
+    nmc_status_t eventCollectCascadeMode(NmcCmdProps const &cmdProps, uint32_t deviceId, uint32_t cascadeQueueId, nmc_event_t evNum, uint16_t evCnt) {
+        return nmc_event_collect_cascade_mode(m_opaque, cmdProps.m_opaque, deviceId, cascadeQueueId, evNum, evCnt);
+    }
+
+    ///
+    /// @brief Send a message without data
+    ///
+    /// @param cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param evDest the destination of the event, previously returned with NmcResponse::eventDestination
+    /// @return NMC_SUCCESS or enum indicating reason for failure (the status of nmc_event_send() is returned as-is)
+    nmc_status_t eventSend(NmcCmdProps const &cmdProps, uint64_t evDest) {
+        return nmc_event_send(m_opaque, cmdProps.m_opaque, evDest);
+    }
+
+    ///
+    /// @brief Send a message with data
+    ///
+    /// @param cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param evDest the destination of the event, previously returned with NmcResponse::eventDestination
+    /// @param evData the data to be delivered with the event
+    /// @return NMC_SUCCESS or enum indicating reason for failure (the status of nmc_event_send_data() is returned as-is)
+    nmc_status_t eventSend(NmcCmdProps const &cmdProps, uint64_t evDest, uint64_t evData) {
+        return nmc_event_send_data(m_opaque, cmdProps.m_opaque, evDest, evData);
+    }
+
+    ///
+    /// @brief Broadcast a message without data
+    ///
+    /// @param cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param evNum the event number
+    /// @param evChan the channel on which to broadcast
+    /// @return NMC_SUCCESS or enum indicating reason for failure (the status of nmc_event_broadcast() is returned as-is)
+    nmc_status_t eventBroadcast(NmcCmdProps const &cmdProps, nmc_event_t evNum, uint16_t evChan) {
+        return nmc_event_broadcast(m_opaque, cmdProps.m_opaque, evNum, evChan);
+    }
+
+    ///
+    /// @brief Broadcast a message with data
+    ///
+    /// @param cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param evNum the event number
+    /// @param evChan the channel on which to broadcast
+    /// @param evData the data to be delivered with the event
+    /// @return NMC_SUCCESS or enum indicating reason for failure (the status of nmc_event_broadcast_data() is returned as-is)
+    nmc_status_t eventBroadcast(NmcCmdProps const &cmdProps, nmc_event_t evNum, uint16_t evChan, uint64_t evData) {
+        return nmc_event_broadcast_data(m_opaque, cmdProps.m_opaque, evNum, evChan, evData);
+    }
+
+    ///
+    /// @brief Receive an event
+    ///
+    /// @param cmdProps Command Properties; used to specify id, atomic, and fencing operations
+    /// @param evNum the event number to check
+    /// @return NMC_SUCCESS if the event is present; otherwise whatever nmc_event_receive() reports (NOTE(review): the non-success codes are not documented here)
+    nmc_status_t eventReceive(NmcCmdProps const &cmdProps, nmc_event_t evNum) {
+        return nmc_event_receive(m_opaque, cmdProps.m_opaque, evNum);
+    }
+
+    // Atomics
+    template <typename T>
+    nmc_status_t atomicAdd(NmcCmdProps const &cmdProps, T* pAddr, T data) {
+        // Non-floating-point types share one entry point; the operand size is passed as sizeof(T).
+        if (!std::is_floating_point<T>::value)
+            return nmc_atomic_add(m_opaque, cmdProps.m_opaque, pAddr, data, sizeof(T));
+        // Floating point: 4-byte types take the 32-bit variant, every other size the 64-bit one.
+        if (sizeof(T) != 4)
+            return nmc_atomic_fadd64(m_opaque, cmdProps.m_opaque, pAddr, (double)data);
+        return nmc_atomic_fadd32(m_opaque, cmdProps.m_opaque, pAddr, (float)data);
+    }
+
+    template <typename T>
+    nmc_status_t atomicXor(NmcCmdProps const &cmdProps, T* pAddr, T data) {
+        return nmc_atomic_xor(m_opaque, cmdProps.m_opaque, pAddr, data, sizeof(T)); // forwards to the C API; operand size is taken from T
+    }
+
+    template <typename T>
+    nmc_status_t atomicAnd(NmcCmdProps const &cmdProps, T* pAddr, T data) {
+        return nmc_atomic_and(m_opaque, cmdProps.m_opaque, pAddr, data, sizeof(T)); // forwards to the C API; operand size is taken from T
+    }
+
+    template <typename T>
+    nmc_status_t atomicOr(NmcCmdProps const &cmdProps, T* pAddr, T data) {
+        return nmc_atomic_or(m_opaque, cmdProps.m_opaque, pAddr, data, sizeof(T)); // forwards to the C API; operand size is taken from T
+    }
+
+    template <typename T>
+    nmc_status_t atomicMin(NmcCmdProps const &cmdProps, T* pAddr, T data) {
+        // Dispatch on T: floating-point types by width, all other types by signedness.
+        if (std::is_floating_point<T>::value) {
+            // 4-byte types take the 32-bit variant, every other size the 64-bit one.
+            if (sizeof(T) != 4)
+                return nmc_atomic_fmin64(m_opaque, cmdProps.m_opaque, pAddr, (double)data);
+            return nmc_atomic_fmin32(m_opaque, cmdProps.m_opaque, pAddr, (float)data);
+        }
+        // Unsigned types use the "u" entry point; the rest use the plain one. Size is sizeof(T).
+        if (!std::is_unsigned<T>::value)
+            return nmc_atomic_min(m_opaque, cmdProps.m_opaque, pAddr, data, sizeof(T));
+        return nmc_atomic_minu(m_opaque, cmdProps.m_opaque, pAddr, data, sizeof(T));
+    }
+
+    template <typename T>
+    nmc_status_t atomicMax(NmcCmdProps const &cmdProps, T* pAddr, T data) {
+        // Dispatch on T: floating-point types by width, all other types by signedness.
+        if (std::is_floating_point<T>::value) {
+            // 4-byte types take the 32-bit variant, every other size the 64-bit one.
+            if (sizeof(T) != 4)
+                return nmc_atomic_fmax64(m_opaque, cmdProps.m_opaque, pAddr, (double)data);
+            return nmc_atomic_fmax32(m_opaque, cmdProps.m_opaque, pAddr, (float)data);
+        }
+        // Unsigned types use the "u" entry point; the rest use the plain one. Size is sizeof(T).
+        if (!std::is_unsigned<T>::value)
+            return nmc_atomic_max(m_opaque, cmdProps.m_opaque, pAddr, data, sizeof(T));
+        return nmc_atomic_maxu(m_opaque, cmdProps.m_opaque, pAddr, data, sizeof(T));
+    }
+
+    NmcSpinLock spinLockCreate(){
+        return NmcSpinLock(m_opaque); // NmcSpinLock deletes its copy constructor, so returning this temporary by value relies on C++17 guaranteed copy elision
+    }
+
+#if 0 // compiled out: the mutex and barrier helpers below are not part of the built class
+    NmcMutex* mutexCreate();
+    nmc_status_t mutexDestroy(NmcMutex* mutex);
+
+
+    NmcBarrier* barrierCreate() {
+        return new NmcBarrier(*this);
+    }
+
+#endif
+
+private:
+    nmc_t m_opaque;
+};
diff --git a/src/micron/nmc_errno.h b/src/micron/nmc_errno.h
new file mode 100644
index 00000000..81382bf9
--- /dev/null
+++ b/src/micron/nmc_errno.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright (C) 2024 Micron Technology, Inc.
+ *
+ * This file is the confidential and proprietary property of
+ *              Micron Technology, Inc.
+ */
+/*
+ *  NMC Runtime Errno API
+ */
+#pragma once
+
+/*
+ * Error values that are shared across the Host and TE (0 is success; failures use 128..149).
+ */
+#define NMC_SUCCESS        0
+#define NMC_INV_NMC        128
+#define NMC_NO_CHILD       129
+#define NMC_NO_IMAGE       130
+#define NMC_RETRY          131
+#define NMC_NOT_SUPP       132
+#define NMC_INV_BARRIER    133
+#define NMC_INV_MTX        134
+#define NMC_NOTMTX         135
+#define NMC_DEADLCK        136
+#define NMC_RESP_ID        137
+#define NMC_RESP_STATUS    138
+#define NMC_INV_BARRIER_ATTR 139
+#define NMC_INV_VALUE      140
+#define NMC_INV_CMD        141
+#define NMC_INV_AMO_ADDR   142
+#define NMC_NO_MEM         143
+#define NMC_INV_APP_ENG    144
+#define NMC_INV_ARG_TYPE   145
+#define NMC_INV_RTN_ARG_CNT 146
+#define NMC_INV_DEV_ID     147
+#define NMC_INV_HUQ_ID     148
+#define NMC_NO_TENANT	   149
+
+
+/* Host event interface error values (150..155) */
+#define NMC_INV_EVMODE     150
+#define NMC_INV_EVNUM      151
+#define NMC_INV_EVCHAN     152
+#define NMC_INV_EVCNT      153
+#define NMC_RCV_TERM       154
+#define NMC_INV_EVDST      155
+
+/*
+ * nmc_get_response can return the following for error conditions (252..255)
+ */
+#define NMC_NO_TE	   252
+#define NMC_NO_DM	   253
+#define NMC_NO_SE	   254
+#define NMC_NOT_HUC_CMD	   255
diff --git a/src/micron/nmc_sync.h b/src/micron/nmc_sync.h
new file mode 100644
index 00000000..6db9d4ff
--- /dev/null
+++ b/src/micron/nmc_sync.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2024 Micron Technology, Inc.
+ *
+ * This file is the confidential and proprietary property of
+ * Micron Technology, Inc.
+ */
+
+#pragma once
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "nmc_errno.h"
+#include "nmc_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Synchronization primitives
+ *
+ * All of the synchronization data types are opaque. We only define
+ * a pointer of type X.
+ *
+ * spin locks:
+ *
+ * Note: The nmc_spinlock_create and nmc_spinlock_destroy functions are only
+ * available on the host. The nmc_spinlock_trylock, nmc_spinlock_lock,
+ * and nmc_spinlock_unlock functions are declared for both host and RISC-V.
+ *
+ */
+#if !defined(__riscv) /* host-only: create and destroy */
+extern nmc_status_t nmc_spinlock_create(nmc_t, nmc_lock_t *);
+extern nmc_status_t nmc_spinlock_destroy(nmc_lock_t);
+#endif
+extern nmc_status_t nmc_spinlock_trylock(nmc_lock_t lock);
+extern nmc_status_t nmc_spinlock_lock(nmc_lock_t lock);
+extern nmc_status_t nmc_spinlock_unlock(nmc_lock_t lock);
+
+#if 0 /* compiled out: mutex and barrier prototypes */
+extern nmc_status_t nmc_mutex_create(nmc_t nmc, nmc_mutex_t *mtx);
+extern nmc_status_t nmc_mutex_destroy(nmc_t nmc, nmc_mutex_t mtx);
+extern nmc_status_t nmc_mutex_trylock(nmc_mutex_t mtx);
+extern nmc_status_t nmc_mutex_lock(nmc_mutex_t mtx);
+extern nmc_status_t nmc_mutex_unlock(nmc_mutex_t mtx);
+
+/*
+ * Barriers (also compiled out)
+ */
+
+extern nmc_barrier_t nmc_barrier_create(nmc_t nmc);
+extern void nmc_barrier_destroy(nmc_t nmc, nmc_barrier_t barrier);
+extern nmc_status_t nmc_barrier_add(nmc_t nmc, nmc_barrier_t barrier, uint32_t threads);
+#endif
+
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/micron/nmc_sync.hpp b/src/micron/nmc_sync.hpp
new file mode 100644
index 00000000..1c7dcc6e
--- /dev/null
+++ b/src/micron/nmc_sync.hpp
@@ -0,0 +1,55 @@
+//
+// Copyright (C) 2024 Micron Technology, Inc.
+//
+// This file is the confidential and proprietary property of
+// Micron Technology, Inc.
+//
+
+#pragma once
+
+#include "nmc_sync.h"
+
+/// Simple spinlocks for Near Memory Compute.
+/// Non-copyable; the constructor is private, so instances are made by the friend class Nmc.
+/// trylock(), lock() and unlock() return the status of the matching nmc_spinlock_* call as an int.
+class NmcSpinLock {
+    friend class Nmc;
+
+private:
+    nmc_lock_t m_spinlock{};
+
+#if !defined(__riscv)
+    // Host: create the lock; the status is discarded, so the handle is value-initialized above rather than left indeterminate.
+    NmcSpinLock(nmc_t pnmc){
+	(void) nmc_spinlock_create(pnmc, &m_spinlock);
+    }
+#else
+    // Create and destroy are not supported on the NDCG device, so construction does nothing.
+    NmcSpinLock(){
+    }
+#endif
+
+
+public:
+#if !defined(__riscv)
+    ~NmcSpinLock() {
+	(void) nmc_spinlock_destroy(m_spinlock);
+    }
+#else
+    ~NmcSpinLock() {
+    }
+#endif
+    NmcSpinLock(const NmcSpinLock&) = delete;
+    NmcSpinLock& operator=(const NmcSpinLock&) = delete;
+
+    int trylock() {
+	return nmc_spinlock_trylock(m_spinlock);
+    }
+    int lock() {
+	return nmc_spinlock_lock(m_spinlock);
+    }
+    int unlock() {
+	return nmc_spinlock_unlock(m_spinlock);
+    }
+};
+
diff --git a/src/micron/nmc_te.hpp b/src/micron/nmc_te.hpp
new file mode 100644
index 00000000..f650a56f
--- /dev/null
+++ b/src/micron/nmc_te.hpp
@@ -0,0 +1,1692 @@
+/*
+ * Copyright (C) 2024 Micron Technology, Inc.
+ *
+ * This file is the confidential and proprietary property of
+ *                 Micron Technology, Inc.
+ */
+#pragma once
+
+#include <stdint.h>
+#include <cstring>
+#include <type_traits>
+#include "nmc_te_intrin.h"
+
+enum NmcAppEngine { NmcAppEngInvalid = 0, NmcAppEngTe = 1, NmcAppEngSe = 2 };	// engine selector; nmcFiberCreate() returns NMC_INV_APP_ENG for anything other than Te or Se
+enum NmcAmoMode { NmcAmoRelaxMode = 0, NmcAmoAqMode = 1, NmcAmoRlMode = 2, NmcAmoAqRlMode = 3 };	// NOTE(review): presumably relaxed / acquire / release / acquire+release ordering - confirm
+
+class NmcTeCmdProps {	// per-command options used by the TE-side calls; plain setters and getters over the fields below
+public:
+	NmcTeCmdProps(uint16_t cmdId = 0) {
+		// Zero the whole word first: the 12 bits not covered by m_cmdId and the four flags
+		// would otherwise be handed out uninitialized by getThreadCreateCmdProps().
+		m_threadCreateCmdProps = 0;
+		m_cmdId = cmdId;
+		m_isArg1Fp = m_isArg2Fp = m_isArg3Fp = m_isArg4Fp = false;
+	}
+
+	uint64_t getThreadCreateCmdProps() const {	// packed id + floating-point flags, widened from 32 bits
+		return m_threadCreateCmdProps;
+	}
+
+	void setCmdId(uint16_t cmdId) {
+		m_cmdId = cmdId;
+	}
+	uint16_t getCmdId() const {
+		return m_cmdId;
+	}
+
+	void setCmdAtomic(bool bAtomic) {
+		m_bCmdAtomic = bAtomic;
+	}
+	bool isCmdAtomic() const {
+		return m_bCmdAtomic;
+	}
+
+	void setRtnArgCnt(uint32_t args) {
+		m_rtnArgCnt = args;
+	}
+	uint32_t getRtnArgCnt() const {
+		return m_rtnArgCnt;
+	}
+
+	void setAppEngine(NmcAppEngine appEng) {
+		m_appEng = appEng;
+	}
+	NmcAppEngine getAppEngine() const {
+		return m_appEng;
+	}
+
+	void setEventData(uint64_t eventData) {
+		m_bEventData = true;	// storing a payload also marks it as present
+		m_eventData = eventData;
+	}
+	bool hasEventData() const {
+		return m_bEventData;
+	}
+	uint64_t getEventData() const {
+		return m_eventData;
+	}
+
+	void setNonBlocking(bool bNonBlocking) {
+		m_bNonBlocking = bNonBlocking;
+	}
+	bool isNonBlocking() const {
+		return m_bNonBlocking;
+	}
+
+	void setAmoMode(NmcAmoMode mode) {
+		m_amoMode = mode;
+	}
+	NmcAmoMode getAmoMode() const {
+		return m_amoMode;
+	}
+
+	void setNoFlush(bool bNoFlush) {
+		m_bNoFlush = bNoFlush;
+	}
+	bool isNoFlush() const {
+		return m_bNoFlush;
+	}
+
+	void setNoInvalidate(bool bNoInvalidate) {
+		m_bNoInvalidate = bNoInvalidate;
+	}
+	bool isNoInvalidate() const {
+		return m_bNoInvalidate;
+	}
+
+	void setBusyFail(bool bBusyFail) {
+		m_bBusyFail = bBusyFail;
+	}
+	bool isBusyFail() const {
+		return m_bBusyFail;
+	}
+
+	void setNonTemporal(bool bNonTemporal) {
+		m_bNonTemporal = bNonTemporal;
+	}
+	bool isNonTemporal() const {
+		return m_bNonTemporal;
+	}
+
+	void setSpacial(bool bSpacial) {
+		m_bSpacial = bSpacial;
+	}
+	bool isSpacial() const {
+		return m_bSpacial;
+	}
+
+	void setArg1Fp(bool isArgFp) {
+		m_isArg1Fp = isArgFp;
+	}
+	void setArg2Fp(bool isArgFp) {
+		m_isArg2Fp = isArgFp;
+	}
+	void setArg3Fp(bool isArgFp) {
+		m_isArg3Fp = isArgFp;
+	}
+	void setArg4Fp(bool isArgFp) {
+		m_isArg4Fp = isArgFp;
+	}
+
+private:
+	union {	// the named fields overlay one 32-bit word (relies on compiler support for union punning)
+		struct {
+			uint16_t m_cmdId;
+			uint16_t m_isArg1Fp : 1;
+			uint16_t m_isArg2Fp : 1;
+			uint16_t m_isArg3Fp : 1;
+			uint16_t m_isArg4Fp : 1;
+		};
+		uint32_t m_threadCreateCmdProps;	// whole word, as returned by getThreadCreateCmdProps()
+	};
+	bool m_bCmdAtomic = false;
+	uint32_t m_rtnArgCnt = 3;
+	NmcAppEngine m_appEng = NmcAppEngInvalid;
+	bool m_bEventData = false;
+	uint64_t m_eventData = 0;
+	bool m_bNonBlocking = false;
+	NmcAmoMode m_amoMode = NmcAmoAqRlMode;
+	bool m_bNoFlush = false;
+	bool m_bNoInvalidate = false;
+	bool m_bBusyFail = false;
+	bool m_bNonTemporal = false;
+	bool m_bSpacial = false;
+};
+
+class NmcTeAppEng : public NmcTeCmdProps {
+public:
+	// Base-class defaults, with the application engine preset to NmcAppEngTe.
+	NmcTeAppEng() {
+		this->setAppEngine(NmcAppEngTe);
+	}
+};
+
+class NmcTeRtnArgCnt : public NmcTeCmdProps {
+public:
+	// Base-class defaults, with the return-argument count replaced by the caller's value.
+	NmcTeRtnArgCnt(uint32_t rtnArgCnt) {
+		this->setRtnArgCnt(rtnArgCnt);
+	}
+};
+
+class NmcTeNonBlocking : public NmcTeCmdProps {
+public:
+	// Base-class defaults, with the non-blocking flag switched on.
+	NmcTeNonBlocking() {
+		this->setNonBlocking(true);
+	}
+};
+
+class NmcTeEventData : public NmcTeCmdProps {
+public:
+	// Base-class defaults plus an event payload; setEventData() also marks the
+	// payload as present, so hasEventData() is true even for the default value 0.
+	NmcTeEventData(uint64_t data = 0) { this->setEventData(data); }
+};
+
+// Widen one argument to the 64-bit word used for fiber and thread arguments.
+template<typename T>
+uint64_t extendArg(T arg)
+{
+	static_assert((std::is_integral<T>::value || std::is_pointer<T>::value || std::is_floating_point<T>::value) &&
+		sizeof(T) <= sizeof(uint64_t));
+
+	// Integers and pointers: a plain cast, via int64_t for signed types
+	// (sign extension) and straight to uint64_t for everything else.
+	if (!std::is_floating_point<T>::value)
+		return std::is_signed<T>::value ? (int64_t)arg : (uint64_t)arg;
+
+	// Floating point: hand the value to the fmv_x_* intrinsic that matches its width.
+	uint64_t bits = 0;
+	if (sizeof(T) == 4)
+		bits = fmv_x_s(*(float*)&arg);
+	else if (sizeof(T) == 8)
+		bits = fmv_x_d(*(double*)&arg);
+	else
+		assert(0 && "Invalid Type");
+	return bits;
+}
+
+// Store a 64-bit argument word into *arg: floating point takes the first sizeof(T) bytes of u64, other types a value cast.
+template<typename T>
+void moveArg(T *arg, uint64_t u64) {
+	static_assert((std::is_integral<T>::value || std::is_pointer<T>::value || std::is_floating_point<T>::value) && sizeof(T) <= sizeof(uint64_t));
+	if (std::is_floating_point<T>::value)
+		memcpy(arg, &u64, sizeof(T));	// copy only sizeof(T) bytes: copying 8 would write past a 4-byte float
+	else
+		*arg = (T)u64;
+}
+
+// Fiber creation, no-argument form: picks one nmc_xfc_* intrinsic from the app engine, busy-fail, no-flush,
+// return-argument-count (0..3; 3 selects the *_nr variants) and atomic settings held in cmdProps.
+template<typename TG = uint64_t, typename F = uint64_t>
+int nmcFiberCreate(NmcTeCmdProps cmdProps, TG pTarget, F pFunc)
+{
+	uint64_t seCmdProps = cmdProps.getThreadCreateCmdProps();
+	uint64_t teCmdProps = cmdProps.getThreadCreateCmdProps();	// same value as seCmdProps in this overload
+
+	int64_t status = NMC_SUCCESS;
+	switch (cmdProps.getAppEngine()) {
+	case NmcAppEngTe:
+		if (cmdProps.isBusyFail()) {	// busy-fail variants: status comes from the intrinsic; isCmdAtomic() is not consulted
+			if (cmdProps.isNoFlush()) {
+				switch (cmdProps.getRtnArgCnt()) {
+				case 0: status = nmc_xfc_te_bf_c0_r0_nf((void *)pTarget, (void *)pFunc, teCmdProps); break;
+				case 1: status = nmc_xfc_te_bf_c0_r1_nf((void *)pTarget, (void *)pFunc, teCmdProps); break;
+				case 2: status = nmc_xfc_te_bf_c0_r2_nf((void *)pTarget, (void *)pFunc, teCmdProps); break;
+				case 3: status = nmc_xfc_te_bf_c0_nr_nf((void *)pTarget, (void *)pFunc, teCmdProps); break;
+				default: status = NMC_INV_RTN_ARG_CNT;
+				}
+			} else {
+				switch (cmdProps.getRtnArgCnt()) {
+				case 0: status = nmc_xfc_te_bf_c0_r0((void *)pTarget, (void *)pFunc, teCmdProps); break;
+				case 1: status = nmc_xfc_te_bf_c0_r1((void *)pTarget, (void *)pFunc, teCmdProps); break;
+				case 2: status = nmc_xfc_te_bf_c0_r2((void *)pTarget, (void *)pFunc, teCmdProps); break;
+				case 3: status = nmc_xfc_te_bf_c0_nr((void *)pTarget, (void *)pFunc, teCmdProps); break;
+				default: status = NMC_INV_RTN_ARG_CNT;
+				}
+			}
+		} else {	// plain variants: no result is captured, so status stays NMC_SUCCESS unless the count is invalid
+			if (cmdProps.isNoFlush()) {
+				switch (cmdProps.getRtnArgCnt()) {
+				case 0:
+					if (cmdProps.isCmdAtomic())
+						nmc_xfc_te_c0_r0_nf_at((void *)pTarget, (void *)pFunc, teCmdProps);
+					else
+						nmc_xfc_te_c0_r0_nf((void *)pTarget, (void *)pFunc, teCmdProps);
+					break;
+				case 1:
+					if (cmdProps.isCmdAtomic())
+						nmc_xfc_te_c0_r1_nf_at((void *)pTarget, (void *)pFunc, teCmdProps);
+					else
+						nmc_xfc_te_c0_r1_nf((void *)pTarget, (void *)pFunc, teCmdProps);
+					break;
+				case 2:
+					if (cmdProps.isCmdAtomic())
+						nmc_xfc_te_c0_r2_nf_at((void *)pTarget, (void *)pFunc, teCmdProps);
+					else
+						nmc_xfc_te_c0_r2_nf((void *)pTarget, (void *)pFunc, teCmdProps);
+					break;
+				case 3:
+					if (cmdProps.isCmdAtomic())
+						nmc_xfc_te_c0_nr_nf_at((void *)pTarget, (void *)pFunc, teCmdProps);
+					else
+						nmc_xfc_te_c0_nr_nf((void *)pTarget, (void *)pFunc, teCmdProps);
+					break;
+				default:
+					status = NMC_INV_RTN_ARG_CNT;
+				}
+			} else {
+				switch (cmdProps.getRtnArgCnt()) {
+				case 0:
+					if (cmdProps.isCmdAtomic())
+						nmc_xfc_te_c0_r0_at((void *)pTarget, (void *)pFunc, teCmdProps);
+					else
+						nmc_xfc_te_c0_r0((void *)pTarget, (void *)pFunc, teCmdProps);
+					break;
+				case 1:
+					if (cmdProps.isCmdAtomic())
+						nmc_xfc_te_c0_r1_at((void *)pTarget, (void *)pFunc, teCmdProps);
+					else
+						nmc_xfc_te_c0_r1((void *)pTarget, (void *)pFunc, teCmdProps);
+					break;
+				case 2:
+					if (cmdProps.isCmdAtomic())
+						nmc_xfc_te_c0_r2_at((void *)pTarget, (void *)pFunc, teCmdProps);
+					else
+						nmc_xfc_te_c0_r2((void *)pTarget, (void *)pFunc, teCmdProps);
+					break;
+				case 3:
+					if (cmdProps.isCmdAtomic())
+						nmc_xfc_te_c0_nr_at((void *)pTarget, (void *)pFunc, teCmdProps);
+					else
+						nmc_xfc_te_c0_nr((void *)pTarget, (void *)pFunc, teCmdProps);
+					break;
+				default:
+					status = NMC_INV_RTN_ARG_CNT;
+				}
+			}
+		}
+		break;
+	case NmcAppEngSe:	// SE variants: busy-fail and atomic settings are not consulted; no result is captured
+		if (cmdProps.isNoFlush()) {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0:
+				nmc_xfc_se_c0_r0_nf((void *)pTarget, (void *)pFunc, seCmdProps);
+				break;
+			case 1:
+				nmc_xfc_se_c0_r1_nf((void *)pTarget, (void *)pFunc, seCmdProps);
+				break;
+			case 2:
+				nmc_xfc_se_c0_r2_nf((void *)pTarget, (void *)pFunc, seCmdProps);
+				break;
+			case 3:
+				nmc_xfc_se_c0_nr_nf((void *)pTarget, (void *)pFunc, seCmdProps);
+				break;
+			default:
+				status = NMC_INV_RTN_ARG_CNT;
+			}
+		} else {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0:
+				nmc_xfc_se_c0_r0((void *)pTarget, (void *)pFunc, seCmdProps);
+				break;
+			case 1:
+				nmc_xfc_se_c0_r1((void *)pTarget, (void *)pFunc, seCmdProps);
+				break;
+			case 2:
+				nmc_xfc_se_c0_r2((void *)pTarget, (void *)pFunc, seCmdProps);
+				break;
+			case 3:
+				nmc_xfc_se_c0_nr((void *)pTarget, (void *)pFunc, seCmdProps);
+				break;
+			default:
+				status = NMC_INV_RTN_ARG_CNT;
+			}
+		}
+		break;
+	default:
+		return NMC_INV_APP_ENG;	// neither TE nor SE selected (NmcAppEngInvalid is the NmcTeCmdProps default)
+	}
+	return status;	// int64_t converted to the int return type
+}
+
+//
+// nmcFiberCreate (one argument)
+//     Issues a fiber-create command, picking the nmc_xfc_* intrinsic from
+//     the engine, busy-fail, no-flush, atomic and return-count settings
+//     held in cmdProps.
+//
+// arguments
+//     cmdProps  command properties (taken by value; setArg1Fp() is applied
+//               to this local copy)
+//     pTarget   forwarded unchanged to the intrinsic
+//     pFunc     forwarded unchanged to the intrinsic
+//     arg1      fiber argument, converted to uint64_t with extendArg()
+//
+// return
+//     The intrinsic's result on the TE busy-fail path, NMC_INV_RTN_ARG_CNT
+//     when the return-argument count is not 0..3, NMC_INV_APP_ENG for an
+//     unrecognized engine, NMC_SUCCESS otherwise.
+//
+template<typename T1 = uint64_t>
+int nmcFiberCreate(NmcTeCmdProps cmdProps, void *pTarget, void *pFunc,
+	T1 arg1)
+{
+	uint64_t x1 = extendArg(arg1);
+
+	// The SE command word is read before the FP flag is recorded, the TE
+	// command word after.
+	uint64_t seProps = cmdProps.getThreadCreateCmdProps();
+	cmdProps.setArg1Fp(std::is_floating_point<T1>::value);
+	uint64_t teProps = cmdProps.getThreadCreateCmdProps();
+
+	int64_t rc = NMC_SUCCESS;
+	switch (cmdProps.getAppEngine()) {
+	case NmcAppEngTe: {
+		const bool busyFail = cmdProps.isBusyFail();
+		const bool noFlush = cmdProps.isNoFlush();
+
+		if (busyFail && noFlush) {
+			// Busy-fail variants report a status; keep it.
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0: rc = nmc_xfc_te_bf_c1_r0_nf(pTarget, pFunc, teProps, x1); break;
+			case 1: rc = nmc_xfc_te_bf_c1_r1_nf(pTarget, pFunc, teProps, x1); break;
+			case 2: rc = nmc_xfc_te_bf_c1_r2_nf(pTarget, pFunc, teProps, x1); break;
+			case 3: rc = nmc_xfc_te_bf_c1_nr_nf(pTarget, pFunc, teProps, x1); break;
+			default: rc = NMC_INV_RTN_ARG_CNT; break;
+			}
+		} else if (busyFail) {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0: rc = nmc_xfc_te_bf_c1_r0(pTarget, pFunc, teProps, x1); break;
+			case 1: rc = nmc_xfc_te_bf_c1_r1(pTarget, pFunc, teProps, x1); break;
+			case 2: rc = nmc_xfc_te_bf_c1_r2(pTarget, pFunc, teProps, x1); break;
+			case 3: rc = nmc_xfc_te_bf_c1_nr(pTarget, pFunc, teProps, x1); break;
+			default: rc = NMC_INV_RTN_ARG_CNT; break;
+			}
+		} else if (noFlush) {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c1_r0_nf(pTarget, pFunc, teProps, x1);
+				else
+					nmc_xfc_te_c1_r0_nf_at(pTarget, pFunc, teProps, x1);
+				break;
+			case 1:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c1_r1_nf(pTarget, pFunc, teProps, x1);
+				else
+					nmc_xfc_te_c1_r1_nf_at(pTarget, pFunc, teProps, x1);
+				break;
+			case 2:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c1_r2_nf(pTarget, pFunc, teProps, x1);
+				else
+					nmc_xfc_te_c1_r2_nf_at(pTarget, pFunc, teProps, x1);
+				break;
+			case 3:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c1_nr_nf(pTarget, pFunc, teProps, x1);
+				else
+					nmc_xfc_te_c1_nr_nf_at(pTarget, pFunc, teProps, x1);
+				break;
+			default:
+				rc = NMC_INV_RTN_ARG_CNT;
+				break;
+			}
+		} else {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c1_r0(pTarget, pFunc, teProps, x1);
+				else
+					nmc_xfc_te_c1_r0_at(pTarget, pFunc, teProps, x1);
+				break;
+			case 1:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c1_r1(pTarget, pFunc, teProps, x1);
+				else
+					nmc_xfc_te_c1_r1_at(pTarget, pFunc, teProps, x1);
+				break;
+			case 2:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c1_r2(pTarget, pFunc, teProps, x1);
+				else
+					nmc_xfc_te_c1_r2_at(pTarget, pFunc, teProps, x1);
+				break;
+			case 3:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c1_nr(pTarget, pFunc, teProps, x1);
+				else
+					nmc_xfc_te_c1_nr_at(pTarget, pFunc, teProps, x1);
+				break;
+			default:
+				rc = NMC_INV_RTN_ARG_CNT;
+				break;
+			}
+		}
+		break;
+	}
+	case NmcAppEngSe:
+		if (cmdProps.isNoFlush()) {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0: nmc_xfc_se_c1_r0_nf(pTarget, pFunc, seProps, x1); break;
+			case 1: nmc_xfc_se_c1_r1_nf(pTarget, pFunc, seProps, x1); break;
+			case 2: nmc_xfc_se_c1_r2_nf(pTarget, pFunc, seProps, x1); break;
+			case 3: nmc_xfc_se_c1_nr_nf(pTarget, pFunc, seProps, x1); break;
+			default: rc = NMC_INV_RTN_ARG_CNT; break;
+			}
+		} else {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0: nmc_xfc_se_c1_r0(pTarget, pFunc, seProps, x1); break;
+			case 1: nmc_xfc_se_c1_r1(pTarget, pFunc, seProps, x1); break;
+			case 2: nmc_xfc_se_c1_r2(pTarget, pFunc, seProps, x1); break;
+			case 3: nmc_xfc_se_c1_nr(pTarget, pFunc, seProps, x1); break;
+			default: rc = NMC_INV_RTN_ARG_CNT; break;
+			}
+		}
+		break;
+	default:
+		return NMC_INV_APP_ENG;
+	}
+	return rc;
+}
+
+//
+// nmcFiberCreate (two arguments)
+//     Issues a fiber-create command, picking the nmc_xfc_* intrinsic from
+//     the engine, busy-fail, no-flush, atomic and return-count settings
+//     held in cmdProps.
+//
+// arguments
+//     cmdProps    command properties (taken by value; setArg1Fp() and
+//                 setArg2Fp() are applied to this local copy)
+//     pTarget     forwarded unchanged to the intrinsic
+//     pFunc       forwarded unchanged to the intrinsic
+//     arg1, arg2  fiber arguments, converted to uint64_t with extendArg()
+//
+// return
+//     The intrinsic's result on the TE busy-fail path, NMC_INV_RTN_ARG_CNT
+//     when the return-argument count is not 0..3, NMC_INV_APP_ENG for an
+//     unrecognized engine, NMC_SUCCESS otherwise.
+//
+template<typename T1 = uint64_t, typename T2 = uint64_t>
+int nmcFiberCreate(NmcTeCmdProps cmdProps, void *pTarget, void *pFunc,
+	T1 arg1, T2 arg2)
+{
+	uint64_t x1 = extendArg(arg1);
+	uint64_t x2 = extendArg(arg2);
+
+	// The SE command word is read before the FP flags are recorded, the TE
+	// command word after.
+	uint64_t seProps = cmdProps.getThreadCreateCmdProps();
+	cmdProps.setArg1Fp(std::is_floating_point<T1>::value);
+	cmdProps.setArg2Fp(std::is_floating_point<T2>::value);
+	uint64_t teProps = cmdProps.getThreadCreateCmdProps();
+
+	int64_t rc = NMC_SUCCESS;
+	switch (cmdProps.getAppEngine()) {
+	case NmcAppEngTe: {
+		const bool busyFail = cmdProps.isBusyFail();
+		const bool noFlush = cmdProps.isNoFlush();
+
+		if (busyFail && noFlush) {
+			// Busy-fail variants report a status; keep it.
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0: rc = nmc_xfc_te_bf_c2_r0_nf(pTarget, pFunc, teProps, x1, x2); break;
+			case 1: rc = nmc_xfc_te_bf_c2_r1_nf(pTarget, pFunc, teProps, x1, x2); break;
+			case 2: rc = nmc_xfc_te_bf_c2_r2_nf(pTarget, pFunc, teProps, x1, x2); break;
+			case 3: rc = nmc_xfc_te_bf_c2_nr_nf(pTarget, pFunc, teProps, x1, x2); break;
+			default: rc = NMC_INV_RTN_ARG_CNT; break;
+			}
+		} else if (busyFail) {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0: rc = nmc_xfc_te_bf_c2_r0(pTarget, pFunc, teProps, x1, x2); break;
+			case 1: rc = nmc_xfc_te_bf_c2_r1(pTarget, pFunc, teProps, x1, x2); break;
+			case 2: rc = nmc_xfc_te_bf_c2_r2(pTarget, pFunc, teProps, x1, x2); break;
+			case 3: rc = nmc_xfc_te_bf_c2_nr(pTarget, pFunc, teProps, x1, x2); break;
+			default: rc = NMC_INV_RTN_ARG_CNT; break;
+			}
+		} else if (noFlush) {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c2_r0_nf(pTarget, pFunc, teProps, x1, x2);
+				else
+					nmc_xfc_te_c2_r0_nf_at(pTarget, pFunc, teProps, x1, x2);
+				break;
+			case 1:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c2_r1_nf(pTarget, pFunc, teProps, x1, x2);
+				else
+					nmc_xfc_te_c2_r1_nf_at(pTarget, pFunc, teProps, x1, x2);
+				break;
+			case 2:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c2_r2_nf(pTarget, pFunc, teProps, x1, x2);
+				else
+					nmc_xfc_te_c2_r2_nf_at(pTarget, pFunc, teProps, x1, x2);
+				break;
+			case 3:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c2_nr_nf(pTarget, pFunc, teProps, x1, x2);
+				else
+					nmc_xfc_te_c2_nr_nf_at(pTarget, pFunc, teProps, x1, x2);
+				break;
+			default:
+				rc = NMC_INV_RTN_ARG_CNT;
+				break;
+			}
+		} else {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c2_r0(pTarget, pFunc, teProps, x1, x2);
+				else
+					nmc_xfc_te_c2_r0_at(pTarget, pFunc, teProps, x1, x2);
+				break;
+			case 1:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c2_r1(pTarget, pFunc, teProps, x1, x2);
+				else
+					nmc_xfc_te_c2_r1_at(pTarget, pFunc, teProps, x1, x2);
+				break;
+			case 2:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c2_r2(pTarget, pFunc, teProps, x1, x2);
+				else
+					nmc_xfc_te_c2_r2_at(pTarget, pFunc, teProps, x1, x2);
+				break;
+			case 3:
+				if (!cmdProps.isCmdAtomic())
+					nmc_xfc_te_c2_nr(pTarget, pFunc, teProps, x1, x2);
+				else
+					nmc_xfc_te_c2_nr_at(pTarget, pFunc, teProps, x1, x2);
+				break;
+			default:
+				rc = NMC_INV_RTN_ARG_CNT;
+				break;
+			}
+		}
+		break;
+	}
+	case NmcAppEngSe:
+		if (cmdProps.isNoFlush()) {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0: nmc_xfc_se_c2_r0_nf(pTarget, pFunc, seProps, x1, x2); break;
+			case 1: nmc_xfc_se_c2_r1_nf(pTarget, pFunc, seProps, x1, x2); break;
+			case 2: nmc_xfc_se_c2_r2_nf(pTarget, pFunc, seProps, x1, x2); break;
+			case 3: nmc_xfc_se_c2_nr_nf(pTarget, pFunc, seProps, x1, x2); break;
+			default: rc = NMC_INV_RTN_ARG_CNT; break;
+			}
+		} else {
+			switch (cmdProps.getRtnArgCnt()) {
+			case 0: nmc_xfc_se_c2_r0(pTarget, pFunc, seProps, x1, x2); break;
+			case 1: nmc_xfc_se_c2_r1(pTarget, pFunc, seProps, x1, x2); break;
+			case 2: nmc_xfc_se_c2_r2(pTarget, pFunc, seProps, x1, x2); break;
+			case 3: nmc_xfc_se_c2_nr(pTarget, pFunc, seProps, x1, x2); break;
+			default: rc = NMC_INV_RTN_ARG_CNT; break;
+			}
+		}
+		break;
+	default:
+		return NMC_INV_APP_ENG;
+	}
+	return rc;
+}
+
+//
+// nmcFiberCreate (three or four arguments)
+//     Issues a fiber-create command, picking the nmc_xfc_*_c4_* intrinsic
+//     from the engine, busy-fail, no-flush, atomic and return-count
+//     settings held in cmdProps.
+//
+// arguments
+//     cmdProps    command properties; read only. The FP flags for the
+//                 arguments are recorded on a private copy, so the caller's
+//                 object is left untouched (matching the one- and
+//                 two-argument overloads, which take cmdProps by value).
+//     pTarget     forwarded unchanged to the intrinsic
+//     pFunc       forwarded unchanged to the intrinsic
+//     arg1..arg4  fiber arguments, converted to uint64_t with extendArg();
+//                 arg4 defaults to 0
+//
+// return
+//     The intrinsic's result on the TE busy-fail path, NMC_INV_RTN_ARG_CNT
+//     when the return-argument count is not 0..3, NMC_INV_APP_ENG for an
+//     unrecognized engine, NMC_SUCCESS otherwise.
+//
+template<typename T1 = uint64_t, typename T2 = uint64_t,
+	typename T3 = uint64_t, typename T4 = uint64_t>
+	int64_t nmcFiberCreate(NmcTeCmdProps &cmdProps, void *pTarget, void *pFunc,
+		T1 arg1, T2 arg2, T3 arg3, T4 arg4 = 0)
+{
+	uint64_t a1 = extendArg(arg1);
+	uint64_t a2 = extendArg(arg2);
+	uint64_t a3 = extendArg(arg3);
+	uint64_t a4 = extendArg(arg4);
+
+	// Work on a copy: setting the FP flags on the caller's object would
+	// leak them into the SE command word of a later call that reuses it.
+	NmcTeCmdProps props = cmdProps;
+
+	// The SE command word is read before the FP flags are recorded, the TE
+	// command word after.
+	uint64_t seCmdProps = props.getThreadCreateCmdProps();
+
+	props.setArg1Fp(std::is_floating_point<T1>::value);
+	props.setArg2Fp(std::is_floating_point<T2>::value);
+	props.setArg3Fp(std::is_floating_point<T3>::value);
+	props.setArg4Fp(std::is_floating_point<T4>::value);
+	uint64_t teCmdProps = props.getThreadCreateCmdProps();
+
+	int64_t status = NMC_SUCCESS;
+	switch (props.getAppEngine()) {
+	case NmcAppEngTe:
+		if (props.isBusyFail()) {
+			// Busy-fail variants report a status; keep it.
+			if (props.isNoFlush()) {
+				switch (props.getRtnArgCnt()) {
+				case 0: status = nmc_xfc_te_bf_c4_r0_nf(pTarget, pFunc, teCmdProps, a1, a2, a3, a4); break;
+				case 1: status = nmc_xfc_te_bf_c4_r1_nf(pTarget, pFunc, teCmdProps, a1, a2, a3, a4); break;
+				case 2: status = nmc_xfc_te_bf_c4_r2_nf(pTarget, pFunc, teCmdProps, a1, a2, a3, a4); break;
+				case 3: status = nmc_xfc_te_bf_c4_nr_nf(pTarget, pFunc, teCmdProps, a1, a2, a3, a4); break;
+				default: status = NMC_INV_RTN_ARG_CNT; break;
+				}
+			} else {
+				switch (props.getRtnArgCnt()) {
+				case 0: status = nmc_xfc_te_bf_c4_r0(pTarget, pFunc, teCmdProps, a1, a2, a3, a4); break;
+				case 1: status = nmc_xfc_te_bf_c4_r1(pTarget, pFunc, teCmdProps, a1, a2, a3, a4); break;
+				case 2: status = nmc_xfc_te_bf_c4_r2(pTarget, pFunc, teCmdProps, a1, a2, a3, a4); break;
+				case 3: status = nmc_xfc_te_bf_c4_nr(pTarget, pFunc, teCmdProps, a1, a2, a3, a4); break;
+				default: status = NMC_INV_RTN_ARG_CNT; break;
+				}
+			}
+		} else {
+			if (props.isNoFlush()) {
+				switch (props.getRtnArgCnt()) {
+				case 0:
+					if (props.isCmdAtomic())
+						nmc_xfc_te_c4_r0_nf_at(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					else
+						nmc_xfc_te_c4_r0_nf(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					break;
+				case 1:
+					if (props.isCmdAtomic())
+						nmc_xfc_te_c4_r1_nf_at(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					else
+						nmc_xfc_te_c4_r1_nf(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					break;
+				case 2:
+					if (props.isCmdAtomic())
+						nmc_xfc_te_c4_r2_nf_at(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					else
+						nmc_xfc_te_c4_r2_nf(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					break;
+				case 3:
+					if (props.isCmdAtomic())
+						nmc_xfc_te_c4_nr_nf_at(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					else
+						nmc_xfc_te_c4_nr_nf(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					break;
+				default:
+					status = NMC_INV_RTN_ARG_CNT;
+					break;
+				}
+			} else {
+				switch (props.getRtnArgCnt()) {
+				case 0:
+					if (props.isCmdAtomic())
+						nmc_xfc_te_c4_r0_at(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					else
+						nmc_xfc_te_c4_r0(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					break;
+				case 1:
+					if (props.isCmdAtomic())
+						nmc_xfc_te_c4_r1_at(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					else
+						nmc_xfc_te_c4_r1(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					break;
+				case 2:
+					if (props.isCmdAtomic())
+						nmc_xfc_te_c4_r2_at(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					else
+						nmc_xfc_te_c4_r2(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					break;
+				case 3:
+					if (props.isCmdAtomic())
+						nmc_xfc_te_c4_nr_at(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					else
+						nmc_xfc_te_c4_nr(pTarget, pFunc, teCmdProps, a1, a2, a3, a4);
+					break;
+				default:
+					status = NMC_INV_RTN_ARG_CNT;
+					break;
+				}
+			}
+		}
+		break;
+	case NmcAppEngSe:
+		if (props.isNoFlush()) {
+			switch (props.getRtnArgCnt()) {
+			case 0:
+				nmc_xfc_se_c4_r0_nf(pTarget, pFunc, seCmdProps, a1, a2, a3, a4);
+				break;
+			case 1:
+				nmc_xfc_se_c4_r1_nf(pTarget, pFunc, seCmdProps, a1, a2, a3, a4);
+				break;
+			case 2:
+				nmc_xfc_se_c4_r2_nf(pTarget, pFunc, seCmdProps, a1, a2, a3, a4);
+				break;
+			case 3:
+				nmc_xfc_se_c4_nr_nf(pTarget, pFunc, seCmdProps, a1, a2, a3, a4);
+				break;
+			default:
+				status = NMC_INV_RTN_ARG_CNT;
+				break;
+			}
+		} else {
+			switch (props.getRtnArgCnt()) {
+			case 0:
+				nmc_xfc_se_c4_r0(pTarget, pFunc, seCmdProps, a1, a2, a3, a4);
+				break;
+			case 1:
+				nmc_xfc_se_c4_r1(pTarget, pFunc, seCmdProps, a1, a2, a3, a4);
+				break;
+			case 2:
+				nmc_xfc_se_c4_r2(pTarget, pFunc, seCmdProps, a1, a2, a3, a4);
+				break;
+			case 3:
+				nmc_xfc_se_c4_nr(pTarget, pFunc, seCmdProps, a1, a2, a3, a4);
+				break;
+			default:
+				status = NMC_INV_RTN_ARG_CNT;
+				break;
+			}
+		}
+		break;
+	default:
+		return NMC_INV_APP_ENG;
+	}
+	return status;
+}
+
+//
+// nmcFiberJoin (no return values)
+//     Joins a fiber using the nmc_xfj_r0* intrinsic selected by the
+//     no-invalidate and non-blocking settings in cmdProps.
+//
+// arguments
+//     cmdProps  command properties consulted for the variant selection
+//     _cmdId    receives the command id written by the intrinsic,
+//               narrowed to uint16_t
+//
+// return
+//     NMC_RETRY when the id written by the intrinsic is negative,
+//     NMC_SUCCESS otherwise.
+//
+inline int nmcFiberJoin(NmcTeCmdProps const &cmdProps, uint16_t *_cmdId)
+{
+	int64_t joinedId;
+	const bool noInval = cmdProps.isNoInvalidate();
+	const bool nonBlock = cmdProps.isNonBlocking();
+
+	if (noInval && nonBlock)
+		nmc_xfj_r0_nb_ni(&joinedId);
+	else if (noInval)
+		nmc_xfj_r0_ni(&joinedId);
+	else if (nonBlock)
+		nmc_xfj_r0_nb(&joinedId);
+	else
+		nmc_xfj_r0(&joinedId);
+
+	// The id is stored even when it is negative (the retry case).
+	*_cmdId = joinedId;
+	if (joinedId < 0)
+		return NMC_RETRY;
+	return NMC_SUCCESS;
+}
+
+//
+// nmcFiberJoin (one return value)
+//     Joins a fiber using the nmc_xfj_r1* intrinsic selected by the
+//     no-invalidate and non-blocking settings in cmdProps.
+//
+// arguments
+//     cmdProps  command properties consulted for the variant selection
+//     _cmdId    receives the command id written by the intrinsic,
+//               narrowed to uint16_t
+//     ret1      receives the first returned value via moveArg()
+//
+// return
+//     NMC_RETRY when the id written by the intrinsic is negative,
+//     NMC_SUCCESS otherwise.
+//
+template<typename T1 = uint64_t>
+inline int nmcFiberJoin(NmcTeCmdProps const &cmdProps, uint16_t *_cmdId, T1 *ret1)
+{
+	int64_t joinedId;
+	uint64_t raw1;
+	const bool noInval = cmdProps.isNoInvalidate();
+	const bool nonBlock = cmdProps.isNonBlocking();
+
+	if (noInval && nonBlock)
+		nmc_xfj_r1_nb_ni(&joinedId, &raw1);
+	else if (noInval)
+		nmc_xfj_r1_ni(&joinedId, &raw1);
+	else if (nonBlock)
+		nmc_xfj_r1_nb(&joinedId, &raw1);
+	else
+		nmc_xfj_r1(&joinedId, &raw1);
+
+	// Outputs are stored even when the id is negative (the retry case).
+	*_cmdId = joinedId;
+	moveArg(ret1, raw1);
+	if (joinedId < 0)
+		return NMC_RETRY;
+	return NMC_SUCCESS;
+}
+
+//
+// nmcFiberJoin (two return values)
+//     Joins a fiber using the nmc_xfj_r2* intrinsic selected by the
+//     no-invalidate and non-blocking settings in cmdProps.
+//
+// arguments
+//     cmdProps    command properties consulted for the variant selection
+//     _cmdId      receives the command id written by the intrinsic,
+//                 narrowed to uint16_t
+//     ret1, ret2  receive the returned values via moveArg()
+//
+// return
+//     NMC_RETRY when the id written by the intrinsic is negative,
+//     NMC_SUCCESS otherwise.
+//
+template<typename T1 = uint64_t, typename T2 = uint64_t>
+inline int nmcFiberJoin(NmcTeCmdProps const &cmdProps, uint16_t *_cmdId, T1 *ret1, T2 *ret2)
+{
+	int64_t joinedId;
+	uint64_t raw1;
+	uint64_t raw2;
+	const bool noInval = cmdProps.isNoInvalidate();
+	const bool nonBlock = cmdProps.isNonBlocking();
+
+	if (noInval && nonBlock)
+		nmc_xfj_r2_nb_ni(&joinedId, &raw1, &raw2);
+	else if (noInval)
+		nmc_xfj_r2_ni(&joinedId, &raw1, &raw2);
+	else if (nonBlock)
+		nmc_xfj_r2_nb(&joinedId, &raw1, &raw2);
+	else
+		nmc_xfj_r2(&joinedId, &raw1, &raw2);
+
+	// Outputs are stored even when the id is negative (the retry case).
+	*_cmdId = joinedId;
+	moveArg(ret1, raw1);
+	moveArg(ret2, raw2);
+	if (joinedId < 0)
+		return NMC_RETRY;
+	return NMC_SUCCESS;
+}
+
+//
+// nmcFiberJoinAll
+//     Invokes the nmc_xfja* intrinsic selected by the no-invalidate and
+//     non-blocking settings in cmdProps.
+//
+// arguments
+//     cmdProps  command properties consulted for the variant selection
+//
+// return
+//     NMC_SUCCESS. The intrinsics are called for effect only and no status
+//     is collected from them. (The previous code returned a local that was
+//     never assigned, which is undefined behavior.)
+//     NOTE(review): if the nmc_xfja* intrinsics do report a status, it
+//     should be propagated here -- confirm against nmc_te_intrin.h.
+//
+inline int64_t nmcFiberJoinAll(NmcTeCmdProps const &cmdProps)
+{
+	if (cmdProps.isNoInvalidate()) {
+		if (cmdProps.isNonBlocking())
+			nmc_xfja_ni_nb();
+		else
+			nmc_xfja_ni();
+	} else {
+		if (cmdProps.isNonBlocking())
+			nmc_xfja_nb();
+		else
+			nmc_xfja();
+	}
+	return NMC_SUCCESS;
+}
+
+//
+// nmcReturn (no return values)
+//     Thin wrapper that invokes the nmc_xtr_r0 intrinsic.
+//
+inline void nmcReturn()
+{
+	nmc_xtr_r0();
+}
+
+//
+// nmcReturn (one return value)
+//     Converts arg1 to uint64_t with extendArg() and passes it to the
+//     nmc_xtr_r1 intrinsic.
+//
+template<typename T1 = uint64_t>
+inline void nmcReturn(T1 arg1)
+{
+	uint64_t rv1 = extendArg(arg1);
+	nmc_xtr_r1(rv1);
+}
+
+//
+// nmcReturn (two return values)
+//     Converts arg1 and arg2 to uint64_t with extendArg() and passes them
+//     to the nmc_xtr_r2 intrinsic.
+//
+template<typename T1 = uint64_t, typename T2 = uint64_t>
+inline void nmcReturn(T1 arg1, T2 arg2)
+{
+	uint64_t rv1 = extendArg(arg1);
+	uint64_t rv2 = extendArg(arg2);
+	nmc_xtr_r2(rv1, rv2);
+}
+
+//
+// nmcGetAttachedTimeNs
+//     Returns the value of nmc_csr_rdtime() as a double.
+//     NOTE(review): the name implies nanoseconds -- confirm the unit
+//     against nmc_csr_rdtime.
+//
+inline double nmcGetAttachedTimeNs()
+{
+	const double timeValue = nmc_csr_rdtime();
+	return timeValue;
+}
+
+//
+// nmcPrintf
+//     NMC TE thread-safe version of printf.
+//
+// arguments
+//     fmt  format string used to print the additional arguments
+//     ...  the additional arguments.
+//
+// return
+//     Upon successful return, this function returns the number of
+//     characters printed.
+//
+// nmcPrintfTime
+//     Alias for nmc_te_printf_time.
+//     NOTE(review): presumably nmcPrintf with a time stamp added --
+//     confirm against nmc_te_printf_time.
+//
+#define nmcPrintf nmc_te_printf
+#define nmcPrintfTime nmc_te_printf_time
+
+//
+// Event routines
+//
+// Receive modes accepted by nmcSetEventReceiveMode().
+enum NmcEventReceiveMode {
+	UndefinedMode,	// 0: not a valid mode for nmcSetEventReceiveMode()
+	SimpleMode,	// 1: selects nmc_xem_sm
+	CollectiveMode,	// 2: selects nmc_xem_cm
+	BroadcastMode	// 3: selects nmc_xem_bm
+};
+//
+// nmcSetEventReceiveMode
+//     Configures the receive mode of an event through the matching
+//     nmc_xem_* intrinsic.
+//
+// arguments
+//     mode                  SimpleMode, CollectiveMode or BroadcastMode
+//     eventNum              event number passed to the intrinsic
+//     eventCountOrRecvChan  second operand for the collective and broadcast
+//                           modes; not used in simple mode
+//
+// Any other mode trips an assert (a no-op when asserts are compiled out).
+//
+inline void nmcSetEventReceiveMode(NmcEventReceiveMode mode, uint64_t eventNum, uint64_t eventCountOrRecvChan = 0) {
+	if (mode == SimpleMode) {
+		nmc_xem_sm(eventNum);
+		return;
+	}
+	if (mode == CollectiveMode) {
+		nmc_xem_cm(eventNum, eventCountOrRecvChan);
+		return;
+	}
+	if (mode == BroadcastMode) {
+		nmc_xem_bm(eventNum, eventCountOrRecvChan);
+		return;
+	}
+	assert(0 && "Invalid event receive mode");
+}
+
+// Returns the result of the nmc_xed intrinsic for eventNum.
+inline uint64_t nmcEventGetDest(uint64_t eventNum) {
+	const uint64_t dest = nmc_xed(eventNum);
+	return dest;
+}
+
+//
+// nmcEventSend
+//     Sends an event to eventDest through the nmc_xes* intrinsic selected
+//     by the no-flush and event-data settings in cmdProps. When event data
+//     is present, cmdProps.getEventData() is passed along.
+//
+inline void nmcEventSend(NmcTeCmdProps const &cmdProps, uint64_t eventDest) {
+	const bool noFlush = cmdProps.isNoFlush();
+	const bool withData = cmdProps.hasEventData();
+
+	if (withData) {
+		if (noFlush)
+			nmc_xes_d_nf(eventDest, cmdProps.getEventData());
+		else
+			nmc_xes_d(eventDest, cmdProps.getEventData());
+	} else {
+		if (noFlush)
+			nmc_xes_nf(eventDest);
+		else
+			nmc_xes(eventDest);
+	}
+}
+
+//
+// nmcEventBroadcast
+//     Broadcasts event eventNum on eventChan through the nmc_xeb* intrinsic
+//     selected by the no-flush and event-data settings in cmdProps. When
+//     event data is present, cmdProps.getEventData() is passed along.
+//
+inline void nmcEventBroadcast(NmcTeCmdProps const &cmdProps, uint64_t eventNum, uint64_t eventChan) {
+	const bool noFlush = cmdProps.isNoFlush();
+	const bool withData = cmdProps.hasEventData();
+
+	if (withData) {
+		if (noFlush)
+			nmc_xeb_d_nf(eventNum, eventChan, cmdProps.getEventData());
+		else
+			nmc_xeb_d(eventNum, eventChan, cmdProps.getEventData());
+	} else {
+		if (noFlush)
+			nmc_xeb_nf(eventNum, eventChan);
+		else
+			nmc_xeb(eventNum, eventChan);
+	}
+}
+
+//
+// nmcEventListen
+//     Returns the result of nmc_xel_nb(eventMask) when cmdProps is
+//     non-blocking, otherwise the result of nmc_xel(eventMask).
+//
+inline uint64_t nmcEventListen(NmcTeCmdProps const &cmdProps, uint64_t eventMask) {
+	if (!cmdProps.isNonBlocking())
+		return nmc_xel(eventMask);
+	return nmc_xel_nb(eventMask);
+}
+
+//
+// nmcEventReceive
+//     Receives event eventNum through the nmc_xer* intrinsic selected by
+//     the no-invalidate and event-data settings in cmdProps.
+//
+// return
+//     The intrinsic's result when cmdProps has event data, 0 otherwise.
+//
+template<typename T = uint64_t>
+inline uint64_t nmcEventReceive(NmcTeCmdProps const &cmdProps, T eventNum) {
+	const bool noInval = cmdProps.isNoInvalidate();
+
+	if (cmdProps.hasEventData()) {
+		if (noInval)
+			return nmc_xer_d_ni(eventNum);
+		return nmc_xer_d(eventNum);
+	}
+
+	// No data requested: receive for effect only.
+	if (noInval)
+		nmc_xer_ni(eventNum);
+	else
+		nmc_xer(eventNum);
+	return 0;
+}
+
+// Atomics
+
+//
+// nmcAtomicAdd
+//     Atomic add on *pAddr, dispatched on sizeof(T) (4 or 8), on whether T
+//     is a floating-point type, and on the AMO mode in cmdProps.
+//
+// return
+//     Non-blocking: always 0 (the *_nb intrinsics are called for effect).
+//     Blocking: the intrinsic's result converted to T.
+//     Any other sizeof(T) trips an assert and yields 0.
+//
+template<typename T = uint32_t>
+inline T nmcAtomicAdd(NmcTeCmdProps const &cmdProps, T *pAddr, T value) {
+	const bool isFp = std::is_floating_point<T>::value;
+
+	if (cmdProps.isNonBlocking()) {
+		if (sizeof(T) == 4 && isFp) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amofadd_s_aq_nb((volatile float *)pAddr, value); break;
+			case NmcAmoRlMode: nmc_amofadd_s_rl_nb((volatile float *)pAddr, value); break;
+			case NmcAmoAqRlMode: nmc_amofadd_s_aqrl_nb((volatile float *)pAddr, value); break;
+			default: nmc_amofadd_s_nb((volatile float *)pAddr, value); break;
+			}
+		} else if (sizeof(T) == 4) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amoadd_w_aq_nb((volatile int32_t *)pAddr, value); break;
+			case NmcAmoRlMode: nmc_amoadd_w_rl_nb((volatile int32_t *)pAddr, value); break;
+			case NmcAmoAqRlMode: nmc_amoadd_w_aqrl_nb((volatile int32_t *)pAddr, value); break;
+			default: nmc_amoadd_w_nb((volatile int32_t *)pAddr, value); break;
+			}
+		} else if (sizeof(T) == 8 && isFp) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amofadd_d_aq_nb((volatile double *)pAddr, value); break;
+			case NmcAmoRlMode: nmc_amofadd_d_rl_nb((volatile double *)pAddr, value); break;
+			case NmcAmoAqRlMode: nmc_amofadd_d_aqrl_nb((volatile double *)pAddr, value); break;
+			default: nmc_amofadd_d_nb((volatile double *)pAddr, value); break;
+			}
+		} else if (sizeof(T) == 8) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amoadd_d_aq_nb((volatile int64_t *)pAddr, value); break;
+			case NmcAmoRlMode: nmc_amoadd_d_rl_nb((volatile int64_t *)pAddr, value); break;
+			case NmcAmoAqRlMode: nmc_amoadd_d_aqrl_nb((volatile int64_t *)pAddr, value); break;
+			default: nmc_amoadd_d_nb((volatile int64_t *)pAddr, value); break;
+			}
+		} else {
+			assert(0 && "Invalid type");
+		}
+		return 0;
+	}
+
+	// Blocking forms: every valid size/type branch returns from its switch.
+	if (sizeof(T) == 4 && isFp) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amofadd_s_aq((volatile float *)pAddr, value);
+		case NmcAmoRlMode: return nmc_amofadd_s_rl((volatile float *)pAddr, value);
+		case NmcAmoAqRlMode: return nmc_amofadd_s_aqrl((volatile float *)pAddr, value);
+		default: return nmc_amofadd_s((volatile float *)pAddr, value);
+		}
+	} else if (sizeof(T) == 4) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amoadd_w_aq((volatile int32_t *)pAddr, value);
+		case NmcAmoRlMode: return nmc_amoadd_w_rl((volatile int32_t *)pAddr, value);
+		case NmcAmoAqRlMode: return nmc_amoadd_w_aqrl((volatile int32_t *)pAddr, value);
+		default: return nmc_amoadd_w((volatile int32_t *)pAddr, value);
+		}
+	} else if (sizeof(T) == 8 && isFp) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amofadd_d_aq((volatile double *)pAddr, value);
+		case NmcAmoRlMode: return nmc_amofadd_d_rl((volatile double *)pAddr, value);
+		case NmcAmoAqRlMode: return nmc_amofadd_d_aqrl((volatile double *)pAddr, value);
+		default: return nmc_amofadd_d((volatile double *)pAddr, value);
+		}
+	} else if (sizeof(T) == 8) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amoadd_d_aq((volatile int64_t *)pAddr, value);
+		case NmcAmoRlMode: return nmc_amoadd_d_rl((volatile int64_t *)pAddr, value);
+		case NmcAmoAqRlMode: return nmc_amoadd_d_aqrl((volatile int64_t *)pAddr, value);
+		default: return nmc_amoadd_d((volatile int64_t *)pAddr, value);
+		}
+	}
+
+	assert(0 && "Invalid type");
+	return 0;
+}
+
+//
+// nmcAtomicMin
+//     Atomic minimum on *pAddr, dispatched on sizeof(T) (4 or 8), on
+//     whether T is floating-point, unsigned or signed, and on the AMO mode
+//     in cmdProps.
+//
+// return
+//     Non-blocking: always 0 (the *_nb intrinsics are called for effect).
+//     Blocking: the intrinsic's result converted to T.
+//     Any other sizeof(T) trips an assert and yields 0.
+//
+template<typename T = uint32_t>
+inline T nmcAtomicMin(NmcTeCmdProps const &cmdProps, T *pAddr, T value) {
+	const bool isFp = std::is_floating_point<T>::value;
+	const bool isUns = std::is_unsigned<T>::value;
+
+	if (cmdProps.isNonBlocking()) {
+		if (sizeof(T) == 4) {
+			if (isFp) {
+				switch (cmdProps.getAmoMode()) {
+				case NmcAmoAqMode: nmc_amofmin_s_aq_nb((volatile float *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amofmin_s_rl_nb((volatile float *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amofmin_s_aqrl_nb((volatile float *)pAddr, value); break;
+				default: nmc_amofmin_s_nb((volatile float *)pAddr, value); break;
+				}
+			} else if (isUns) {
+				switch (cmdProps.getAmoMode()) {
+				case NmcAmoAqMode: nmc_amominu_w_aq_nb((volatile int32_t *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amominu_w_rl_nb((volatile int32_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amominu_w_aqrl_nb((volatile int32_t *)pAddr, value); break;
+				default: nmc_amominu_w_nb((volatile int32_t *)pAddr, value); break;
+				}
+			} else {
+				switch (cmdProps.getAmoMode()) {
+				case NmcAmoAqMode: nmc_amomin_w_aq_nb((volatile int32_t *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amomin_w_rl_nb((volatile int32_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amomin_w_aqrl_nb((volatile int32_t *)pAddr, value); break;
+				default: nmc_amomin_w_nb((volatile int32_t *)pAddr, value); break;
+				}
+			}
+		} else if (sizeof(T) == 8) {
+			if (isFp) {
+				switch (cmdProps.getAmoMode()) {
+				case NmcAmoAqMode: nmc_amofmin_d_aq_nb((volatile double *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amofmin_d_rl_nb((volatile double *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amofmin_d_aqrl_nb((volatile double *)pAddr, value); break;
+				default: nmc_amofmin_d_nb((volatile double *)pAddr, value); break;
+				}
+			} else if (isUns) {
+				switch (cmdProps.getAmoMode()) {
+				case NmcAmoAqMode: nmc_amominu_d_aq_nb((volatile int64_t *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amominu_d_rl_nb((volatile int64_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amominu_d_aqrl_nb((volatile int64_t *)pAddr, value); break;
+				default: nmc_amominu_d_nb((volatile int64_t *)pAddr, value); break;
+				}
+			} else {
+				switch (cmdProps.getAmoMode()) {
+				case NmcAmoAqMode: nmc_amomin_d_aq_nb((volatile int64_t *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amomin_d_rl_nb((volatile int64_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amomin_d_aqrl_nb((volatile int64_t *)pAddr, value); break;
+				default: nmc_amomin_d_nb((volatile int64_t *)pAddr, value); break;
+				}
+			}
+		} else {
+			assert(0 && "Invalid type");
+		}
+		return 0;
+	}
+
+	// Blocking forms: every valid size/type branch returns from its switch.
+	if (sizeof(T) == 4) {
+		if (isFp) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: return nmc_amofmin_s_aq((volatile float *)pAddr, value);
+			case NmcAmoRlMode: return nmc_amofmin_s_rl((volatile float *)pAddr, value);
+			case NmcAmoAqRlMode: return nmc_amofmin_s_aqrl((volatile float *)pAddr, value);
+			default: return nmc_amofmin_s((volatile float *)pAddr, value);
+			}
+		} else if (isUns) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: return nmc_amominu_w_aq((volatile int32_t *)pAddr, value);
+			case NmcAmoRlMode: return nmc_amominu_w_rl((volatile int32_t *)pAddr, value);
+			case NmcAmoAqRlMode: return nmc_amominu_w_aqrl((volatile int32_t *)pAddr, value);
+			default: return nmc_amominu_w((volatile int32_t *)pAddr, value);
+			}
+		} else {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: return nmc_amomin_w_aq((volatile int32_t *)pAddr, value);
+			case NmcAmoRlMode: return nmc_amomin_w_rl((volatile int32_t *)pAddr, value);
+			case NmcAmoAqRlMode: return nmc_amomin_w_aqrl((volatile int32_t *)pAddr, value);
+			default: return nmc_amomin_w((volatile int32_t *)pAddr, value);
+			}
+		}
+	} else if (sizeof(T) == 8) {
+		if (isFp) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: return nmc_amofmin_d_aq((volatile double *)pAddr, value);
+			case NmcAmoRlMode: return nmc_amofmin_d_rl((volatile double *)pAddr, value);
+			case NmcAmoAqRlMode: return nmc_amofmin_d_aqrl((volatile double *)pAddr, value);
+			default: return nmc_amofmin_d((volatile double *)pAddr, value);
+			}
+		} else if (isUns) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: return nmc_amominu_d_aq((volatile int64_t *)pAddr, value);
+			case NmcAmoRlMode: return nmc_amominu_d_rl((volatile int64_t *)pAddr, value);
+			case NmcAmoAqRlMode: return nmc_amominu_d_aqrl((volatile int64_t *)pAddr, value);
+			default: return nmc_amominu_d((volatile int64_t *)pAddr, value);
+			}
+		} else {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: return nmc_amomin_d_aq((volatile int64_t *)pAddr, value);
+			case NmcAmoRlMode: return nmc_amomin_d_rl((volatile int64_t *)pAddr, value);
+			case NmcAmoAqRlMode: return nmc_amomin_d_aqrl((volatile int64_t *)pAddr, value);
+			default: return nmc_amomin_d((volatile int64_t *)pAddr, value);
+			}
+		}
+	}
+
+	assert(0 && "Invalid type");
+	return 0;
+}
+
+template<typename T = uint32_t>
+inline T nmcAtomicMax(NmcTeCmdProps const &cmdProps, T *pAddr, T value) {
+	if (cmdProps.isNonBlocking()) {
+		if (std::is_floating_point<T>::value) {
+			switch (sizeof(T)) {
+			case 4:
+				switch (cmdProps.getAmoMode()) {
+				default: nmc_amofmax_s_nb((volatile float *)pAddr, value); break;
+				case NmcAmoAqMode: nmc_amofmax_s_aq_nb((volatile float *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amofmax_s_rl_nb((volatile float *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amofmax_s_aqrl_nb((volatile float *)pAddr, value); break;
+				}
+				break;
+			case 8:
+				switch (cmdProps.getAmoMode()) {
+				default: nmc_amofmax_d_nb((volatile double *)pAddr, value); break;
+				case NmcAmoAqMode: nmc_amofmax_d_aq_nb((volatile double *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amofmax_d_rl_nb((volatile double *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amofmax_d_aqrl_nb((volatile double *)pAddr, value); break;
+				}
+				break;
+			default:
+				assert(0 && "Invalid type");
+			}
+		} else if (std::is_unsigned<T>::value) {
+			switch (sizeof(T)) {
+			case 4:
+				switch (cmdProps.getAmoMode()) {
+				default: nmc_amomaxu_w_nb((volatile int32_t *)pAddr, value); break;
+				case NmcAmoAqMode: nmc_amomaxu_w_aq_nb((volatile int32_t *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amomaxu_w_rl_nb((volatile int32_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amomaxu_w_aqrl_nb((volatile int32_t *)pAddr, value); break;
+				}
+				break;
+			case 8:
+				switch (cmdProps.getAmoMode()) {
+				default: nmc_amomaxu_d_nb((volatile int64_t *)pAddr, value); break;
+				case NmcAmoAqMode: nmc_amomaxu_d_aq_nb((volatile int64_t *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amomaxu_d_rl_nb((volatile int64_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amomaxu_d_aqrl_nb((volatile int64_t *)pAddr, value); break;
+				}
+				break;
+			default:
+				assert(0 && "Invalid type");
+			}
+		} else {
+			switch (sizeof(T)) {
+			case 4:
+				switch (cmdProps.getAmoMode()) {
+				default: nmc_amomax_w_nb((volatile int32_t *)pAddr, value); break;
+				case NmcAmoAqMode: nmc_amomax_w_aq_nb((volatile int32_t *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amomax_w_rl_nb((volatile int32_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amomax_w_aqrl_nb((volatile int32_t *)pAddr, value); break;
+				}
+				break;
+			case 8:
+				switch (cmdProps.getAmoMode()) {
+				default: nmc_amomax_d_nb((volatile int64_t *)pAddr, value); break;
+				case NmcAmoAqMode: nmc_amomax_d_aq_nb((volatile int64_t *)pAddr, value); break;
+				case NmcAmoRlMode: nmc_amomax_d_rl_nb((volatile int64_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: nmc_amomax_d_aqrl_nb((volatile int64_t *)pAddr, value); break;
+				}
+				break;
+			default:
+				assert(0 && "Invalid type");
+			}
+		}
+		return 0;
+	} else {
+		if (std::is_floating_point<T>::value) {
+			switch (sizeof(T)) {
+			case 4:
+				switch (cmdProps.getAmoMode()) {
+				default: return nmc_amofmax_s((volatile float *)pAddr, value); break;
+				case NmcAmoAqMode: return nmc_amofmax_s_aq((volatile float *)pAddr, value); break;
+				case NmcAmoRlMode: return nmc_amofmax_s_rl((volatile float *)pAddr, value); break;
+				case NmcAmoAqRlMode: return nmc_amofmax_s_aqrl((volatile float *)pAddr, value); break;
+				}
+				break;
+			case 8:
+				switch (cmdProps.getAmoMode()) {
+				default: return nmc_amofmax_d((volatile double *)pAddr, value); break;
+				case NmcAmoAqMode: return nmc_amofmax_d_aq((volatile double *)pAddr, value); break;
+				case NmcAmoRlMode: return nmc_amofmax_d_rl((volatile double *)pAddr, value); break;
+				case NmcAmoAqRlMode: return nmc_amofmax_d_aqrl((volatile double *)pAddr, value); break;
+				}
+				break;
+			default:
+				assert(0 && "Invalid type");
+				return 0;
+			}
+		} else if (std::is_unsigned<T>::value) {
+			switch (sizeof(T)) {
+			case 4:
+				switch (cmdProps.getAmoMode()) {
+				default: return nmc_amomaxu_w((volatile int32_t *)pAddr, value); break;
+				case NmcAmoAqMode: return nmc_amomaxu_w_aq((volatile int32_t *)pAddr, value); break;
+				case NmcAmoRlMode: return nmc_amomaxu_w_rl((volatile int32_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: return nmc_amomaxu_w_aqrl((volatile int32_t *)pAddr, value); break;
+				}
+				break;
+			case 8:
+				switch (cmdProps.getAmoMode()) {
+				default: return nmc_amomaxu_d((volatile int64_t *)pAddr, value); break;
+				case NmcAmoAqMode: return nmc_amomaxu_d_aq((volatile int64_t *)pAddr, value); break;
+				case NmcAmoRlMode: return nmc_amomaxu_d_rl((volatile int64_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: return nmc_amomaxu_d_aqrl((volatile int64_t *)pAddr, value); break;
+				}
+				break;
+			default:
+				assert(0 && "Invalid type");
+				return 0;
+			}
+		} else {
+			switch (sizeof(T)) {
+			case 4:
+				switch (cmdProps.getAmoMode()) {
+				default: return nmc_amomax_w((volatile int32_t *)pAddr, value); break;
+				case NmcAmoAqMode: return nmc_amomax_w_aq((volatile int32_t *)pAddr, value); break;
+				case NmcAmoRlMode: return nmc_amomax_w_rl((volatile int32_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: return nmc_amomax_w_aqrl((volatile int32_t *)pAddr, value); break;
+				}
+				break;
+			case 8:
+				switch (cmdProps.getAmoMode()) {
+				default: return nmc_amomax_d((volatile int64_t *)pAddr, value); break;
+				case NmcAmoAqMode: return nmc_amomax_d_aq((volatile int64_t *)pAddr, value); break;
+				case NmcAmoRlMode: return nmc_amomax_d_rl((volatile int64_t *)pAddr, value); break;
+				case NmcAmoAqRlMode: return nmc_amomax_d_aqrl((volatile int64_t *)pAddr, value); break;
+				}
+				break;
+			default:
+				assert(0 && "Invalid type");
+				return 0;
+			}
+		}
+	}
+}
+
+template<typename T = uint32_t>
+inline T nmcAtomicXor(NmcTeCmdProps const &cmdProps, T *pAddr, T value) {
+	// Issues the atomic-XOR intrinsics (nmc_amoxor_*) on *pAddr with `value`.
+	// sizeof(T) selects the 32-bit (_w) or 64-bit (_d) form; getAmoMode() selects the
+	// plain, _aq, _rl or _aqrl variant. Any other size asserts and the result is 0.
+	// Non-blocking requests use the _nb intrinsics and always return 0.
+	volatile int32_t *const pWord = (volatile int32_t *)pAddr;
+	volatile int64_t *const pDword = (volatile int64_t *)pAddr;
+	if (cmdProps.isNonBlocking()) {
+		if (sizeof(T) == 4) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amoxor_w_aq_nb(pWord, value); break;
+			case NmcAmoRlMode: nmc_amoxor_w_rl_nb(pWord, value); break;
+			case NmcAmoAqRlMode: nmc_amoxor_w_aqrl_nb(pWord, value); break;
+			default: nmc_amoxor_w_nb(pWord, value); break;
+			}
+		} else if (sizeof(T) == 8) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amoxor_d_aq_nb(pDword, value); break;
+			case NmcAmoRlMode: nmc_amoxor_d_rl_nb(pDword, value); break;
+			case NmcAmoAqRlMode: nmc_amoxor_d_aqrl_nb(pDword, value); break;
+			default: nmc_amoxor_d_nb(pDword, value); break;
+			}
+		} else {
+			assert(0 && "Invalid type");
+		}
+		return 0;
+	}
+	// Blocking request: hand back whatever the selected intrinsic returns.
+	if (sizeof(T) == 4) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amoxor_w_aq(pWord, value);
+		case NmcAmoRlMode: return nmc_amoxor_w_rl(pWord, value);
+		case NmcAmoAqRlMode: return nmc_amoxor_w_aqrl(pWord, value);
+		default: return nmc_amoxor_w(pWord, value);
+		}
+	}
+	if (sizeof(T) == 8) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amoxor_d_aq(pDword, value);
+		case NmcAmoRlMode: return nmc_amoxor_d_rl(pDword, value);
+		case NmcAmoAqRlMode: return nmc_amoxor_d_aqrl(pDword, value);
+		default: return nmc_amoxor_d(pDword, value);
+		}
+	}
+	assert(0 && "Invalid type");
+	return 0;
+}
+
+template<typename T = uint32_t>
+inline T nmcAtomicOr(NmcTeCmdProps const &cmdProps, T *pAddr, T value) {
+	// Issues the atomic-OR intrinsics (nmc_amoor_*) on *pAddr with `value`.
+	// sizeof(T) selects the 32-bit (_w) or 64-bit (_d) form; getAmoMode() selects the
+	// plain, _aq, _rl or _aqrl variant. Any other size asserts and the result is 0.
+	// Non-blocking requests use the _nb intrinsics and always return 0.
+	volatile int32_t *const pWord = (volatile int32_t *)pAddr;
+	volatile int64_t *const pDword = (volatile int64_t *)pAddr;
+	if (cmdProps.isNonBlocking()) {
+		if (sizeof(T) == 4) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amoor_w_aq_nb(pWord, value); break;
+			case NmcAmoRlMode: nmc_amoor_w_rl_nb(pWord, value); break;
+			case NmcAmoAqRlMode: nmc_amoor_w_aqrl_nb(pWord, value); break;
+			default: nmc_amoor_w_nb(pWord, value); break;
+			}
+		} else if (sizeof(T) == 8) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amoor_d_aq_nb(pDword, value); break;
+			case NmcAmoRlMode: nmc_amoor_d_rl_nb(pDword, value); break;
+			case NmcAmoAqRlMode: nmc_amoor_d_aqrl_nb(pDword, value); break;
+			default: nmc_amoor_d_nb(pDword, value); break;
+			}
+		} else {
+			assert(0 && "Invalid type");
+		}
+		return 0;
+	}
+	// Blocking request: hand back whatever the selected intrinsic returns.
+	if (sizeof(T) == 4) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amoor_w_aq(pWord, value);
+		case NmcAmoRlMode: return nmc_amoor_w_rl(pWord, value);
+		case NmcAmoAqRlMode: return nmc_amoor_w_aqrl(pWord, value);
+		default: return nmc_amoor_w(pWord, value);
+		}
+	}
+	if (sizeof(T) == 8) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amoor_d_aq(pDword, value);
+		case NmcAmoRlMode: return nmc_amoor_d_rl(pDword, value);
+		case NmcAmoAqRlMode: return nmc_amoor_d_aqrl(pDword, value);
+		default: return nmc_amoor_d(pDword, value);
+		}
+	}
+	assert(0 && "Invalid type");
+	return 0;
+}
+
+template<typename T = uint32_t>
+inline T nmcAtomicAnd(NmcTeCmdProps const &cmdProps, T *pAddr, T value) {
+	// Issues the atomic-AND intrinsics (nmc_amoand_*) on *pAddr with `value`.
+	// sizeof(T) selects the 32-bit (_w) or 64-bit (_d) form; getAmoMode() selects the
+	// plain, _aq, _rl or _aqrl variant. Any other size asserts and the result is 0.
+	// Non-blocking requests use the _nb intrinsics and always return 0.
+	volatile int32_t *const pWord = (volatile int32_t *)pAddr;
+	volatile int64_t *const pDword = (volatile int64_t *)pAddr;
+	if (cmdProps.isNonBlocking()) {
+		if (sizeof(T) == 4) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amoand_w_aq_nb(pWord, value); break;
+			case NmcAmoRlMode: nmc_amoand_w_rl_nb(pWord, value); break;
+			case NmcAmoAqRlMode: nmc_amoand_w_aqrl_nb(pWord, value); break;
+			default: nmc_amoand_w_nb(pWord, value); break;
+			}
+		} else if (sizeof(T) == 8) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amoand_d_aq_nb(pDword, value); break;
+			case NmcAmoRlMode: nmc_amoand_d_rl_nb(pDword, value); break;
+			case NmcAmoAqRlMode: nmc_amoand_d_aqrl_nb(pDword, value); break;
+			default: nmc_amoand_d_nb(pDword, value); break;
+			}
+		} else {
+			assert(0 && "Invalid type");
+		}
+		return 0;
+	}
+	// Blocking request: hand back whatever the selected intrinsic returns.
+	if (sizeof(T) == 4) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amoand_w_aq(pWord, value);
+		case NmcAmoRlMode: return nmc_amoand_w_rl(pWord, value);
+		case NmcAmoAqRlMode: return nmc_amoand_w_aqrl(pWord, value);
+		default: return nmc_amoand_w(pWord, value);
+		}
+	}
+	if (sizeof(T) == 8) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amoand_d_aq(pDword, value);
+		case NmcAmoRlMode: return nmc_amoand_d_rl(pDword, value);
+		case NmcAmoAqRlMode: return nmc_amoand_d_aqrl(pDword, value);
+		default: return nmc_amoand_d(pDword, value);
+		}
+	}
+	assert(0 && "Invalid type");
+	return 0;
+}
+
+template<typename T = uint32_t>
+inline T nmcAtomicSwap(NmcTeCmdProps const &cmdProps, T *pAddr, T value) {
+	// Issues the atomic-swap intrinsics (nmc_amoswap_*) on *pAddr with `value`.
+	// sizeof(T) selects the 32-bit (_w) or 64-bit (_d) form; getAmoMode() selects the
+	// plain, _aq, _rl or _aqrl variant. Any other size asserts and the result is 0.
+	// Non-blocking requests use the _nb intrinsics and always return 0.
+	volatile int32_t *const pWord = (volatile int32_t *)pAddr;
+	volatile int64_t *const pDword = (volatile int64_t *)pAddr;
+	if (cmdProps.isNonBlocking()) {
+		if (sizeof(T) == 4) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amoswap_w_aq_nb(pWord, value); break;
+			case NmcAmoRlMode: nmc_amoswap_w_rl_nb(pWord, value); break;
+			case NmcAmoAqRlMode: nmc_amoswap_w_aqrl_nb(pWord, value); break;
+			default: nmc_amoswap_w_nb(pWord, value); break;
+			}
+		} else if (sizeof(T) == 8) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amoswap_d_aq_nb(pDword, value); break;
+			case NmcAmoRlMode: nmc_amoswap_d_rl_nb(pDword, value); break;
+			case NmcAmoAqRlMode: nmc_amoswap_d_aqrl_nb(pDword, value); break;
+			default: nmc_amoswap_d_nb(pDword, value); break;
+			}
+		} else {
+			assert(0 && "Invalid type");
+		}
+		return 0;
+	}
+	// Blocking request: hand back whatever the selected intrinsic returns.
+	if (sizeof(T) == 4) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amoswap_w_aq(pWord, value);
+		case NmcAmoRlMode: return nmc_amoswap_w_rl(pWord, value);
+		case NmcAmoAqRlMode: return nmc_amoswap_w_aqrl(pWord, value);
+		default: return nmc_amoswap_w(pWord, value);
+		}
+	}
+	if (sizeof(T) == 8) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amoswap_d_aq(pDword, value);
+		case NmcAmoRlMode: return nmc_amoswap_d_rl(pDword, value);
+		case NmcAmoAqRlMode: return nmc_amoswap_d_aqrl(pDword, value);
+		default: return nmc_amoswap_d(pDword, value);
+		}
+	}
+	assert(0 && "Invalid type");
+	return 0;
+}
+
+template<typename T = uint32_t>
+inline T nmcAtomicCas(NmcTeCmdProps const &cmdProps, T *pAddr, T cmpValue, T swapValue) {
+	// Issues the compare-and-swap intrinsics (nmc_amocas_*) on *pAddr with cmpValue/swapValue.
+	// sizeof(T) selects the 32-bit (_w) or 64-bit (_d) form; getAmoMode() selects the
+	// plain, _aq, _rl or _aqrl variant. Any other size asserts and the result is 0.
+	// Non-blocking requests use the _nb intrinsics and always return 0.
+	volatile int32_t *const pWord = (volatile int32_t *)pAddr;
+	volatile int64_t *const pDword = (volatile int64_t *)pAddr;
+	if (cmdProps.isNonBlocking()) {
+		if (sizeof(T) == 4) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amocas_w_aq_nb(pWord, cmpValue, swapValue); break;
+			case NmcAmoRlMode: nmc_amocas_w_rl_nb(pWord, cmpValue, swapValue); break;
+			case NmcAmoAqRlMode: nmc_amocas_w_aqrl_nb(pWord, cmpValue, swapValue); break;
+			default: nmc_amocas_w_nb(pWord, cmpValue, swapValue); break;
+			}
+		} else if (sizeof(T) == 8) {
+			switch (cmdProps.getAmoMode()) {
+			case NmcAmoAqMode: nmc_amocas_d_aq_nb(pDword, cmpValue, swapValue); break;
+			case NmcAmoRlMode: nmc_amocas_d_rl_nb(pDword, cmpValue, swapValue); break;
+			case NmcAmoAqRlMode: nmc_amocas_d_aqrl_nb(pDword, cmpValue, swapValue); break;
+			default: nmc_amocas_d_nb(pDword, cmpValue, swapValue); break;
+			}
+		} else {
+			assert(0 && "Invalid type");
+		}
+		return 0;
+	}
+	// Blocking request: hand back whatever the selected intrinsic returns.
+	if (sizeof(T) == 4) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amocas_w_aq(pWord, cmpValue, swapValue);
+		case NmcAmoRlMode: return nmc_amocas_w_rl(pWord, cmpValue, swapValue);
+		case NmcAmoAqRlMode: return nmc_amocas_w_aqrl(pWord, cmpValue, swapValue);
+		default: return nmc_amocas_w(pWord, cmpValue, swapValue);
+		}
+	}
+	if (sizeof(T) == 8) {
+		switch (cmdProps.getAmoMode()) {
+		case NmcAmoAqMode: return nmc_amocas_d_aq(pDword, cmpValue, swapValue);
+		case NmcAmoRlMode: return nmc_amocas_d_rl(pDword, cmpValue, swapValue);
+		case NmcAmoAqRlMode: return nmc_amocas_d_aqrl(pDword, cmpValue, swapValue);
+		default: return nmc_amocas_d(pDword, cmpValue, swapValue);
+		}
+	}
+	assert(0 && "Invalid type");
+	return 0;
+}
+
+template<typename T = uint32_t>
+void nmcStore(NmcTeCmdProps const &cmdProps, T volatile *pAddr, T data)
+{
+	// Writes `data` to *pAddr. Unless cmdProps asks for a non-temporal store this is a
+	// plain volatile assignment.
+	if (!cmdProps.isNonTemporal()) {
+		*pAddr = data;
+		return;
+	}
+	// Non-temporal: pick the nmc_*_nt store intrinsic from T's kind and width. A width with
+	// no matching intrinsic only trips the assert; nothing is stored in that case.
+	if (std::is_floating_point<T>::value) {
+		if (sizeof(T) == 4) { nmc_fsw_nt((volatile float *)pAddr, (float)data); }
+		else if (sizeof(T) == 8) { nmc_fsd_nt((volatile double *)pAddr, (double)data); }
+		else { assert(0 && "Invalid type"); }
+		return;
+	}
+	// Every non-floating T is stored through the unsigned integer intrinsics.
+	if (sizeof(T) == 1) { nmc_sb_nt((volatile uint8_t *)pAddr, (uint8_t)data); }
+	else if (sizeof(T) == 2) { nmc_sh_nt((volatile uint16_t *)pAddr, (uint16_t)data); }
+	else if (sizeof(T) == 4) { nmc_sw_nt((volatile uint32_t *)pAddr, (uint32_t)data); }
+	else if (sizeof(T) == 8) { nmc_sd_nt((volatile uint64_t *)pAddr, (uint64_t)data); }
+	else { assert(0 && "Invalid type"); }
+}
+
+/*
+ * nmcLoad - read one T from *pAddr using the load flavour requested by cmdProps.
+ *
+ *   isNonTemporal()          -> nmc_*_nt intrinsic
+ *   else isSpacial()         -> nmc_*_sp intrinsic
+ *   neither                  -> plain volatile read of *pAddr
+ *
+ * The intrinsic is selected by T's kind (floating point, signed, other) and sizeof(T);
+ * 8-byte non-signed loads reuse nmc_ld_*. If no intrinsic matches, assert() fires and,
+ * should it return, the result is 0.
+ */
+template<typename T = uint32_t>
+T nmcLoad(NmcTeCmdProps const &cmdProps, T volatile *pAddr)
+{
+	// Compile-time classification of T.
+	bool const isFloat = std::is_floating_point<T>::value;
+	bool const isSignedInt = !isFloat && std::is_signed<T>::value;
+	// Non-temporal (_nt) loads.
+	if (cmdProps.isNonTemporal()) {
+		if (isFloat) {
+			if (sizeof(T) == 4) return nmc_flw_nt((volatile float *)pAddr);
+			if (sizeof(T) == 8) return nmc_fld_nt((volatile double *)pAddr);
+		} else if (isSignedInt) {
+			if (sizeof(T) == 1) return nmc_lb_nt((volatile int8_t *)pAddr);
+			if (sizeof(T) == 2) return nmc_lh_nt((volatile int16_t *)pAddr);
+			if (sizeof(T) == 4) return nmc_lw_nt((volatile int32_t *)pAddr);
+			if (sizeof(T) == 8) return nmc_ld_nt((volatile int64_t *)pAddr);
+		} else {
+			if (sizeof(T) == 1) return nmc_lbu_nt((volatile uint8_t *)pAddr);
+			if (sizeof(T) == 2) return nmc_lhu_nt((volatile uint16_t *)pAddr);
+			if (sizeof(T) == 4) return nmc_lwu_nt((volatile uint32_t *)pAddr);
+			if (sizeof(T) == 8) return nmc_ld_nt((volatile int64_t *)pAddr);
+		}
+		// No _nt intrinsic exists for this width.
+		assert(0 && "Invalid type");
+		return 0;
+	}
+	// "Spacial" (_sp) loads.
+	if (cmdProps.isSpacial()) {
+		if (isFloat) {
+			if (sizeof(T) == 4) return nmc_flw_sp((volatile float *)pAddr);
+			if (sizeof(T) == 8) return nmc_fld_sp((volatile double *)pAddr);
+		} else if (isSignedInt) {
+			if (sizeof(T) == 1) return nmc_lb_sp((volatile int8_t *)pAddr);
+			if (sizeof(T) == 2) return nmc_lh_sp((volatile int16_t *)pAddr);
+			if (sizeof(T) == 4) return nmc_lw_sp((volatile int32_t *)pAddr);
+			if (sizeof(T) == 8) return nmc_ld_sp((volatile int64_t *)pAddr);
+		} else {
+			if (sizeof(T) == 1) return nmc_lbu_sp((volatile uint8_t *)pAddr);
+			if (sizeof(T) == 2) return nmc_lhu_sp((volatile uint16_t *)pAddr);
+			if (sizeof(T) == 4) return nmc_lwu_sp((volatile uint32_t *)pAddr);
+			if (sizeof(T) == 8) return nmc_ld_sp((volatile int64_t *)pAddr);
+		}
+		// No _sp intrinsic exists for this width.
+		assert(0 && "Invalid type");
+		return 0;
+	}
+	// Neither property set: ordinary volatile read.
+	return *pAddr;
+}
+
+inline uint64_t nmcGetDeviceId() {	// C++ wrapper: returns the result of the nmc_xid_dev() intrinsic unchanged
+	return nmc_xid_dev();
+}
+
+inline uint64_t nmcGetTeId() {	// C++ wrapper: returns the result of the nmc_xid_te() intrinsic unchanged
+	return nmc_xid_te();
+}
+
+inline uint64_t nmcGetCoreId() {	// C++ wrapper: returns the result of the nmc_xid_core() intrinsic unchanged
+	return nmc_xid_core();
+}
+
+inline uint64_t nmcGetThreadId() {	// C++ wrapper: returns the result of the nmc_xid_thrd() intrinsic unchanged
+	return nmc_xid_thrd();
+}
+
+inline uint64_t nmcTzc(uint64_t data) {	// forwards data to nmc_tzc(); presumably a trailing-zero count - TODO confirm
+	return nmc_tzc(data);
+}
+
+inline void nmcLowerThreadPriority() {	// C++ wrapper: invokes the nmc_xlp() intrinsic; takes and returns nothing
+	nmc_xlp();
+}
+
+inline void nmcNormalThreadPriority() {	// C++ wrapper: invokes the nmc_xnp() intrinsic; takes and returns nothing
+	nmc_xnp();
+}
diff --git a/src/micron/nmc_te_intrin.h b/src/micron/nmc_te_intrin.h
new file mode 100644
index 00000000..83e08bb9
--- /dev/null
+++ b/src/micron/nmc_te_intrin.h
@@ -0,0 +1,2856 @@
+/*
+ * Copyright (C) 2024 Micron Technology, Inc.
+ *
+ * This file is the confidential and proprietary property of
+ *                 Micron Technology, Inc.
+ */
+#pragma once
+#include <stdint.h>
+#include "nmc_errno.h"
+
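+/*
+ * `register` is treated as unsupported when building as C++ with an LLVM-based compiler, or as C++17 and later with any compiler; REGISTER and ASMR() are then empty.
+ * NOTE(review): with ASMR() empty the operands below are no longer pinned to x10..x13, yet the asm templates name no operands - confirm this is safe.
+ */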
+#if defined(__cplusplus)
+	#if defined(__llvm__)
+		#define REG_NOT_SUPPORT 1	// C++ translation unit built by an LLVM-based compiler
+	#endif
+	#if !defined(REG_NOT_SUPPORT) && __cplusplus >= 201703L
+		#define REG_NOT_SUPPORT 2	// any other compiler in C++17 mode or later
+	#endif
+#endif
+
+#ifdef REG_NOT_SUPPORT
+#define REGISTER	// expands to nothing: the locals become ordinary variables
+#define ASMR(x)	// expands to nothing: the register name argument is discarded
+#else
+#define REGISTER    register
+#define ASMR(x)     asm(x)	// GNU explicit-register syntax: ties the local to the named register
+#endif
+
+#undef assert
+#define assert(x) do { if (!(x)) __nmc_te_assert(__FILE__, __LINE__, #x); } while (0)	// replaces any earlier assert; NDEBUG is not consulted
+
+#ifndef INLINE
+#define INLINE inline	// default only; NOTE(review): plain `inline` in C99+ emits no external definition - confirm C builds link
+#endif
+
+#ifdef __cplusplus
+extern "C" {	// C linkage for the declarations that follow when included from C++
+#endif
+	int __nmc_te_assert(const char *file, int line, const char *exp);	// invoked by the assert() macro with __FILE__, __LINE__ and the expression text
+	int nmc_te_printf(const char *fmt, ...) __attribute__((format(printf, 1, 2)));	// printf-style; arguments are checked against fmt by the compiler
+	int nmc_te_printf_time(const char *fmt, ...) __attribute__((format(printf, 1, 2)));	// same signature and format checking as nmc_te_printf
+
+
+#ifdef ENABLE_EMULATION	// emulation build: intrinsics come from nmc_te_emu_intrin.h instead of inline asm
+	#ifdef __cplusplus
+	}	// closes the extern "C" block before the emulation header is included
+	#endif
+	#include "nmc_te_emu_intrin.h"
+#else	// native build: the inline-asm intrinsics follow
+
+	INLINE uint64_t fmv_x_s(float din)
+	{	// Runs "fmv.x.s" with din in an FP register ("f") and returns the integer-register ("r") result.
+		uint64_t dout;
+		asm("fmv.x.s %0,%1" : "=r" (dout) : "f" (din));
+		return dout;
+	}
+
+	INLINE uint64_t fmv_x_d(double din)
+	{	// Runs "fmv.x.d" with din in an FP register ("f") and returns the integer-register ("r") result.
+		uint64_t dout;
+		asm("fmv.x.d %0,%1" : "=r" (dout) : "f" (din));
+		return dout;
+	}
+
+
+	// XFC C0
+
+	INLINE void nmc_xfc_te_c0_r0(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r0"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r0" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_r0_at(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r0.at"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r0.at" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_r1(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r1"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r1" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_r1_at(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r1.at"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r1.at" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_r2(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r2"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r2" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_r2_at(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r2.at"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r2.at" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_nr(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.nr"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.nr" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_nr_at(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.nr.at"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.nr.at" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	// XFC C0 NF
+
+	INLINE void nmc_xfc_te_c0_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r0.nf"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r0.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_r0_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r0.nf.at"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r0.nf.at" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r1.nf"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r1.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_r1_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r1.nf.at"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r1.nf.at" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r2.nf"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r2.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_r2_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.r2.nf.at"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.r2.nf.at" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.nr.nf"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.nr.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	INLINE void nmc_xfc_te_c0_nr_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.c0.nr.nf.at"; daddr/pc/cmdProps are tied to x10/x11/x12 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		asm volatile("xfc.te.c0.nr.nf.at" : : "r" (daddr), "r" (pc), "r" (cmdProps));
+	}
+
+	// XFC C0 BF
+
+	INLINE int64_t nmc_xfc_te_bf_c0_r0(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.bf.c0.r0"; inputs are tied to x10-x12 and the returned status to x10 when ASMR() is active.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER int64_t status ASMR("x10");
+		asm volatile("xfc.te.bf.c0.r0" : "=r" (status) : "r" (daddr), "r" (pc), "r" (cmdProps));
+		return status;
+	}
+
+	INLINE int64_t nmc_xfc_te_bf_c0_r1(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.bf.c0.r1"; inputs are tied to x10-x12 and the returned status to x10 when ASMR() is active.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER int64_t status ASMR("x10");
+		asm volatile("xfc.te.bf.c0.r1" : "=r" (status) : "r" (daddr), "r" (pc), "r" (cmdProps));
+		return status;
+	}
+
+	INLINE int64_t nmc_xfc_te_bf_c0_r2(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.bf.c0.r2"; inputs are tied to x10-x12 and the returned status to x10 when ASMR() is active.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER int64_t status ASMR("x10");
+		asm volatile("xfc.te.bf.c0.r2" : "=r" (status) : "r" (daddr), "r" (pc), "r" (cmdProps));
+		return status;
+	}
+
+	INLINE int64_t nmc_xfc_te_bf_c0_nr(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.bf.c0.nr"; inputs are tied to x10-x12 and the returned status to x10 when ASMR() is active.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER int64_t status ASMR("x10");
+		asm volatile("xfc.te.bf.c0.nr" : "=r" (status) : "r" (daddr), "r" (pc), "r" (cmdProps));
+		return status;
+	}
+
+	// XFC C0 NF BF
+
+	INLINE int64_t nmc_xfc_te_bf_c0_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.bf.c0.r0.nf"; inputs are tied to x10-x12 and the returned status to x10 when ASMR() is active.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER int64_t status ASMR("x10");
+		asm volatile("xfc.te.bf.c0.r0.nf" : "=r" (status) : "r" (daddr), "r" (pc), "r" (cmdProps));
+		return status;
+	}
+
+	INLINE int64_t nmc_xfc_te_bf_c0_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.bf.c0.r1.nf"; inputs are tied to x10-x12 and the returned status to x10 when ASMR() is active.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER int64_t status ASMR("x10");
+		asm volatile("xfc.te.bf.c0.r1.nf" : "=r" (status) : "r" (daddr), "r" (pc), "r" (cmdProps));
+		return status;
+	}
+
+	INLINE int64_t nmc_xfc_te_bf_c0_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.bf.c0.r2.nf"; inputs are tied to x10-x12 and the returned status to x10 when ASMR() is active.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER int64_t status ASMR("x10");
+		asm volatile("xfc.te.bf.c0.r2.nf" : "=r" (status) : "r" (daddr), "r" (pc), "r" (cmdProps));
+		return status;
+	}
+
+	INLINE int64_t nmc_xfc_te_bf_c0_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps)
+	{	// Emits "xfc.te.bf.c0.nr.nf"; inputs are tied to x10-x12 and the returned status to x10 when ASMR() is active.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER int64_t status ASMR("x10");
+		asm volatile("xfc.te.bf.c0.nr.nf" : "=r" (status) : "r" (daddr), "r" (pc), "r" (cmdProps));
+		return status;
+	}
+
+	// XFC C1
+
+	INLINE void nmc_xfc_te_c1_r0(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r0"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r0" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_r0_at(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r0.at"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r0.at" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_r1(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r1"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r1" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_r1_at(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r1.at"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r1.at" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_r2(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r2"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r2" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_r2_at(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r2.at"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r2.at" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_nr(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.nr"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.nr" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_nr_at(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.nr.at"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.nr.at" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	// XFC C1 NF
+
+	INLINE void nmc_xfc_te_c1_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r0.nf"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r0.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_r0_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r0.nf.at"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r0.nf.at" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r1.nf"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r1.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_r1_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r1.nf.at"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r1.nf.at" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r2.nf"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r2.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_r2_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.r2.nf.at"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.r2.nf.at" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.nr.nf"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.nr.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	INLINE void nmc_xfc_te_c1_nr_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps,
+		uint64_t _a1)
+	{	// Emits "xfc.te.c1.nr.nf.at"; daddr/pc/cmdProps/a1 are tied to x10/x11/x12/x13 when ASMR() is active. No result is read.
+		REGISTER void *daddr ASMR("x10") = _daddr;
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.te.c1.nr.nf.at" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1));
+	}
+
+	// XFC C1 BF
+
+	// Emits "xfc.te.bf.c1.r0" with inputs tagged x10-x13; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c1_r0(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c1.r0" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c1.r1" with inputs tagged x10-x13; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c1_r1(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c1.r1" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c1.r2" with inputs tagged x10-x13; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c1_r2(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c1.r2" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c1.nr" with inputs tagged x10-x13; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c1_nr(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c1.nr" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1));
+		return rc;
+	}
+
+	// XFC C1 NF BF
+
+	// Emits "xfc.te.bf.c1.r0.nf" with inputs tagged x10-x13; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c1_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c1.r0.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c1.r1.nf" with inputs tagged x10-x13; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c1_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c1.r1.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c1.r2.nf" with inputs tagged x10-x13; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c1_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c1.r2.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c1.nr.nf" with inputs tagged x10-x13; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c1_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c1.nr.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1));
+		return rc;
+	}
+
+	// XFC C2
+
+	// Emits "xfc.te.c2.r0" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r0(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r0" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.r0.at" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r0_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r0.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.r1" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r1(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r1" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.r1.at" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r1_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r1.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.r2" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r2(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r2" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.r2.at" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r2_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r2.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.nr" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_nr(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.nr" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.nr.at" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_nr_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.nr.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// XFC C2 NF
+
+	// Emits "xfc.te.c2.r0.nf" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r0.nf" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.r0.nf.at" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r0_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r0.nf.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.r1.nf" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r1.nf" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.r1.nf.at" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r1_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r1.nf.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.r2.nf" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r2.nf" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.r2.nf.at" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_r2_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.r2.nf.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.nr.nf" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.nr.nf" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// Emits "xfc.te.c2.nr.nf.at" with five register inputs (ASMR x10-x14); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c2_nr_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		asm volatile("xfc.te.c2.nr.nf.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+	}
+
+	// XFC C2 BF
+
+	// Emits "xfc.te.bf.c2.r0" with inputs tagged x10-x14; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c2_r0(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c2.r0" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c2.r1" with inputs tagged x10-x14; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c2_r1(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c2.r1" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c2.r2" with inputs tagged x10-x14; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c2_r2(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c2.r2" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c2.nr" with inputs tagged x10-x14; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c2_nr(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c2.nr" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+		return rc;
+	}
+
+	// XFC C2 NF BF
+
+	// Emits "xfc.te.bf.c2.r0.nf" with inputs tagged x10-x14; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c2_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c2.r0.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c2.r1.nf" with inputs tagged x10-x14; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c2_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c2.r1.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c2.r2.nf" with inputs tagged x10-x14; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c2_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c2.r2.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c2.nr.nf" with inputs tagged x10-x14; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c2_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c2.nr.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2));
+		return rc;
+	}
+
+	// XFC C4
+
+	// Emits "xfc.te.c4.r0" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r0(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r0" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.r0.at" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r0_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r0.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.r1" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r1(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r1" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.r1.at" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r1_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r1.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.r2" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r2(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r2" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.r2.at" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r2_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r2.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.nr" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_nr(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.nr" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.nr.at" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_nr_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.nr.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// XFC C4 NF
+
+	// Emits "xfc.te.c4.r0.nf" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r0.nf" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.r0.nf.at" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r0_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r0.nf.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.r1.nf" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r1.nf" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.r1.nf.at" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r1_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r1.nf.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.r2.nf" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r2.nf" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.r2.nf.at" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_r2_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.r2.nf.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.nr.nf" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.nr.nf" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// Emits "xfc.te.c4.nr.nf.at" with seven register inputs (ASMR x10-x16); the asm declares no outputs.
+	INLINE void nmc_xfc_te_c4_nr_nf_at(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		asm volatile("xfc.te.c4.nr.nf.at" : : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+	}
+
+	// XFC C4 BF
+
+	// Emits "xfc.te.bf.c4.r0" with inputs tagged x10-x16; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c4_r0(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c4.r0" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c4.r1" with inputs tagged x10-x16; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c4_r1(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c4.r1" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c4.r2" with inputs tagged x10-x16; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c4_r2(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c4.r2" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c4.nr" with inputs tagged x10-x16; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c4_nr(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c4.nr" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+		return rc;
+	}
+
+	// XFC C4 NF BF
+
+	// Emits "xfc.te.bf.c4.r0.nf" with inputs tagged x10-x16; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c4_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c4.r0.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c4.r1.nf" with inputs tagged x10-x16; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c4_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c4.r1.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c4.r2.nf" with inputs tagged x10-x16; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c4_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c4.r2.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+		return rc;
+	}
+
+	// Emits "xfc.te.bf.c4.nr.nf" with inputs tagged x10-x16; returns the asm's single output, also tagged x10.
+	INLINE int64_t nmc_xfc_te_bf_c4_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps, uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		REGISTER uint64_t arg1 ASMR("x13") = _a1;
+		REGISTER uint64_t arg2 ASMR("x14") = _a2;
+		REGISTER uint64_t arg3 ASMR("x15") = _a3;
+		REGISTER uint64_t arg4 ASMR("x16") = _a4;
+		REGISTER int64_t rc ASMR("x10");
+		asm volatile("xfc.te.bf.c4.nr.nf" : "=r" (rc) : "r" (addr), "r" (pcv), "r" (props), "r" (arg1), "r" (arg2), "r" (arg3), "r" (arg4));
+		return rc;
+	}
+
+	// XFC C0
+
+	INLINE void nmc_xfc_se_c0_r0(void *_daddr, void *_pc, uint64_t _cmdProps) // emits "xfc.se.c0.r0"; three inputs (ASMR x10-x12), no outputs
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		asm volatile("xfc.se.c0.r0" : : "r" (addr), "r" (pcv), "r" (props));
+	}
+
+	INLINE void nmc_xfc_se_c0_r1(void *_daddr, void *_pc, uint64_t _cmdProps) // emits "xfc.se.c0.r1"; three inputs (ASMR x10-x12), no outputs
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		asm volatile("xfc.se.c0.r1" : : "r" (addr), "r" (pcv), "r" (props));
+	}
+
+	INLINE void nmc_xfc_se_c0_r2(void *_daddr, void *_pc, uint64_t _cmdProps) // emits "xfc.se.c0.r2"; three inputs (ASMR x10-x12), no outputs
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		asm volatile("xfc.se.c0.r2" : : "r" (addr), "r" (pcv), "r" (props));
+	}
+
+	INLINE void nmc_xfc_se_c0_nr(void *_daddr, void *_pc, uint64_t _cmdProps) // emits "xfc.se.c0.nr"; three inputs (ASMR x10-x12), no outputs
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		asm volatile("xfc.se.c0.nr" : : "r" (addr), "r" (pcv), "r" (props));
+	}
+
+	// XFC C0 NF
+
+	INLINE void nmc_xfc_se_c0_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps) // emits "xfc.se.c0.r0.nf"; three inputs (ASMR x10-x12), no outputs
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		asm volatile("xfc.se.c0.r0.nf" : : "r" (addr), "r" (pcv), "r" (props));
+	}
+
+	INLINE void nmc_xfc_se_c0_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps) // emits "xfc.se.c0.r1.nf"; three inputs (ASMR x10-x12), no outputs
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		asm volatile("xfc.se.c0.r1.nf" : : "r" (addr), "r" (pcv), "r" (props));
+	}
+
+	INLINE void nmc_xfc_se_c0_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps) // emits "xfc.se.c0.r2.nf"; three inputs (ASMR x10-x12), no outputs
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		asm volatile("xfc.se.c0.r2.nf" : : "r" (addr), "r" (pcv), "r" (props));
+	}
+
+	INLINE void nmc_xfc_se_c0_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps) // emits "xfc.se.c0.nr.nf"; three inputs (ASMR x10-x12), no outputs
+	{
+		REGISTER void *addr ASMR("x10") = _daddr;
+		REGISTER void *pcv ASMR("x11") = _pc;
+		REGISTER uint64_t props ASMR("x12") = _cmdProps;
+		asm volatile("xfc.se.c0.nr.nf" : : "r" (addr), "r" (pcv), "r" (props));
+	}
+
+	// XFC C1
+
+	INLINE void nmc_xfc_se_c1_r0(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c1.r0`; no value is returned to the caller.
+		uint64_t _a1)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.se.c1.r0" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1)); // no operands in the template; the inputs keep the four locals live
+	}
+
+	INLINE void nmc_xfc_se_c1_r1(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c1.r1`; no value is returned to the caller.
+		uint64_t _a1)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.se.c1.r1" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1)); // no operands in the template; the inputs keep the four locals live
+	}
+
+	INLINE void nmc_xfc_se_c1_r2(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c1.r2`; no value is returned to the caller.
+		uint64_t _a1)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.se.c1.r2" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1)); // no operands in the template; the inputs keep the four locals live
+	}
+
+	INLINE void nmc_xfc_se_c1_nr(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c1.nr`; no value is returned to the caller.
+		uint64_t _a1)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.se.c1.nr" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1)); // no operands in the template; the inputs keep the four locals live
+	}
+
+	// XFC C1 NF
+
+	INLINE void nmc_xfc_se_c1_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c1.r0.nf`; no value is returned to the caller.
+		uint64_t _a1)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.se.c1.r0.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1)); // no operands in the template; the inputs keep the four locals live
+	}
+
+	INLINE void nmc_xfc_se_c1_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c1.r1.nf`; no value is returned to the caller.
+		uint64_t _a1)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.se.c1.r1.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1)); // no operands in the template; the inputs keep the four locals live
+	}
+
+	INLINE void nmc_xfc_se_c1_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c1.r2.nf`; no value is returned to the caller.
+		uint64_t _a1)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.se.c1.r2.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1)); // no operands in the template; the inputs keep the four locals live
+	}
+
+	INLINE void nmc_xfc_se_c1_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c1.nr.nf`; no value is returned to the caller.
+		uint64_t _a1)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		asm volatile("xfc.se.c1.nr.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1)); // no operands in the template; the inputs keep the four locals live
+	}
+
+	// XFC C2
+
+	INLINE void nmc_xfc_se_c2_r0(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c2.r0`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		asm volatile("xfc.se.c2.r0" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2)); // no operands in the template; the inputs keep the five locals live
+	}
+
+	INLINE void nmc_xfc_se_c2_r1(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c2.r1`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		asm volatile("xfc.se.c2.r1" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2)); // no operands in the template; the inputs keep the five locals live
+	}
+
+	INLINE void nmc_xfc_se_c2_r2(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c2.r2`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		asm volatile("xfc.se.c2.r2" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2)); // no operands in the template; the inputs keep the five locals live
+	}
+
+	INLINE void nmc_xfc_se_c2_nr(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c2.nr`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		asm volatile("xfc.se.c2.nr" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2)); // no operands in the template; the inputs keep the five locals live
+	}
+
+	// XFC C2 NF
+
+	INLINE void nmc_xfc_se_c2_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c2.r0.nf`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		asm volatile("xfc.se.c2.r0.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2)); // no operands in the template; the inputs keep the five locals live
+	}
+
+	INLINE void nmc_xfc_se_c2_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c2.r1.nf`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		asm volatile("xfc.se.c2.r1.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2)); // no operands in the template; the inputs keep the five locals live
+	}
+
+	INLINE void nmc_xfc_se_c2_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c2.r2.nf`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		asm volatile("xfc.se.c2.r2.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2)); // no operands in the template; the inputs keep the five locals live
+	}
+
+	INLINE void nmc_xfc_se_c2_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c2.nr.nf`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		asm volatile("xfc.se.c2.nr.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2)); // no operands in the template; the inputs keep the five locals live
+	}
+
+	// XFC C4
+
+	INLINE void nmc_xfc_se_c4_r0(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c4.r0`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		REGISTER uint64_t a3 ASMR("x15") = _a3;
+		REGISTER uint64_t a4 ASMR("x16") = _a4;
+		asm volatile("xfc.se.c4.r0" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2), "r" (a3), "r" (a4)); // no operands in the template; the inputs keep the seven locals live
+	}
+
+	INLINE void nmc_xfc_se_c4_r1(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c4.r1`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		REGISTER uint64_t a3 ASMR("x15") = _a3;
+		REGISTER uint64_t a4 ASMR("x16") = _a4;
+		asm volatile("xfc.se.c4.r1" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2), "r" (a3), "r" (a4)); // no operands in the template; the inputs keep the seven locals live
+	}
+
+	INLINE void nmc_xfc_se_c4_r2(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c4.r2`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		REGISTER uint64_t a3 ASMR("x15") = _a3;
+		REGISTER uint64_t a4 ASMR("x16") = _a4;
+		asm volatile("xfc.se.c4.r2" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2), "r" (a3), "r" (a4)); // no operands in the template; the inputs keep the seven locals live
+	}
+
+	INLINE void nmc_xfc_se_c4_nr(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c4.nr`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		REGISTER uint64_t a3 ASMR("x15") = _a3;
+		REGISTER uint64_t a4 ASMR("x16") = _a4;
+		asm volatile("xfc.se.c4.nr" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2), "r" (a3), "r" (a4)); // no operands in the template; the inputs keep the seven locals live
+	}
+
+	// XFC C4 NF
+
+	INLINE void nmc_xfc_se_c4_r0_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c4.r0.nf`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		REGISTER uint64_t a3 ASMR("x15") = _a3;
+		REGISTER uint64_t a4 ASMR("x16") = _a4;
+		asm volatile("xfc.se.c4.r0.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2), "r" (a3), "r" (a4)); // no operands in the template; the inputs keep the seven locals live
+	}
+
+	INLINE void nmc_xfc_se_c4_r1_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c4.r1.nf`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		REGISTER uint64_t a3 ASMR("x15") = _a3;
+		REGISTER uint64_t a4 ASMR("x16") = _a4;
+		asm volatile("xfc.se.c4.r1.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2), "r" (a3), "r" (a4)); // no operands in the template; the inputs keep the seven locals live
+	}
+
+	INLINE void nmc_xfc_se_c4_r2_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c4.r2.nf`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		REGISTER uint64_t a3 ASMR("x15") = _a3;
+		REGISTER uint64_t a4 ASMR("x16") = _a4;
+		asm volatile("xfc.se.c4.r2.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2), "r" (a3), "r" (a4)); // no operands in the template; the inputs keep the seven locals live
+	}
+
+	INLINE void nmc_xfc_se_c4_nr_nf(void *_daddr, void *_pc, uint64_t _cmdProps, // Emits `xfc.se.c4.nr.nf`; no value is returned to the caller.
+		uint64_t _a1, uint64_t _a2, uint64_t _a3, uint64_t _a4)
+	{
+		REGISTER void *daddr ASMR("x10") = _daddr; // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER void *pc ASMR("x11") = _pc;
+		REGISTER uint64_t cmdProps ASMR("x12") = _cmdProps;
+		REGISTER uint64_t a1 ASMR("x13") = _a1;
+		REGISTER uint64_t a2 ASMR("x14") = _a2;
+		REGISTER uint64_t a3 ASMR("x15") = _a3;
+		REGISTER uint64_t a4 ASMR("x16") = _a4;
+		asm volatile("xfc.se.c4.nr.nf" : : "r" (daddr), "r" (pc), "r" (cmdProps), "r" (a1), "r" (a2), "r" (a3), "r" (a4)); // no operands in the template; the inputs keep the seven locals live
+	}
+
+	// XFJ
+
+	INLINE void nmc_xfj_r0(int64_t *_cmdId) { // Emits `xfj` and stores its first output to *_cmdId (dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11"); // declared as an asm output but never read
+		REGISTER uint64_t ret2 ASMR("x12"); // declared as an asm output but never read
+		asm volatile("xfj" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+	}
+
+	INLINE void nmc_xfj_r1(int64_t *_cmdId, uint64_t *_ret1) { // Emits `xfj`; stores its first two outputs to *_cmdId and *_ret1 (both dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11");
+		REGISTER uint64_t ret2 ASMR("x12"); // declared as an asm output but never read
+		asm volatile("xfj" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+		*_ret1 = ret1;
+	}
+
+	INLINE void nmc_xfj_r2(int64_t *_cmdId, uint64_t *_ret1, uint64_t *_ret2) { // Emits `xfj`; stores its three outputs to *_cmdId, *_ret1, *_ret2 (all dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11");
+		REGISTER uint64_t ret2 ASMR("x12");
+		asm volatile("xfj" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+		*_ret1 = ret1;
+		*_ret2 = ret2;
+	}
+
+	INLINE void nmc_xfj_r0_ni(int64_t *_cmdId) { // Emits `xfj.ni` and stores its first output to *_cmdId (dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11"); // declared as an asm output but never read
+		REGISTER uint64_t ret2 ASMR("x12"); // declared as an asm output but never read
+		asm volatile("xfj.ni" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+	}
+
+	INLINE void nmc_xfj_r1_ni(int64_t *_cmdId, uint64_t *_ret1) { // Emits `xfj.ni`; stores its first two outputs to *_cmdId and *_ret1 (both dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11");
+		REGISTER uint64_t ret2 ASMR("x12"); // declared as an asm output but never read
+		asm volatile("xfj.ni" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+		*_ret1 = ret1;
+	}
+
+	INLINE void nmc_xfj_r2_ni(int64_t *_cmdId, uint64_t *_ret1, uint64_t *_ret2) { // Emits `xfj.ni`; stores its three outputs to *_cmdId, *_ret1, *_ret2 (all dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11");
+		REGISTER uint64_t ret2 ASMR("x12");
+		asm volatile("xfj.ni" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+		*_ret1 = ret1;
+		*_ret2 = ret2;
+	}
+
+	INLINE void nmc_xfj_r0_nb(int64_t *_cmdId) { // Emits `xfj.nb` and stores its first output to *_cmdId (dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11"); // declared as an asm output but never read
+		REGISTER uint64_t ret2 ASMR("x12"); // declared as an asm output but never read
+		asm volatile("xfj.nb" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+	}
+
+	INLINE void nmc_xfj_r1_nb(int64_t *_cmdId, uint64_t *_ret1) { // Emits `xfj.nb`; stores its first two outputs to *_cmdId and *_ret1 (both dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11");
+		REGISTER uint64_t ret2 ASMR("x12"); // declared as an asm output but never read
+		asm volatile("xfj.nb" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+		*_ret1 = ret1;
+	}
+
+	INLINE void nmc_xfj_r2_nb(int64_t *_cmdId, uint64_t *_ret1, uint64_t *_ret2) { // Emits `xfj.nb`; stores its three outputs to *_cmdId, *_ret1, *_ret2 (all dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11");
+		REGISTER uint64_t ret2 ASMR("x12");
+		asm volatile("xfj.nb" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+		*_ret1 = ret1;
+		*_ret2 = ret2;
+	}
+
+	INLINE void nmc_xfj_r0_nb_ni(int64_t *_cmdId) { // Emits `xfj.nb.ni` and stores its first output to *_cmdId (dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11"); // declared as an asm output but never read
+		REGISTER uint64_t ret2 ASMR("x12"); // declared as an asm output but never read
+		asm volatile("xfj.nb.ni" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+	}
+
+	INLINE void nmc_xfj_r1_nb_ni(int64_t *_cmdId, uint64_t *_ret1) { // Emits `xfj.nb.ni`; stores its first two outputs to *_cmdId and *_ret1 (both dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11");
+		REGISTER uint64_t ret2 ASMR("x12"); // declared as an asm output but never read
+		asm volatile("xfj.nb.ni" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+		*_ret1 = ret1;
+	}
+
+	INLINE void nmc_xfj_r2_nb_ni(int64_t *_cmdId, uint64_t *_ret1, uint64_t *_ret2) { // Emits `xfj.nb.ni`; stores its three outputs to *_cmdId, *_ret1, *_ret2 (all dereferenced unconditionally).
+		REGISTER int64_t cmdId ASMR("x10"); // ASMR("xN") presumably pins the local to register xN (macro defined elsewhere)
+		REGISTER uint64_t ret1 ASMR("x11");
+		REGISTER uint64_t ret2 ASMR("x12");
+		asm volatile("xfj.nb.ni" : "=r" (cmdId), "=r" (ret1), "=r" (ret2)); // outputs only; the template names no operands
+		*_cmdId = cmdId;
+		*_ret1 = ret1;
+		*_ret2 = ret2;
+	}
+
+	INLINE int64_t nmc_xfja() { // Emits `xfja` and returns its single output operand.
+		REGISTER int64_t rtn ASMR("x10"); // ASMR("x10") presumably pins rtn to register x10 (macro defined elsewhere)
+		asm volatile("xfja" : "=r" (rtn)); // the template names no operands
+		return rtn;
+	}
+
+	INLINE int64_t nmc_xfja_nb() { // Emits `xfja.nb` and returns its single output operand.
+		REGISTER int64_t rtn ASMR("x10"); // ASMR("x10") presumably pins rtn to register x10 (macro defined elsewhere)
+		asm volatile("xfja.nb" : "=r" (rtn)); // the template names no operands
+		return rtn;
+	}
+
+	INLINE int64_t nmc_xfja_ni() { // Emits `xfja.ni` and returns its single output operand.
+		REGISTER int64_t rtn ASMR("x10"); // ASMR("x10") presumably pins rtn to register x10 (macro defined elsewhere)
+		asm volatile("xfja.ni" : "=r" (rtn)); // the template names no operands
+		return rtn;
+	}
+
+	INLINE int64_t nmc_xfja_ni_nb() { // Emits `xfja.ni.nb` and returns its single output operand.
+		REGISTER int64_t rtn ASMR("x10"); // ASMR("x10") presumably pins rtn to register x10 (macro defined elsewhere)
+		asm volatile("xfja.ni.nb" : "=r" (rtn)); // the template names no operands
+		return rtn;
+	}
+
+	// XTR
+
+	INLINE void nmc_xtr_r0() { // Emits `xtr.r0`, which takes no operands here.
+		asm volatile("xtr.r0");
+	}
+
+	INLINE void nmc_xtr_r1(uint64_t a1) { // Emits `xtr.r1` with a1 as its only operand.
+		asm volatile("xtr.r1 %0" : : "r" (a1)); // a1 is passed in a compiler-chosen general register
+	}
+
+	INLINE void nmc_xtr_r2(uint64_t a1, uint64_t a2) { // Emits `xtr.r2` with operands (a1, a2).
+		asm volatile("xtr.r2 %0,%1" : : "r" (a1), "r" (a2)); // both values are passed in compiler-chosen general registers
+	}
+
+	INLINE double nmc_csr_rdtime() { // Reads the time counter with `rdtime` and returns the tick count converted to double.
+		uint64_t ticks; // `rdtime` writes an integer register, so capture it as an integer rather than as raw double bits
+		asm volatile("rdtime %0" : "=r" (ticks)); // volatile: every call must perform its own read; a plain asm with no inputs may be merged or hoisted
+		return (double)ticks; // numeric conversion; exact for counts below 2^53
+	}
+
+	INLINE uint64_t nmc_csr_rdcycle() { // Reads the cycle counter with `rdcycle` and returns it.
+		uint64_t dest;
+		asm volatile("rdcycle %0" : "=r" (dest)); // volatile: every call must perform its own read; a plain asm with no inputs may be merged or hoisted
+		return dest;
+	}
+
+	//
+	// Event routines
+	//
+	INLINE void nmc_xem_sm(uint64_t eventNum) { // Emits `xem.sm` with eventNum as its only operand.
+		asm volatile("xem.sm %0" : : "r" (eventNum));
+	}
+	INLINE void nmc_xem_cm(uint64_t eventNum, uint64_t eventCount) { // Emits `xem.cm` with operands (eventNum, eventCount).
+		asm volatile("xem.cm %0,%1" : : "r" (eventNum), "r" (eventCount));
+	}
+	INLINE void nmc_xem_bm(uint64_t eventNum, uint64_t recvChan) { // Emits `xem.bm` with operands (eventNum, recvChan).
+		asm volatile("xem.bm %0,%1" : : "r" (eventNum), "r" (recvChan));
+	}
+
+	INLINE uint64_t nmc_xed(uint64_t eventNum) { // Emits `xed` on eventNum and returns the instruction's output operand.
+		uint64_t dest;
+		asm("xed %0,%1" : "=r" (dest) : "r" (eventNum)); // NOTE(review): not `volatile`, unlike the neighbouring event wrappers, so calls may be merged or dropped -- confirm `xed` is side-effect free
+		return dest;
+	}
+
+	INLINE void nmc_xes(uint64_t eventDest) { // Emits `xes` with eventDest as its only operand.
+		asm volatile("xes %0" : : "r" (eventDest));
+	}
+	INLINE void nmc_xes_nf(uint64_t _eventDest) { // Emits `xes.nf` with _eventDest as its only operand.
+		REGISTER uint64_t eventDest = _eventDest; // local copy has no ASMR tag, so no particular register is requested
+		asm volatile("xes.nf %0" : : "r" (eventDest));
+	}
+
+	INLINE void nmc_xes_d(uint64_t eventDest, uint64_t eventData) { // Emits `xes.d` with operands (eventDest, eventData).
+		asm volatile("xes.d %0,%1" : : "r" (eventDest), "r" (eventData));
+	}
+	INLINE void nmc_xes_d_nf(uint64_t eventDest, uint64_t eventData) { // Emits `xes.d.nf` with operands (eventDest, eventData).
+		asm volatile("xes.d.nf %0,%1" : : "r" (eventDest), "r" (eventData));
+	}
+
+	INLINE void nmc_xeb(uint64_t eventNum, uint64_t Chan) { // Emits `xeb` with a single operand that packs Chan above bit 8 and eventNum below it.
+		const uint64_t packedDest = eventNum | (Chan << 8); // eventNum is OR-ed in unmasked
+		asm volatile("xeb %0" : : "r" (packedDest));
+	}
+	INLINE void nmc_xeb_nf(uint64_t eventNum, uint64_t Chan) { // Emits `xeb.nf` with a single operand that packs Chan above bit 8 and eventNum below it.
+		const uint64_t packedDest = eventNum | (Chan << 8); // eventNum is OR-ed in unmasked
+		asm volatile("xeb.nf %0" : : "r" (packedDest));
+	}
+
+	INLINE void nmc_xeb_d(uint64_t eventNum, uint64_t Chan, uint64_t eventData) { // Emits `xeb.d` with the packed (Chan, eventNum) destination and eventData.
+		const uint64_t packedDest = eventNum | (Chan << 8); // Chan sits above bit 8; eventNum is OR-ed in unmasked
+		asm volatile("xeb.d %0,%1" : : "r" (packedDest), "r" (eventData));
+	}
+	INLINE void nmc_xeb_d_nf(uint64_t eventNum, uint64_t Chan, uint64_t eventData) { // Emits `xeb.d.nf` with the packed (Chan, eventNum) destination and eventData.
+		const uint64_t packedDest = eventNum | (Chan << 8); // Chan sits above bit 8; eventNum is OR-ed in unmasked
+		asm volatile("xeb.d.nf %0,%1" : : "r" (packedDest), "r" (eventData));
+	}
+
+	INLINE uint64_t nmc_xel(uint64_t eventMask) { // Emits `xel` on eventMask and returns the instruction's output operand.
+		uint64_t rtnMask;
+		asm volatile("xel %0,%1" : "=r" (rtnMask) : "r" (eventMask)); // %0 = result, %1 = eventMask
+		return rtnMask;
+	}
+	INLINE uint64_t nmc_xel_nb(uint64_t eventMask) { // Emits `xel.nb` on eventMask and returns the instruction's output operand.
+		uint64_t rtnMask;
+		asm volatile("xel.nb %0,%1" : "=r" (rtnMask) : "r" (eventMask)); // %0 = result, %1 = eventMask
+		return rtnMask;
+	}
+
+	INLINE void nmc_xer(uint64_t eventNum) { // Emits `xer` with eventNum as its only operand.
+		asm volatile("xer %0" : : "r" (eventNum));
+	}
+	INLINE void nmc_xer_ni(uint64_t eventNum) { // Emits `xer.ni` with eventNum as its only operand.
+		asm volatile("xer.ni %0" : : "r" (eventNum));
+	}
+
+	INLINE uint64_t nmc_xer_d(uint64_t eventNum) { // Emits `xer.d` on eventNum and returns the instruction's output operand.
+		uint64_t evData;
+		asm volatile("xer.d %0,%1" : "=r" (evData) : "r" (eventNum)); // %0 = result, %1 = eventNum
+		return evData;
+	}
+	INLINE uint64_t nmc_xer_d_ni(uint64_t eventNum) { // Emits `xer.d.ni` on eventNum and returns the instruction's output operand.
+		uint64_t evData;
+		asm volatile("xer.d.ni %0,%1" : "=r" (evData) : "r" (eventNum)); // %0 = result, %1 = eventNum
+		return evData;
+	}
+
+	// Atomic memory operations
+
+	// AMOADD
+
+	INLINE int32_t nmc_amoadd_w(volatile int32_t *addr, int32_t din) { // Emits `amoadd.w` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoadd.w  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoadd_w_aq(volatile int32_t *addr, int32_t din) { // Emits `amoadd.w.aq` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoadd.w.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoadd_w_rl(volatile int32_t *addr, int32_t din) { // Emits `amoadd.w.rl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoadd.w.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoadd_w_aqrl(volatile int32_t *addr, int32_t din) { // Emits `amoadd.w.aqrl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoadd.w.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+
+	INLINE void nmc_amoadd_w_nb(volatile int32_t *addr, int32_t din) { // Emits `amoadd.w.nr` on (addr, din); this form has no destination operand.
+		asm volatile ("amoadd.w.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoadd_w_aq_nb(volatile int32_t *addr, int32_t din) { // Emits `amoadd.w.nr.aq` on (addr, din); this form has no destination operand.
+		asm volatile ("amoadd.w.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoadd_w_rl_nb(volatile int32_t *addr, int32_t din) { // Emits `amoadd.w.nr.rl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoadd.w.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoadd_w_aqrl_nb(volatile int32_t *addr, int32_t din) { // Emits `amoadd.w.nr.aqrl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoadd.w.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+
+	INLINE int64_t nmc_amoadd_d(volatile int64_t *addr, int64_t din) { // Emits `amoadd.d` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoadd.d  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoadd_d_aq(volatile int64_t *addr, int64_t din) { // Emits `amoadd.d.aq` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoadd.d.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoadd_d_rl(volatile int64_t *addr, int64_t din) { // Emits `amoadd.d.rl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoadd.d.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoadd_d_aqrl(volatile int64_t *addr, int64_t din) { // Emits `amoadd.d.aqrl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoadd.d.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+
+	INLINE void nmc_amoadd_d_nb(volatile int64_t *addr, int64_t din) { // Emits `amoadd.d.nr` on (addr, din); this form has no destination operand.
+		asm volatile ("amoadd.d.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoadd_d_aq_nb(volatile int64_t *addr, int64_t din) { // Emits `amoadd.d.nr.aq` on (addr, din); this form has no destination operand.
+		asm volatile ("amoadd.d.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoadd_d_rl_nb(volatile int64_t *addr, int64_t din) { // Emits `amoadd.d.nr.rl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoadd.d.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoadd_d_aqrl_nb(volatile int64_t *addr, int64_t din) { // Emits `amoadd.d.nr.aqrl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoadd.d.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+
+	// AMOXOR
+
+	INLINE int32_t nmc_amoxor_w(volatile int32_t *addr, int32_t din) { // Emits `amoxor.w` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoxor.w  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoxor_w_aq(volatile int32_t *addr, int32_t din) { // Emits `amoxor.w.aq` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoxor.w.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoxor_w_rl(volatile int32_t *addr, int32_t din) { // Emits `amoxor.w.rl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoxor.w.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoxor_w_aqrl(volatile int32_t *addr, int32_t din) { // Emits `amoxor.w.aqrl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoxor.w.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+
+	INLINE void nmc_amoxor_w_nb(volatile int32_t *addr, int32_t din) { // Emits `amoxor.w.nr` on (addr, din); this form has no destination operand.
+		asm volatile ("amoxor.w.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoxor_w_aq_nb(volatile int32_t *addr, int32_t din) { // Emits `amoxor.w.nr.aq` on (addr, din); this form has no destination operand.
+		asm volatile ("amoxor.w.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoxor_w_rl_nb(volatile int32_t *addr, int32_t din) { // Emits `amoxor.w.nr.rl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoxor.w.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoxor_w_aqrl_nb(volatile int32_t *addr, int32_t din) { // Emits `amoxor.w.nr.aqrl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoxor.w.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+
+	INLINE int64_t nmc_amoxor_d(volatile int64_t *addr, int64_t din) { // Emits `amoxor.d` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoxor.d  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoxor_d_aq(volatile int64_t *addr, int64_t din) { // Emits `amoxor.d.aq` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoxor.d.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoxor_d_rl(volatile int64_t *addr, int64_t din) { // Emits `amoxor.d.rl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoxor.d.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoxor_d_aqrl(volatile int64_t *addr, int64_t din) { // Emits `amoxor.d.aqrl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoxor.d.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+
+	INLINE void nmc_amoxor_d_nb(volatile int64_t *addr, int64_t din) { // Emits `amoxor.d.nr` on (addr, din); this form has no destination operand.
+		asm volatile ("amoxor.d.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoxor_d_aq_nb(volatile int64_t *addr, int64_t din) { // Emits `amoxor.d.nr.aq` on (addr, din); this form has no destination operand.
+		asm volatile ("amoxor.d.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoxor_d_rl_nb(volatile int64_t *addr, int64_t din) { // Emits `amoxor.d.nr.rl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoxor.d.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoxor_d_aqrl_nb(volatile int64_t *addr, int64_t din) { // Emits `amoxor.d.nr.aqrl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoxor.d.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+
+	// AMOOR
+
+	INLINE int32_t nmc_amoor_w(volatile int32_t *addr, int32_t din) { // Emits `amoor.w` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoor.w  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoor_w_aq(volatile int32_t *addr, int32_t din) { // Emits `amoor.w.aq` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoor.w.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoor_w_rl(volatile int32_t *addr, int32_t din) { // Emits `amoor.w.rl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoor.w.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoor_w_aqrl(volatile int32_t *addr, int32_t din) { // Emits `amoor.w.aqrl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoor.w.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+
+	INLINE void nmc_amoor_w_nb(volatile int32_t *addr, int32_t din) { // Emits `amoor.w.nr` on (addr, din); this form has no destination operand.
+		asm volatile ("amoor.w.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoor_w_aq_nb(volatile int32_t *addr, int32_t din) { // Emits `amoor.w.nr.aq` on (addr, din); this form has no destination operand.
+		asm volatile ("amoor.w.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoor_w_rl_nb(volatile int32_t *addr, int32_t din) { // Emits `amoor.w.nr.rl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoor.w.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoor_w_aqrl_nb(volatile int32_t *addr, int32_t din) { // Emits `amoor.w.nr.aqrl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoor.w.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+
+	INLINE int64_t nmc_amoor_d(volatile int64_t *addr, int64_t din) { // Emits `amoor.d` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoor.d  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoor_d_aq(volatile int64_t *addr, int64_t din) { // Emits `amoor.d.aq` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoor.d.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoor_d_rl(volatile int64_t *addr, int64_t din) { // Emits `amoor.d.rl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoor.d.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoor_d_aqrl(volatile int64_t *addr, int64_t din) { // Emits `amoor.d.aqrl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoor.d.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+
+	INLINE void nmc_amoor_d_nb(volatile int64_t *addr, int64_t din) { // Emits `amoor.d.nr` on (addr, din); this form has no destination operand.
+		asm volatile ("amoor.d.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoor_d_aq_nb(volatile int64_t *addr, int64_t din) { // Emits `amoor.d.nr.aq` on (addr, din); this form has no destination operand.
+		asm volatile ("amoor.d.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoor_d_rl_nb(volatile int64_t *addr, int64_t din) { // Emits `amoor.d.nr.rl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoor.d.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoor_d_aqrl_nb(volatile int64_t *addr, int64_t din) { // Emits `amoor.d.nr.aqrl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoor.d.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+
+	// AMOAND
+
+	INLINE int32_t nmc_amoand_w(volatile int32_t *addr, int32_t din) { // Emits `amoand.w` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoand.w  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoand_w_aq(volatile int32_t *addr, int32_t din) { // Emits `amoand.w.aq` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoand.w.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoand_w_rl(volatile int32_t *addr, int32_t din) { // Emits `amoand.w.rl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoand.w.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoand_w_aqrl(volatile int32_t *addr, int32_t din) { // Emits `amoand.w.aqrl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amoand.w.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+
+	INLINE void nmc_amoand_w_nb(volatile int32_t *addr, int32_t din) { // Emits `amoand.w.nr` on (addr, din); this form has no destination operand.
+		asm volatile ("amoand.w.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoand_w_aq_nb(volatile int32_t *addr, int32_t din) { // Emits `amoand.w.nr.aq` on (addr, din); this form has no destination operand.
+		asm volatile ("amoand.w.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoand_w_rl_nb(volatile int32_t *addr, int32_t din) { // Emits `amoand.w.nr.rl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoand.w.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoand_w_aqrl_nb(volatile int32_t *addr, int32_t din) { // Emits `amoand.w.nr.aqrl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoand.w.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+
+	INLINE int64_t nmc_amoand_d(volatile int64_t *addr, int64_t din) { // Emits `amoand.d` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoand.d  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoand_d_aq(volatile int64_t *addr, int64_t din) { // Emits `amoand.d.aq` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoand.d.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoand_d_rl(volatile int64_t *addr, int64_t din) { // Emits `amoand.d.rl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoand.d.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoand_d_aqrl(volatile int64_t *addr, int64_t din) { // Emits `amoand.d.aqrl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amoand.d.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+
+	INLINE void nmc_amoand_d_nb(volatile int64_t *addr, int64_t din) { // Emits `amoand.d.nr` on (addr, din); this form has no destination operand.
+		asm volatile ("amoand.d.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoand_d_aq_nb(volatile int64_t *addr, int64_t din) { // Emits `amoand.d.nr.aq` on (addr, din); this form has no destination operand.
+		asm volatile ("amoand.d.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoand_d_rl_nb(volatile int64_t *addr, int64_t din) { // Emits `amoand.d.nr.rl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoand.d.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amoand_d_aqrl_nb(volatile int64_t *addr, int64_t din) { // Emits `amoand.d.nr.aqrl` on (addr, din); this form has no destination operand.
+		asm volatile ("amoand.d.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+
+	// AMOMIN
+
+	INLINE int32_t nmc_amomin32(volatile int32_t *addr, int32_t din) { // Emits `amomin.w` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout; // NOTE(review): the function name departs from the `_w` suffix its siblings use; kept as-is for existing callers
+		asm volatile ("amomin.w  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amomin_w_aq(volatile int32_t *addr, int32_t din) { // Emits `amomin.w.aq` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amomin.w.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amomin_w_rl(volatile int32_t *addr, int32_t din) { // Emits `amomin.w.rl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amomin.w.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int32_t nmc_amomin_w_aqrl(volatile int32_t *addr, int32_t din) { // Emits `amomin.w.aqrl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int32_t dout;
+		asm volatile ("amomin.w.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+
+	INLINE void nmc_amomin_w_nb(volatile int32_t *addr, int32_t din) { // Emits `amomin.w.nr` on (addr, din); this form has no destination operand.
+		asm volatile ("amomin.w.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amomin_w_aq_nb(volatile int32_t *addr, int32_t din) { // Emits `amomin.w.nr.aq` on (addr, din); this form has no destination operand.
+		asm volatile ("amomin.w.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amomin_w_rl_nb(volatile int32_t *addr, int32_t din) { // Emits `amomin.w.nr.rl` on (addr, din); this form has no destination operand.
+		asm volatile ("amomin.w.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amomin_w_aqrl_nb(volatile int32_t *addr, int32_t din) { // Emits `amomin.w.nr.aqrl` on (addr, din); this form has no destination operand.
+		asm volatile ("amomin.w.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+
+	INLINE int64_t nmc_amomin_d(volatile int64_t *addr, int64_t din) { // Emits `amomin.d` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amomin.d  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amomin_d_aq(volatile int64_t *addr, int64_t din) { // Emits `amomin.d.aq` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amomin.d.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amomin_d_rl(volatile int64_t *addr, int64_t din) { // Emits `amomin.d.rl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amomin.d.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+	INLINE int64_t nmc_amomin_d_aqrl(volatile int64_t *addr, int64_t din) { // Emits `amomin.d.aqrl` on (addr, din) and returns the value written to its destination register.
+		REGISTER int64_t dout;
+		asm volatile ("amomin.d.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+		return dout;
+	}
+
+	INLINE void nmc_amomin_d_nb(volatile int64_t *addr, int64_t din) { // Emits `amomin.d.nr` on (addr, din); this form has no destination operand.
+		asm volatile ("amomin.d.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory"); // "memory": *addr is accessed without being listed as an operand
+	}
+	INLINE void nmc_amomin_d_aq_nb(volatile int64_t *addr, int64_t din) {
+		asm volatile ("amomin.d.nr.aq  %1,(%0)" : : "r" (addr), "r" (din));
+	}
+	INLINE void nmc_amomin_d_rl_nb(volatile int64_t *addr, int64_t din) {
+		asm volatile ("amomin.d.nr.rl  %1,(%0)" : : "r" (addr), "r" (din));
+	}
+	INLINE void nmc_amomin_d_aqrl_nb(volatile int64_t *addr, int64_t din) {
+		asm volatile ("amomin.d.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din));
+	}
+
+	// AMOMAX
+
+	INLINE int32_t nmc_amomax_w(volatile int32_t *addr, int32_t din) {	// amomax.w (RISC-V atomic signed-max AMO) on *addr with din; returns the AMO's destination register
+		REGISTER int32_t dout;
+		asm volatile ("amomax.w  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE int32_t nmc_amomax_w_aq(volatile int32_t *addr, int32_t din) {	// same operation, .aq suffix
+		REGISTER int32_t dout;
+		asm volatile ("amomax.w.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE int32_t nmc_amomax_w_rl(volatile int32_t *addr, int32_t din) {	// same operation, .rl suffix
+		REGISTER int32_t dout;
+		asm volatile ("amomax.w.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE int32_t nmc_amomax_w_aqrl(volatile int32_t *addr, int32_t din) {	// same operation, .aqrl suffix
+		REGISTER int32_t dout;
+		asm volatile ("amomax.w.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amomax_w_nb(volatile int32_t *addr, int32_t din) {	// amomax.w.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amomax.w.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amomax_w_aq_nb(volatile int32_t *addr, int32_t din) {	// .nr.aq form
+		asm volatile ("amomax.w.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amomax_w_rl_nb(volatile int32_t *addr, int32_t din) {	// .nr.rl form
+		asm volatile ("amomax.w.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amomax_w_aqrl_nb(volatile int32_t *addr, int32_t din) {	// .nr.aqrl form
+		asm volatile ("amomax.w.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+
+	INLINE int64_t nmc_amomax_d(volatile int64_t *addr, int64_t din) {	// amomax.d (RISC-V atomic signed-max AMO) on *addr with din; returns the AMO's destination register
+		REGISTER int64_t dout;
+		asm volatile ("amomax.d  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE int64_t nmc_amomax_d_aq(volatile int64_t *addr, int64_t din) {	// same operation, .aq suffix
+		REGISTER int64_t dout;
+		asm volatile ("amomax.d.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE int64_t nmc_amomax_d_rl(volatile int64_t *addr, int64_t din) {	// same operation, .rl suffix
+		REGISTER int64_t dout;
+		asm volatile ("amomax.d.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE int64_t nmc_amomax_d_aqrl(volatile int64_t *addr, int64_t din) {	// same operation, .aqrl suffix
+		REGISTER int64_t dout;
+		asm volatile ("amomax.d.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amomax_d_nb(volatile int64_t *addr, int64_t din) {	// amomax.d.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amomax.d.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amomax_d_aq_nb(volatile int64_t *addr, int64_t din) {	// .nr.aq form
+		asm volatile ("amomax.d.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amomax_d_rl_nb(volatile int64_t *addr, int64_t din) {	// .nr.rl form
+		asm volatile ("amomax.d.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amomax_d_aqrl_nb(volatile int64_t *addr, int64_t din) {	// .nr.aqrl form
+		asm volatile ("amomax.d.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+
+	// AMOMINU
+
+	INLINE uint32_t nmc_amominu_w(volatile uint32_t *addr, uint32_t din) {	// amominu.w (RISC-V atomic unsigned-min AMO) on *addr with din; returns the AMO's destination register
+		REGISTER uint32_t dout;
+		asm volatile ("amominu.w  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE uint32_t nmc_amominu_w_aq(volatile uint32_t *addr, uint32_t din) {	// same operation, .aq suffix
+		REGISTER uint32_t dout;
+		asm volatile ("amominu.w.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE uint32_t nmc_amominu_w_rl(volatile uint32_t *addr, uint32_t din) {	// same operation, .rl suffix
+		REGISTER uint32_t dout;
+		asm volatile ("amominu.w.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE uint32_t nmc_amominu_w_aqrl(volatile uint32_t *addr, uint32_t din) {	// same operation, .aqrl suffix
+		REGISTER uint32_t dout;
+		asm volatile ("amominu.w.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amominu_w_nb(volatile uint32_t *addr, uint32_t din) {	// amominu.w.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amominu.w.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amominu_w_aq_nb(volatile uint32_t *addr, uint32_t din) {	// .nr.aq form
+		asm volatile ("amominu.w.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amominu_w_rl_nb(volatile uint32_t *addr, uint32_t din) {	// .nr.rl form
+		asm volatile ("amominu.w.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amominu_w_aqrl_nb(volatile uint32_t *addr, uint32_t din) {	// .nr.aqrl form
+		asm volatile ("amominu.w.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+
+	INLINE uint64_t nmc_amominu_d(volatile uint64_t *addr, uint64_t din) {	// amominu.d (RISC-V atomic unsigned-min AMO) on *addr with din; returns the AMO's destination register
+		REGISTER uint64_t dout;
+		asm volatile ("amominu.d  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE uint64_t nmc_amominu_d_aq(volatile uint64_t *addr, uint64_t din) {	// same operation, .aq suffix
+		REGISTER uint64_t dout;
+		asm volatile ("amominu.d.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE uint64_t nmc_amominu_d_rl(volatile uint64_t *addr, uint64_t din) {	// same operation, .rl suffix
+		REGISTER uint64_t dout;
+		asm volatile ("amominu.d.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE uint64_t nmc_amominu_d_aqrl(volatile uint64_t *addr, uint64_t din) {	// same operation, .aqrl suffix
+		REGISTER uint64_t dout;
+		asm volatile ("amominu.d.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amominu_d_nb(volatile uint64_t *addr, uint64_t din) {	// amominu.d.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amominu.d.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amominu_d_aq_nb(volatile uint64_t *addr, uint64_t din) {	// .nr.aq form
+		asm volatile ("amominu.d.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amominu_d_rl_nb(volatile uint64_t *addr, uint64_t din) {	// .nr.rl form
+		asm volatile ("amominu.d.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amominu_d_aqrl_nb(volatile uint64_t *addr, uint64_t din) {	// .nr.aqrl form
+		asm volatile ("amominu.d.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+
+	// AMOMAXU
+
+	INLINE uint32_t nmc_amomaxu_w(volatile uint32_t *addr, uint32_t din) {	// amomaxu.w (RISC-V atomic unsigned-max AMO) on *addr with din; returns the AMO's destination register
+		REGISTER uint32_t dout;
+		asm volatile ("amomaxu.w  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE uint32_t nmc_amomaxu_w_aq(volatile uint32_t *addr, uint32_t din) {	// same operation, .aq suffix
+		REGISTER uint32_t dout;
+		asm volatile ("amomaxu.w.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE uint32_t nmc_amomaxu_w_rl(volatile uint32_t *addr, uint32_t din) {	// same operation, .rl suffix
+		REGISTER uint32_t dout;
+		asm volatile ("amomaxu.w.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE uint32_t nmc_amomaxu_w_aqrl(volatile uint32_t *addr, uint32_t din) {	// same operation, .aqrl suffix
+		REGISTER uint32_t dout;
+		asm volatile ("amomaxu.w.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amomaxu_w_nb(volatile uint32_t *addr, uint32_t din) {	// amomaxu.w.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amomaxu.w.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amomaxu_w_aq_nb(volatile uint32_t *addr, uint32_t din) {	// .nr.aq form
+		asm volatile ("amomaxu.w.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amomaxu_w_rl_nb(volatile uint32_t *addr, uint32_t din) {	// .nr.rl form
+		asm volatile ("amomaxu.w.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amomaxu_w_aqrl_nb(volatile uint32_t *addr, uint32_t din) {	// .nr.aqrl form
+		asm volatile ("amomaxu.w.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+
+	INLINE uint64_t nmc_amomaxu_d(volatile uint64_t *addr, uint64_t din) {	// amomaxu.d (RISC-V atomic unsigned-max AMO) on *addr with din; returns the AMO's destination register
+		REGISTER uint64_t dout;
+		asm volatile ("amomaxu.d  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE uint64_t nmc_amomaxu_d_aq(volatile uint64_t *addr, uint64_t din) {	// same operation, .aq suffix
+		REGISTER uint64_t dout;
+		asm volatile ("amomaxu.d.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE uint64_t nmc_amomaxu_d_rl(volatile uint64_t *addr, uint64_t din) {	// same operation, .rl suffix
+		REGISTER uint64_t dout;
+		asm volatile ("amomaxu.d.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE uint64_t nmc_amomaxu_d_aqrl(volatile uint64_t *addr, uint64_t din) {	// same operation, .aqrl suffix
+		REGISTER uint64_t dout;
+		asm volatile ("amomaxu.d.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amomaxu_d_nb(volatile uint64_t *addr, uint64_t din) {	// amomaxu.d.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amomaxu.d.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amomaxu_d_aq_nb(volatile uint64_t *addr, uint64_t din) {	// .nr.aq form
+		asm volatile ("amomaxu.d.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amomaxu_d_rl_nb(volatile uint64_t *addr, uint64_t din) {	// .nr.rl form
+		asm volatile ("amomaxu.d.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amomaxu_d_aqrl_nb(volatile uint64_t *addr, uint64_t din) {	// .nr.aqrl form
+		asm volatile ("amomaxu.d.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+
+	// AMOSWAP
+
+	INLINE int32_t nmc_amoswap_w(volatile int32_t *addr, int32_t din) {	// amoswap.w (RISC-V atomic swap AMO) on *addr with din; returns the AMO's destination register
+		REGISTER int32_t dout;
+		asm volatile ("amoswap.w  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE int32_t nmc_amoswap_w_aq(volatile int32_t *addr, int32_t din) {	// same operation, .aq suffix
+		REGISTER int32_t dout;
+		asm volatile ("amoswap.w.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE int32_t nmc_amoswap_w_rl(volatile int32_t *addr, int32_t din) {	// same operation, .rl suffix
+		REGISTER int32_t dout;
+		asm volatile ("amoswap.w.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE int32_t nmc_amoswap_w_aqrl(volatile int32_t *addr, int32_t din) {	// same operation, .aqrl suffix
+		REGISTER int32_t dout;
+		asm volatile ("amoswap.w.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amoswap_w_nb(volatile int32_t *addr, int32_t din) {	// amoswap.w.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amoswap.w.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amoswap_w_aq_nb(volatile int32_t *addr, int32_t din) {	// .nr.aq form
+		asm volatile ("amoswap.w.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amoswap_w_rl_nb(volatile int32_t *addr, int32_t din) {	// .nr.rl form
+		asm volatile ("amoswap.w.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amoswap_w_aqrl_nb(volatile int32_t *addr, int32_t din) {	// .nr.aqrl form
+		asm volatile ("amoswap.w.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+
+	INLINE int64_t nmc_amoswap_d(volatile int64_t *addr, int64_t din) {	// amoswap.d (RISC-V atomic swap AMO) on *addr with din; returns the AMO's destination register
+		REGISTER int64_t dout;
+		asm volatile ("amoswap.d  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE int64_t nmc_amoswap_d_aq(volatile int64_t *addr, int64_t din) {	// same operation, .aq suffix
+		REGISTER int64_t dout;
+		asm volatile ("amoswap.d.aq  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE int64_t nmc_amoswap_d_rl(volatile int64_t *addr, int64_t din) {	// same operation, .rl suffix
+		REGISTER int64_t dout;
+		asm volatile ("amoswap.d.rl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+	INLINE int64_t nmc_amoswap_d_aqrl(volatile int64_t *addr, int64_t din) {	// same operation, .aqrl suffix
+		REGISTER int64_t dout;
+		asm volatile ("amoswap.d.aqrl  %0,%2,(%1)" : "=r" (dout) : "r" (addr), "r" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amoswap_d_nb(volatile int64_t *addr, int64_t din) {	// amoswap.d.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amoswap.d.nr  %1,(%0)" : : "r" (addr), "r" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amoswap_d_aq_nb(volatile int64_t *addr, int64_t din) {	// .nr.aq form
+		asm volatile ("amoswap.d.nr.aq  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amoswap_d_rl_nb(volatile int64_t *addr, int64_t din) {	// .nr.rl form
+		asm volatile ("amoswap.d.nr.rl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+	INLINE void nmc_amoswap_d_aqrl_nb(volatile int64_t *addr, int64_t din) {	// .nr.aqrl form
+		asm volatile ("amoswap.d.nr.aqrl  %1,(%0)" : : "r" (addr), "r" (din) : "memory");
+	}
+
+	// AMOCAS
+
+	INLINE int32_t nmc_amocas_w(volatile int32_t *addr, int32_t din1, int32_t din2) {	// amocas.w AMO on *addr with din1, din2 (which one is the compare value vs. the new value is not visible here - confirm); returns the AMO's destination register
+		REGISTER int32_t dout;
+		asm volatile ("amocas.w  %0,%2,%3,(%1)" : "=r" (dout) : "r" (addr), "r" (din1), "r" (din2) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE int32_t nmc_amocas_w_aq(volatile int32_t *addr, int32_t din1, int32_t din2) {	// same operation, .aq suffix
+		REGISTER int32_t dout;
+		asm volatile ("amocas.w.aq  %0,%2,%3,(%1)" : "=r" (dout) : "r" (addr), "r" (din1), "r" (din2) : "memory");
+		return dout;
+	}
+	INLINE int32_t nmc_amocas_w_rl(volatile int32_t *addr, int32_t din1, int32_t din2) {	// same operation, .rl suffix
+		REGISTER int32_t dout;
+		asm volatile ("amocas.w.rl  %0,%2,%3,(%1)" : "=r" (dout) : "r" (addr), "r" (din1), "r" (din2) : "memory");
+		return dout;
+	}
+	INLINE int32_t nmc_amocas_w_aqrl(volatile int32_t *addr, int32_t din1, int32_t din2) {	// same operation, .aqrl suffix
+		REGISTER int32_t dout;
+		asm volatile ("amocas.w.aqrl  %0,%2,%3,(%1)" : "=r" (dout) : "r" (addr), "r" (din1), "r" (din2) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amocas_w_nb(volatile int32_t *addr, int32_t din1, int32_t din2) {	// amocas.w.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amocas.w.nr  %1,%2,(%0)" : : "r" (addr), "r" (din1), "r" (din2) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amocas_w_aq_nb(volatile int32_t *addr, int32_t din1, int32_t din2) {	// .nr.aq form
+		asm volatile ("amocas.w.nr.aq  %1,%2,(%0)" : : "r" (addr), "r" (din1), "r" (din2) : "memory");
+	}
+	INLINE void nmc_amocas_w_rl_nb(volatile int32_t *addr, int32_t din1, int32_t din2) {	// .nr.rl form
+		asm volatile ("amocas.w.nr.rl  %1,%2,(%0)" : : "r" (addr), "r" (din1), "r" (din2) : "memory");
+	}
+	INLINE void nmc_amocas_w_aqrl_nb(volatile int32_t *addr, int32_t din1, int32_t din2) {	// .nr.aqrl form
+		asm volatile ("amocas.w.nr.aqrl  %1,%2,(%0)" : : "r" (addr), "r" (din1), "r" (din2) : "memory");
+	}
+
+	INLINE int64_t nmc_amocas_d(volatile int64_t *addr, int64_t din1, int64_t din2) {	// amocas.d AMO on *addr with din1, din2 (which one is the compare value vs. the new value is not visible here - confirm); returns the AMO's destination register
+		REGISTER int64_t dout;
+		asm volatile ("amocas.d  %0,%2,%3,(%1)" : "=r" (dout) : "r" (addr), "r" (din1), "r" (din2) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE int64_t nmc_amocas_d_aq(volatile int64_t *addr, int64_t din1, int64_t din2) {	// same operation, .aq suffix
+		REGISTER int64_t dout;
+		asm volatile ("amocas.d.aq  %0,%2,%3,(%1)" : "=r" (dout) : "r" (addr), "r" (din1), "r" (din2) : "memory");
+		return dout;
+	}
+	INLINE int64_t nmc_amocas_d_rl(volatile int64_t *addr, int64_t din1, int64_t din2) {	// same operation, .rl suffix
+		REGISTER int64_t dout;
+		asm volatile ("amocas.d.rl  %0,%2,%3,(%1)" : "=r" (dout) : "r" (addr), "r" (din1), "r" (din2) : "memory");
+		return dout;
+	}
+	INLINE int64_t nmc_amocas_d_aqrl(volatile int64_t *addr, int64_t din1, int64_t din2) {	// same operation, .aqrl suffix
+		REGISTER int64_t dout;
+		asm volatile ("amocas.d.aqrl  %0,%2,%3,(%1)" : "=r" (dout) : "r" (addr), "r" (din1), "r" (din2) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amocas_d_nb(volatile int64_t *addr, int64_t din1, int64_t din2) {	// amocas.d.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amocas.d.nr  %1,%2,(%0)" : : "r" (addr), "r" (din1), "r" (din2) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amocas_d_aq_nb(volatile int64_t *addr, int64_t din1, int64_t din2) {	// .nr.aq form
+		asm volatile ("amocas.d.nr.aq  %1,%2,(%0)" : : "r" (addr), "r" (din1), "r" (din2) : "memory");
+	}
+	INLINE void nmc_amocas_d_rl_nb(volatile int64_t *addr, int64_t din1, int64_t din2) {	// .nr.rl form
+		asm volatile ("amocas.d.nr.rl  %1,%2,(%0)" : : "r" (addr), "r" (din1), "r" (din2) : "memory");
+	}
+	INLINE void nmc_amocas_d_aqrl_nb(volatile int64_t *addr, int64_t din1, int64_t din2) {	// .nr.aqrl form
+		asm volatile ("amocas.d.nr.aqrl  %1,%2,(%0)" : : "r" (addr), "r" (din1), "r" (din2) : "memory");
+	}
+
+	// AMOFADD
+
+	INLINE float nmc_amofadd_s(volatile float *addr, float din) {	// amofadd.s AMO on *addr with din (data in "f" registers; presumably a floating-point atomic add - confirm); returns the AMO's destination register
+		float dout;
+		asm volatile ("amofadd.s  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE float nmc_amofadd_s_aq(volatile float *addr, float din) {	// same operation, .aq suffix
+		float dout;
+		asm volatile ("amofadd.s.aq  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE float nmc_amofadd_s_rl(volatile float *addr, float din) {	// same operation, .rl suffix
+		float dout;
+		asm volatile ("amofadd.s.rl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE float nmc_amofadd_s_aqrl(volatile float *addr, float din) {	// same operation, .aqrl suffix
+		float dout;
+		asm volatile ("amofadd.s.aqrl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amofadd_s_nb(volatile float *addr, float din) {	// amofadd.s.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amofadd.s.nr  %1,(%0)" : : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amofadd_s_aq_nb(volatile float *addr, float din) {	// .nr.aq form
+		asm volatile ("amofadd.s.nr.aq  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofadd_s_rl_nb(volatile float *addr, float din) {	// .nr.rl form
+		asm volatile ("amofadd.s.nr.rl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofadd_s_aqrl_nb(volatile float *addr, float din) {	// .nr.aqrl form
+		asm volatile ("amofadd.s.nr.aqrl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+
+	INLINE double nmc_amofadd_d(volatile double *addr, double din) {	// amofadd.d AMO on *addr with din (data in "f" registers; presumably a floating-point atomic add - confirm); returns the AMO's destination register
+		double dout;
+		asm volatile ("amofadd.d  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE double nmc_amofadd_d_aq(volatile double *addr, double din) {	// same operation, .aq suffix
+		double dout;
+		asm volatile ("amofadd.d.aq  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE double nmc_amofadd_d_rl(volatile double *addr, double din) {	// same operation, .rl suffix
+		double dout;
+		asm volatile ("amofadd.d.rl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE double nmc_amofadd_d_aqrl(volatile double *addr, double din) {	// same operation, .aqrl suffix
+		double dout;
+		asm volatile ("amofadd.d.aqrl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amofadd_d_nb(volatile double *addr, double din) {	// amofadd.d.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amofadd.d.nr  %1,(%0)" : : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amofadd_d_aq_nb(volatile double *addr, double din) {	// .nr.aq form
+		asm volatile ("amofadd.d.nr.aq  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofadd_d_rl_nb(volatile double *addr, double din) {	// .nr.rl form
+		asm volatile ("amofadd.d.nr.rl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofadd_d_aqrl_nb(volatile double *addr, double din) {	// .nr.aqrl form
+		asm volatile ("amofadd.d.nr.aqrl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+
+	// AMOFMIN
+
+	INLINE float nmc_amofmin_s(volatile float *addr, float din) {	// amofmin.s AMO on *addr with din (data in "f" registers; presumably a floating-point atomic minimum - confirm); returns the AMO's destination register
+		float dout;
+		asm volatile ("amofmin.s  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE float nmc_amofmin_s_aq(volatile float *addr, float din) {	// same operation, .aq suffix
+		float dout;
+		asm volatile ("amofmin.s.aq  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE float nmc_amofmin_s_rl(volatile float *addr, float din) {	// same operation, .rl suffix
+		float dout;
+		asm volatile ("amofmin.s.rl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE float nmc_amofmin_s_aqrl(volatile float *addr, float din) {	// same operation, .aqrl suffix
+		float dout;
+		asm volatile ("amofmin.s.aqrl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amofmin_s_nb(volatile float *addr, float din) {	// amofmin.s.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amofmin.s.nr  %1,(%0)" : : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amofmin_s_aq_nb(volatile float *addr, float din) {	// .nr.aq form
+		asm volatile ("amofmin.s.nr.aq  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofmin_s_rl_nb(volatile float *addr, float din) {	// .nr.rl form
+		asm volatile ("amofmin.s.nr.rl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofmin_s_aqrl_nb(volatile float *addr, float din) {	// .nr.aqrl form
+		asm volatile ("amofmin.s.nr.aqrl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+
+	INLINE double nmc_amofmin_d(volatile double *addr, double din) {	// amofmin.d AMO on *addr with din (data in "f" registers; presumably a floating-point atomic minimum - confirm); returns the AMO's destination register
+		double dout;
+		asm volatile ("amofmin.d  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE double nmc_amofmin_d_aq(volatile double *addr, double din) {	// same operation, .aq suffix
+		double dout;
+		asm volatile ("amofmin.d.aq  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE double nmc_amofmin_d_rl(volatile double *addr, double din) {	// same operation, .rl suffix
+		double dout;
+		asm volatile ("amofmin.d.rl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE double nmc_amofmin_d_aqrl(volatile double *addr, double din) {	// same operation, .aqrl suffix
+		double dout;
+		asm volatile ("amofmin.d.aqrl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amofmin_d_nb(volatile double *addr, double din) {	// amofmin.d.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amofmin.d.nr  %1,(%0)" : : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amofmin_d_aq_nb(volatile double *addr, double din) {	// .nr.aq form
+		asm volatile ("amofmin.d.nr.aq  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofmin_d_rl_nb(volatile double *addr, double din) {	// .nr.rl form
+		asm volatile ("amofmin.d.nr.rl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofmin_d_aqrl_nb(volatile double *addr, double din) {	// .nr.aqrl form
+		asm volatile ("amofmin.d.nr.aqrl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+
+	// AMOFMAX
+
+	INLINE float nmc_amofmax_s(volatile float *addr, float din) {	// amofmax.s AMO on *addr with din (data in "f" registers; presumably a floating-point atomic maximum - confirm); returns the AMO's destination register
+		float dout;
+		asm volatile ("amofmax.s  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE float nmc_amofmax_s_aq(volatile float *addr, float din) {	// same operation, .aq suffix
+		float dout;
+		asm volatile ("amofmax.s.aq  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE float nmc_amofmax_s_rl(volatile float *addr, float din) {	// same operation, .rl suffix
+		float dout;
+		asm volatile ("amofmax.s.rl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE float nmc_amofmax_s_aqrl(volatile float *addr, float din) {	// same operation, .aqrl suffix
+		float dout;
+		asm volatile ("amofmax.s.aqrl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amofmax_s_nb(volatile float *addr, float din) {	// amofmax.s.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amofmax.s.nr  %1,(%0)" : : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amofmax_s_aq_nb(volatile float *addr, float din) {	// .nr.aq form
+		asm volatile ("amofmax.s.nr.aq  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofmax_s_rl_nb(volatile float *addr, float din) {	// .nr.rl form
+		asm volatile ("amofmax.s.nr.rl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofmax_s_aqrl_nb(volatile float *addr, float din) {	// .nr.aqrl form
+		asm volatile ("amofmax.s.nr.aqrl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+
+	INLINE double nmc_amofmax_d(volatile double *addr, double din) {	// amofmax.d AMO on *addr with din (data in "f" registers; presumably a floating-point atomic maximum - confirm); returns the AMO's destination register
+		double dout;
+		asm volatile ("amofmax.d  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+		return dout;
+	}
+	INLINE double nmc_amofmax_d_aq(volatile double *addr, double din) {	// same operation, .aq suffix
+		double dout;
+		asm volatile ("amofmax.d.aq  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE double nmc_amofmax_d_rl(volatile double *addr, double din) {	// same operation, .rl suffix
+		double dout;
+		asm volatile ("amofmax.d.rl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+	INLINE double nmc_amofmax_d_aqrl(volatile double *addr, double din) {	// same operation, .aqrl suffix
+		double dout;
+		asm volatile ("amofmax.d.aqrl  %0,%2,(%1)" : "=f" (dout) : "r" (addr), "f" (din) : "memory");
+		return dout;
+	}
+
+	INLINE void nmc_amofmax_d_nb(volatile double *addr, double din) {	// amofmax.d.nr form: the template has no destination register, so nothing is returned
+		asm volatile ("amofmax.d.nr  %1,(%0)" : : "r" (addr), "f" (din) : "memory");	// "memory": *addr is accessed but addr is only an "r" operand
+	}
+	INLINE void nmc_amofmax_d_aq_nb(volatile double *addr, double din) {	// .nr.aq form
+		asm volatile ("amofmax.d.nr.aq  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofmax_d_rl_nb(volatile double *addr, double din) {	// .nr.rl form
+		asm volatile ("amofmax.d.nr.rl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+	INLINE void nmc_amofmax_d_aqrl_nb(volatile double *addr, double din) {	// .nr.aqrl form
+		asm volatile ("amofmax.d.nr.aqrl  %1,(%0)" : : "r" (addr), "f" (din) : "memory");
+	}
+
+	// Load and store
+
+	INLINE void nmc_sb_nt(volatile uint8_t *_addr, uint8_t _din) {	// issues sb.nt: byte store of _din to *_addr (".nt" is presumably "non-temporal" - confirm)
+		asm volatile ("sb.nt %0,0(%1)" : : "r" (_din), "r" (_addr));	// %0 = data register, %1 = base address, offset 0
+	}
+	INLINE void nmc_sh_nt(volatile uint16_t *_addr, uint16_t _din) {	// sh.nt: 16-bit store of _din to *_addr
+		asm volatile ("sh.nt %0,0(%1)" : : "r" (_din), "r" (_addr));
+	}
+	INLINE void nmc_sw_nt(volatile uint32_t *_addr, uint32_t _din) {	// sw.nt: 32-bit store of _din to *_addr
+		asm volatile ("sw.nt %0,0(%1)" : : "r" (_din), "r" (_addr));
+	}
+	INLINE void nmc_sd_nt(volatile uint64_t *_addr, int64_t _din) {	// sd.nt: 64-bit store; NOTE(review): _din is int64_t while *_addr is uint64_t, unlike the sb/sh/sw wrappers - confirm intent
+		asm volatile ("sd.nt %0,0(%1)" : : "r" (_din), "r" (_addr));
+	}
+#if defined(__GNUC__) && !defined(__llvm__)	// inline FP stores only when __GNUC__ is defined and __llvm__ is not
+	INLINE void nmc_fsw_nt(volatile float *addr, float din) {	// fsw.nt: single-precision store of din to *addr
+		asm volatile ("fsw.nt %0,0(%1)" : : "f" (din), "r" (addr));	// "f": din is passed in a floating-point register
+	}
+	INLINE void nmc_fsd_nt(volatile double *addr, double din) {	// fsd.nt: double-precision store of din to *addr
+		asm volatile ("fsd.nt %0,0(%1)" : : "f" (din), "r" (addr));
+	}
+#else	// otherwise only external declarations are provided
+	extern void nmc_fstore32_nt(volatile float *addr, float din);	// NOTE(review): named nmc_fstore32_nt, not nmc_fsw_nt as in the inline branch - confirm which name callers use
+	extern void nmc_fstore64_nt(volatile double *addr, double din);	// NOTE(review): likewise differs from nmc_fsd_nt
+#endif
+
+	INLINE int8_t nmc_lb_nt(volatile int8_t *_addr) {	// lb.nt: loads an int8_t from *_addr (offset 0) and returns it
+		int8_t loaded;
+		asm volatile ("lb.nt %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE int16_t nmc_lh_nt(volatile int16_t *_addr) {	// lh.nt: int16_t load
+		int16_t loaded;
+		asm volatile ("lh.nt %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE int32_t nmc_lw_nt(volatile int32_t *_addr) {	// lw.nt: int32_t load
+		int32_t loaded;
+		asm volatile ("lw.nt %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE int64_t nmc_ld_nt(volatile int64_t *_addr) {	// ld.nt: int64_t load
+		int64_t loaded;
+		asm volatile ("ld.nt %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE uint8_t nmc_lbu_nt(volatile uint8_t *_addr) {	// lbu.nt: uint8_t load
+		uint8_t loaded;
+		asm volatile ("lbu.nt %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE uint16_t nmc_lhu_nt(volatile uint16_t *_addr) {	// lhu.nt: uint16_t load
+		uint16_t loaded;
+		asm volatile ("lhu.nt %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE uint32_t nmc_lwu_nt(volatile uint32_t *_addr) {	// lwu.nt: uint32_t load
+		uint32_t loaded;
+		asm volatile ("lwu.nt %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+#if defined(__GNUC__) && !defined(__llvm__)	// inline FP loads only when __GNUC__ is defined and __llvm__ is not
+	INLINE float nmc_flw_nt(volatile float *_addr) {	// flw.nt: loads a float from *_addr into an "f" register and returns it
+		float loaded;
+		asm volatile ("flw.nt %0,0(%1)" : "=f" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE double nmc_fld_nt(volatile double *_addr) {	// fld.nt: double load
+		double loaded;
+		asm volatile ("fld.nt %0,0(%1)" : "=f" (loaded) : "r" (_addr));
+		return loaded;
+	}
+#else	// otherwise the same two names are only declared, to be defined elsewhere
+	extern float nmc_flw_nt(volatile float *_addr);
+	extern double nmc_fld_nt(volatile double *_addr);
+#endif
+
+	// Cache line loads
+
+
+	INLINE int8_t nmc_lb_sp(volatile int8_t *_addr) {	// lb.sp: loads an int8_t from *_addr (offset 0) with the .sp instruction variant and returns it
+		int8_t loaded;
+		asm volatile ("lb.sp %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE int16_t nmc_lh_sp(volatile int16_t *_addr) {	// lh.sp: int16_t load
+		int16_t loaded;
+		asm volatile ("lh.sp %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE int32_t nmc_lw_sp(volatile int32_t *_addr) {	// lw.sp: int32_t load
+		int32_t loaded;
+		asm volatile ("lw.sp %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE int64_t nmc_ld_sp(volatile int64_t *_addr) {	// ld.sp: int64_t load
+		int64_t loaded;
+		asm volatile ("ld.sp %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE uint8_t nmc_lbu_sp(volatile uint8_t *_addr) {	// lbu.sp: uint8_t load
+		uint8_t loaded;
+		asm volatile ("lbu.sp %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE uint16_t nmc_lhu_sp(volatile uint16_t *_addr) {	// lhu.sp: uint16_t load
+		uint16_t loaded;
+		asm volatile ("lhu.sp %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE uint32_t nmc_lwu_sp(volatile uint32_t *_addr) {	// lwu.sp: uint32_t load
+		uint32_t loaded;
+		asm volatile ("lwu.sp %0,0(%1)" : "=r" (loaded) : "r" (_addr));
+		return loaded;
+	}
+#if defined(__GNUC__) && !defined(__llvm__)	// inline FP loads only when __GNUC__ is defined and __llvm__ is not
+	INLINE float nmc_flw_sp(volatile float *_addr) {	// flw.sp: loads a float from *_addr into an "f" register and returns it
+		float loaded;
+		asm volatile ("flw.sp %0,0(%1)" : "=f" (loaded) : "r" (_addr));
+		return loaded;
+	}
+	INLINE double nmc_fld_sp(volatile double *_addr) {	// fld.sp: double load
+		double loaded;
+		asm volatile ("fld.sp %0,0(%1)" : "=f" (loaded) : "r" (_addr));
+		return loaded;
+	}
+#else	// otherwise the same two names are only declared, to be defined elsewhere
+	extern float nmc_flw_sp(volatile float *_addr);
+	extern double nmc_fld_sp(volatile double *_addr);
+#endif
+
+	// ID
+
+	INLINE uint64_t nmc_xid_dev(void) {	// returns the value produced by the "xid.dev" instruction
+		REGISTER uint64_t dev_id ASMR("x10");	// REGISTER/ASMR are defined outside this chunk; presumably they bind dev_id to x10 - confirm
+		asm("xid.dev  %0" : "=r" (dev_id));	// not volatile: the compiler may merge or drop calls whose result is unused
+		return dev_id;
+	}
+
+	INLINE uint64_t nmc_xid_te(void) {	// returns the value produced by the "xid.te" instruction
+		REGISTER uint64_t te_id ASMR("x10");
+		asm("xid.te  %0" : "=r" (te_id));
+		return te_id;
+	}
+
+	INLINE uint64_t nmc_xid_core(void) {	// returns the value produced by the "xid.core" instruction
+		REGISTER uint64_t core_id ASMR("x10");
+		asm("xid.core  %0" : "=r" (core_id));
+		return core_id;
+	}
+
+	INLINE uint64_t nmc_xid_thrd(void) {	// returns the value produced by the "xid.thrd" instruction
+		REGISTER uint64_t thread_id ASMR("x10");
+		asm("xid.thrd  %0" : "=r" (thread_id));
+		return thread_id;
+	}
+
+	// tzc(out, in): wraps the "tzc" instruction - one input register, one output register
+	INLINE uint64_t nmc_tzc(uint64_t __in) {
+		uint64_t result;	// receives the instruction's output operand (%0)
+		asm("tzc %0,%1" : "=r" (result) : "r" (__in));
+		return result;
+	}
+
+	// Lower thread priority: issues the bare "xlp" instruction (no operands, no return value)
+	INLINE void nmc_xlp(void) {
+		asm volatile("xlp");	// volatile: the statement has no outputs and must not be removed
+	}
+
+	// Return thread priority to normal: issues the bare "xnp" instruction (no operands, no return value)
+	INLINE void nmc_xnp(void) {
+		asm volatile("xnp");	// volatile: the statement has no outputs and must not be removed
+	}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // ENABLE_EMULATION
\ No newline at end of file
diff --git a/src/micron/nmc_types.h b/src/micron/nmc_types.h
new file mode 100644
index 00000000..ae46a55f
--- /dev/null
+++ b/src/micron/nmc_types.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright (C) 2024 Micron Technology, Inc.
+ *
+ * This file is the confidential and proprietary property of
+ * Micron Technology, Inc.
+ */
+
+#pragma once
+
+#include <inttypes.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * This file contains all of the typedef and enum definitions used
+ * by the NMC API header files for both the host and RISC-V APIs.
+ */
+
+typedef struct NmcApi*		nmc_t;		// opaque handle: struct NmcApi is only named here, not defined in this header
+typedef struct NmcLockApi*	nmc_lock_t;	// opaque handle to a struct NmcLockApi (incomplete type in this header)
+typedef struct NmcMutexApi*	nmc_mutex_t;	// opaque handle to a struct NmcMutexApi (incomplete type in this header)
+typedef struct NmcBarrierApi* 	nmc_barrier_t;	// opaque handle to a struct NmcBarrierApi (incomplete type in this header)
+
+typedef uint16_t		nmc_status_t;	// 16-bit unsigned status value
+typedef uint64_t		nmc_event_t;	// 64-bit unsigned event value
+typedef uint16_t		nmc_cid_t;	// 16-bit unsigned id; NOTE(review): meaning of "cid" is not stated in this header — confirm
+
+typedef struct NmcHucResponseApi*  nmc_response_t;	// opaque handle to a struct NmcHucResponseApi (incomplete type in this header)
+typedef struct NmcAttachPropsApi*  nmc_attach_props_t;	// opaque handle to a struct NmcAttachPropsApi (incomplete type in this header)
+typedef struct NmcCmdPropsApi*     nmc_cmd_props_t;	// opaque handle to a struct NmcCmdPropsApi (incomplete type in this header)
+
+typedef int32_t nmc_remote_host_t;	// signed 32-bit identifier for a remote host
+
+#define NMC_CMD_BITS 12 // presumably the bit width of a command code: the largest enum __nmc_cmd value in this file, 0xfff, fits in exactly 12 bits — confirm
+
+enum __nmc_cmd { // NMC command codes; every value is assigned explicitly and the ranges are not contiguous
+	NmcCmdInvalid = 0x000, // 0x000-0x01E: contiguous block of base commands
+	NmcCmdRead = 0x001,
+	NmcCmdWrite = 0x002,
+	NmcCmdClean = 0x003,
+	NmcCmdFlush = 0x004,
+	NmcCmdMscTagRead = 0x005,
+	NmcCmdAmo = 0x006,
+	NmcCmdAmoNr = 0x007,
+	NmcCmdSyscall = 0x008,
+	NmcCmdTrap = 0x009,
+	NmcCmdEventDestination = 0x00A,
+	NmcCmdEventMode = 0x00B,
+	NmcCmdEventSend = 0x00C,
+	NmcCmdEventBroadcast = 0x00D,
+	NmcCmdEventReceive = 0x00E,
+	NmcCmdTeCall = 0x00F,
+	NmcCmdTeReturn = 0x010,
+	NmcCmdWrReq = 0x011,
+	NmcCmdHusRsp = 0x012, // NOTE(review): spelled "Hus" here but the response handle type is NmcHucResponseApi — confirm whether this is intentional
+	NmcCmdDmCopy = 0x013,
+	NmcCmdDmGatherStride = 0x014,
+	NmcCmdDmGatherAddress = 0x015,
+	NmcCmdDmGatherIndex = 0x016,
+	NmcCmdDmScatterStride = 0x017,
+	NmcCmdDmScatterAddress = 0x018,
+	NmcCmdDmScatterIndex = 0x019,
+	NmcCmdDmSet = 0x01A,
+	NmcCmdDmReturn = 0x01B,
+	NmcCmdSeCall = 0x01C,
+	NmcCmdSeReturn = 0x01D,
+	NmcCmdEventAck = 0x01E,
+	NmcCmdAmoAdd32 = 0x0A0, // 0x0A0-0x0AC: the "32"-suffixed AMO commands
+	NmcCmdAmoAnd32 = 0x0A1,
+	NmcCmdAmoXor32 = 0x0A2,
+	NmcCmdAmoOr32 = 0x0A3,
+	NmcCmdAmoMin32 = 0x0A4,
+	NmcCmdAmoMax32 = 0x0A5,
+	NmcCmdAmoMinU32 = 0x0A6,
+	NmcCmdAmoMaxU32 = 0x0A7,
+	NmcCmdAmoSwap32 = 0x0A8,
+	NmcCmdAmoCas32 = 0x0A9,
+	NmcCmdAmoFadd32 = 0x0AA,
+	NmcCmdAmoFmin32 = 0x0AB,
+	NmcCmdAmoFmax32 = 0x0AC,
+	NmcCmdAmoAdd64 = 0x0B0, // 0x0B0-0x0BC: the "64"-suffixed AMO commands; each equals its "32" counterpart + 0x10
+	NmcCmdAmoAnd64 = 0x0B1,
+	NmcCmdAmoXor64 = 0x0B2,
+	NmcCmdAmoOr64 = 0x0B3,
+	NmcCmdAmoMin64 = 0x0B4,
+	NmcCmdAmoMax64 = 0x0B5,
+	NmcCmdAmoMinU64 = 0x0B6,
+	NmcCmdAmoMaxU64 = 0x0B7,
+	NmcCmdAmoSwap64 = 0x0B8,
+	NmcCmdAmoCas64 = 0x0B9,
+	NmcCmdAmoFadd64 = 0x0BA,
+	NmcCmdAmoFmin64 = 0x0BB,
+	NmcCmdAmoFmax64 = 0x0BC,
+	NmcCmdLoopBack = 0xffd, // 0xffd-0xfff: three codes at the top of the 12-bit range
+	NmcCmdAtomic = 0xffe,
+	NmcCmdCreateThread = 0xfff
+};
+
+typedef enum __nmc_cmd ENmcCmd; // two aliases for the same enum type
+typedef enum __nmc_cmd nmc_cmd_t;
+
+/*
+ * Resource Management
+ */
+
+/*
+ * Stack protection modes for threads/fibers are:
+ * 	AllowThreadWriteAll	No write protection for stack
+ * 	AllowFiberWriteMaster	Allow fibers to have write access to
+ * 				their parent's stack.
+ * 	AllowThreadWriteOwn	Only current thread can write to its
+ * 				stack.
+ */
+enum __nmc_stack_check_mode { // values are explicit: 0, 1, 2
+	AllowThreadWriteAll = 0,
+	AllowFiberWriteMaster = 1,
+	AllowThreadWriteOwn = 2
+};
+
+typedef enum __nmc_stack_check_mode EStackCheckMode; // two aliases for the same enum type
+typedef enum __nmc_stack_check_mode nmc_stack_check_mode_t;
+
+enum __nmc_event_mode { // event modes, numbered 0-4; the three Collective* modes occupy 2-4
+	NmcEventSimpleMode = 0,
+	NmcEventBroadcastMode = 1,
+	NmcEventCollectiveSimpleMode = 2,
+	NmcEventCollectiveReduceMode = 3, // presumably the mode that uses the reduce op type/size enums declared later in this header — confirm
+	NmcEventCollectiveCascadeMode = 4,
+};
+
+typedef enum __nmc_event_mode ENmcEventMode; // two aliases for the same enum type
+typedef enum __nmc_event_mode nmc_event_mode_t;
+
+enum __nmc_event_reduce_op_type { // reduction operator selector, values 0-10
+    // Used only in CollectReduce (presumably NmcEventCollectiveReduceMode — confirm)
+    Fadd = 0, // NOTE(review): these enumerators are unprefixed and share the global identifier namespace, so they can collide with user code
+    Add = 1,
+    Fmin = 2,
+    Min = 3,
+    Umin = 4,
+    Fmax = 5,
+    Max = 6,
+    Umax = 7,
+    And = 8,
+    Or = 9,
+    Xor = 10,
+};
+
+typedef enum __nmc_event_reduce_op_type ENmcEventReduceOpType; // two aliases for the same enum type
+typedef enum __nmc_event_reduce_op_type nmc_event_reduce_op_type_t;
+
+enum __nmc_event_reduce_op_size { // operand size selector for a reduction
+    // Used only in CollectReduce (presumably NmcEventCollectiveReduceMode — confirm)
+    FourBytes = 0, // the enumerator value is a selector (0/1), not the byte count itself
+    EightBytes = 1,
+};
+
+typedef enum __nmc_event_reduce_op_size ENmcEventReduceOpSize; // two aliases for the same enum type
+typedef enum __nmc_event_reduce_op_size nmc_event_reduce_op_size_t;
+
+#ifdef __cplusplus
+}
+#endif
+
-- 
GitLab