diff --git a/aie_runtime_lib/AIE2/aie_objectfifo.h b/aie_runtime_lib/AIE2/aie_objectfifo.h new file mode 100644 index 00000000000..72ec8b547c2 --- /dev/null +++ b/aie_runtime_lib/AIE2/aie_objectfifo.h @@ -0,0 +1,79 @@ +//===- aie_objectfifo.h - ObjectFIFO C API for AIE2 -------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// C API for ObjectFIFO operations in AIE2 kernels. +// Provides a self-contained struct that bundles locks, buffers, and depth, +// hiding the dual-lock (producer + consumer) semantics of AIE2 semaphore locks. +// +// On AIE2, each ObjectFIFO element has two locks: +// - Producer lock (acq_lock for producer, rel_lock for consumer) +// - Consumer lock (rel_lock for producer, acq_lock for consumer) +// +// The MLIR `aie.objectfifo.lock` and `aie.objectfifo.buffer` ops resolve +// the correct lock IDs and buffer references for each port, passing them +// as function arguments. This header provides a struct and inline functions +// to use them. +// +//===----------------------------------------------------------------------===// + +#ifndef AIE_OBJECTFIFO_H +#define AIE_OBJECTFIFO_H + +#include + +// Lock intrinsics (acquire_equal, release) are provided by the compiler: +// - Peano: auto-included via aiev2intrin.h / aie2pintrin.h +// - Chess: compiler built-ins +#ifndef __AIENGINE__ +#error \ + "aie_objectfifo.h must be compiled for an AIE target (__AIENGINE__ not defined)" +#endif + +// Maximum supported ObjectFIFO depth (number of buffers). +#define OBJECTFIFO_MAX_DEPTH 4 + +// ObjectFIFO handle for C kernels. 
+// Encapsulates everything needed to acquire/release and access buffers +// for a given ObjectFIFO port (producer or consumer side). +// +// The MLIR compiler fills in the correct lock IDs, buffer pointers, and +// depth based on ObjectFIFO configuration and port direction. +typedef struct { + int32_t acq_lock; // Lock ID for acquire operation + int32_t rel_lock; // Lock ID for release operation + int32_t acq_value; // Value for acquire_equal(): -1 for AcquireGreaterEqual + int32_t rel_value; // Value for release() call (typically 1) + int32_t depth; // Number of buffers (ObjectFIFO depth) + void *buffers[OBJECTFIFO_MAX_DEPTH]; // Buffer pointers +} objectfifo_t; + +// Acquire an ObjectFIFO (blocks until available). +// For producers: waits until a buffer is free to write. +// For consumers: waits until data is ready to read. +static inline void objectfifo_acquire(const objectfifo_t *of) { + acquire_equal(of->acq_lock, of->acq_value); +} + +// Release an ObjectFIFO. +// For producers: signals that data has been written. +// For consumers: signals that the buffer is free. +static inline void objectfifo_release(const objectfifo_t *of) { + release(of->rel_lock, of->rel_value); +} + +// Get the buffer pointer for the current iteration. +// Handles buffer rotation using modular indexing: buffers[iter % depth]. +// The caller should cast the returned void* to the appropriate type. +static inline void *objectfifo_get_buffer(const objectfifo_t *of, + int32_t iter) { + return of->buffers[iter % of->depth]; +} + +#endif // AIE_OBJECTFIFO_H diff --git a/aie_runtime_lib/AIE2P/aie_objectfifo.h b/aie_runtime_lib/AIE2P/aie_objectfifo.h new file mode 100644 index 00000000000..bf7783b4410 --- /dev/null +++ b/aie_runtime_lib/AIE2P/aie_objectfifo.h @@ -0,0 +1,80 @@ +//===- aie_objectfifo.h - ObjectFIFO C API for AIE2P ------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// C API for ObjectFIFO operations in AIE2P kernels. +// Provides a self-contained struct that bundles locks, buffers, and depth, +// hiding the dual-lock (producer + consumer) semantics of AIE2P semaphore +// locks. +// +// On AIE2P, each ObjectFIFO element has two locks: +// - Producer lock (acq_lock for producer, rel_lock for consumer) +// - Consumer lock (rel_lock for producer, acq_lock for consumer) +// +// The MLIR `aie.objectfifo.lock` and `aie.objectfifo.buffer` ops resolve +// the correct lock IDs and buffer references for each port, passing them +// as function arguments. This header provides a struct and inline functions +// to use them. +// +//===----------------------------------------------------------------------===// + +#ifndef AIE_OBJECTFIFO_H +#define AIE_OBJECTFIFO_H + +#include + +// Lock intrinsics (acquire_equal, release) are provided by the compiler: +// - Peano: auto-included via aiev2intrin.h / aie2pintrin.h +// - Chess: compiler built-ins +#ifndef __AIENGINE__ +#error \ + "aie_objectfifo.h must be compiled for an AIE target (__AIENGINE__ not defined)" +#endif + +// Maximum supported ObjectFIFO depth (number of buffers). +#define OBJECTFIFO_MAX_DEPTH 4 + +// ObjectFIFO handle for C kernels. +// Encapsulates everything needed to acquire/release and access buffers +// for a given ObjectFIFO port (producer or consumer side). +// +// The MLIR compiler fills in the correct lock IDs, buffer pointers, and +// depth based on ObjectFIFO configuration and port direction. 
+typedef struct { + int32_t acq_lock; // Lock ID for acquire operation + int32_t rel_lock; // Lock ID for release operation + int32_t acq_value; // Value for acquire_equal(): -1 for AcquireGreaterEqual + int32_t rel_value; // Value for release() call (typically 1) + int32_t depth; // Number of buffers (ObjectFIFO depth) + void *buffers[OBJECTFIFO_MAX_DEPTH]; // Buffer pointers +} objectfifo_t; + +// Acquire an ObjectFIFO (blocks until available). +// For producers: waits until a buffer is free to write. +// For consumers: waits until data is ready to read. +static inline void objectfifo_acquire(const objectfifo_t *of) { + acquire_equal(of->acq_lock, of->acq_value); +} + +// Release an ObjectFIFO. +// For producers: signals that data has been written. +// For consumers: signals that the buffer is free. +static inline void objectfifo_release(const objectfifo_t *of) { + release(of->rel_lock, of->rel_value); +} + +// Get the buffer pointer for the current iteration. +// Handles buffer rotation using modular indexing: buffers[iter % depth]. +// The caller should cast the returned void* to the appropriate type. 
+static inline void *objectfifo_get_buffer(const objectfifo_t *of, + int32_t iter) { + return of->buffers[iter % of->depth]; +} + +#endif // AIE_OBJECTFIFO_H diff --git a/aie_runtime_lib/CMakeLists.txt b/aie_runtime_lib/CMakeLists.txt index 7a5e8509858..778472a9ceb 100644 --- a/aie_runtime_lib/CMakeLists.txt +++ b/aie_runtime_lib/CMakeLists.txt @@ -61,4 +61,18 @@ if(AIETools_AIE2P_FOUND) add_subdirectory(AIE2P) endif() - +# Install and copy aie_objectfifo.h unconditionally (no AIETools dependency) +foreach(arch AIE2 AIE2P) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${arch}/aie_objectfifo.h) + install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/${arch}/aie_objectfifo.h + DESTINATION ${CMAKE_INSTALL_PREFIX}/aie_runtime_lib/${arch}) + add_custom_target(aie-copy-${arch}-runtime-libs-aie_objectfifo.h ALL + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${arch}/aie_objectfifo.h) + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${arch}/aie_objectfifo.h + COMMAND ${CMAKE_COMMAND} -E copy + ${CMAKE_CURRENT_SOURCE_DIR}/${arch}/aie_objectfifo.h + ${CMAKE_CURRENT_BINARY_DIR}/${arch}/aie_objectfifo.h + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${arch}/aie_objectfifo.h) + add_dependencies(aie-runtime-libs aie-copy-${arch}-runtime-libs-aie_objectfifo.h) + endif() +endforeach() diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td index afa7769f8f6..1c7a89f879d 100644 --- a/include/aie/Dialect/AIE/IR/AIEOps.td +++ b/include/aie/Dialect/AIE/IR/AIEOps.td @@ -2038,6 +2038,78 @@ def AIE_ObjectFifoSubviewAccessOp : AIE_Op<"objectfifo.subview.access", []> { }]; } +def AIE_ObjectFifoGetLockOp : AIE_Op<"objectfifo.lock", []> { + let summary = "Get acquire and release lock IDs for an ObjectFIFO port"; + let description = [{ + Returns the acquire and release lock IDs for the given ObjectFIFO port. + These lock IDs can be passed to precompiled C kernels that call + `acquire_equal()` / `release()` directly. 
+ + On AIE2 (semaphore locks), all elements of an ObjectFIFO share two locks + (producer + consumer). The acquire lock differs from the release lock: + - Producer port: acquire = prod_lock, release = cons_lock + - Consumer port: acquire = cons_lock, release = prod_lock + + Example: + ``` + %acq_lock, %rel_lock = aie.objectfifo.lock @of1 (Produce) : (index, index) + ``` + The returned `index` values are the localized lock IDs, suitable for passing + to external C functions as integer arguments. + }]; + + let arguments = ( + ins ObjectFifoPort:$port, + FlatSymbolRefAttr:$objFifo_name + ); + + let results = (outs Index:$acq_lock, Index:$rel_lock); + + let assemblyFormat = [{ + attr-dict $objFifo_name `(` $port `)` `:` `(` type($acq_lock) `,` type($rel_lock) `)` + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ObjectFifoCreateOp getObjectFifo(); + }]; +} + +def AIE_ObjectFifoGetBufferOp : AIE_Op<"objectfifo.buffer", []> { + let summary = "Get a buffer reference from an ObjectFIFO without acquiring"; + let description = [{ + Returns a memref to the ObjectFIFO buffer at the given element index, + without performing any lock acquisition. This is intended for use with + precompiled C kernels that manage their own locking via + `aie.objectfifo.lock`. + + Example: + ``` + %buf = aie.objectfifo.buffer @of1 (0) : memref<256xi32> + ``` + The returned memref can be passed to an external C function along with + lock IDs from `aie.objectfifo.lock`. 
+ }]; + + let arguments = ( + ins FlatSymbolRefAttr:$objFifo_name, + ConfinedAttr]>:$index + ); + + let results = (outs AnyMemRef:$output); + + let assemblyFormat = [{ + attr-dict $objFifo_name `(` $index `)` `:` qualified(type($output)) + }]; + + let hasVerifier = 1; + + let extraClassDeclaration = [{ + ObjectFifoCreateOp getObjectFifo(); + }]; +} + def AIE_ObjectFifoRegisterProcessOp: AIE_Op<"objectfifo.register_process", []> { let summary = "Operation that produces the acquire/release patterns for a process registered to an objectFifo"; let description = [{ diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp index f9662ed8e21..bc52fcc8ec5 100644 --- a/lib/Dialect/AIE/IR/AIEDialect.cpp +++ b/lib/Dialect/AIE/IR/AIEDialect.cpp @@ -1044,6 +1044,91 @@ LogicalResult ObjectFifoSubviewAccessOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// ObjectFifoGetLockOp +//===----------------------------------------------------------------------===// + +LogicalResult ObjectFifoGetLockOp::verify() { + auto parent = getOperation()->getParentOfType(); + if (parent == nullptr) + return emitOpError("must be called from inside a CoreOp"); + + auto coreTile = parent.getTile(); + auto objFifo = getObjectFifo(); + if (!objFifo) + return emitError("cannot retrieve associated object FIFO"); + if (getPort() == ObjectFifoPort::Produce) { + if (coreTile != objFifo.getProducerTile()) + return parent.emitOpError( + "producer port of objectFifo accessed by core running " + "on non-producer tile"); + } else if (getPort() == ObjectFifoPort::Consume) { + bool found = false; + for (auto consumerTile : objFifo.getConsumerTiles()) { + if (coreTile == consumerTile) { + found = true; + break; + } + } + if (!found) + return parent.emitOpError( + "consumer port of objectFifo accessed by core running " + "on non-consumer tile"); + } + + return success(); +} + +ObjectFifoCreateOp 
ObjectFifoGetLockOp::getObjectFifo() { + Operation *parent = getOperation(); + while ((parent = parent->getParentOp())) { + if (parent->hasTrait()) { + if (auto *st = SymbolTable::lookupSymbolIn(parent, getObjFifoName()); + isa_and_nonnull(st)) + return dyn_cast(st); + } + } + return {}; +} + +//===----------------------------------------------------------------------===// +// ObjectFifoGetBufferOp +//===----------------------------------------------------------------------===// + +LogicalResult ObjectFifoGetBufferOp::verify() { + auto parent = getOperation()->getParentOfType(); + if (parent == nullptr) + return emitOpError("must be called from inside a CoreOp"); + + auto objFifo = getObjectFifo(); + if (!objFifo) + return emitError("cannot retrieve associated object FIFO"); + + auto objFifoElem = + llvm::cast(objFifo.getElemType()).getElementType(); + if (objFifoElem != getOutput().getType()) + return emitOpError("output memref type must match ObjectFifo element type"); + + int index = getIndex(); + if (index >= objFifo.size()) + return emitOpError("buffer index ") + << index << " exceeds ObjectFifo depth " << objFifo.size(); + + return success(); +} + +ObjectFifoCreateOp ObjectFifoGetBufferOp::getObjectFifo() { + Operation *parent = getOperation(); + while ((parent = parent->getParentOp())) { + if (parent->hasTrait()) { + if (auto *st = SymbolTable::lookupSymbolIn(parent, getObjFifoName()); + isa_and_nonnull(st)) + return dyn_cast(st); + } + } + return {}; +} + //===----------------------------------------------------------------------===// // ObjectFifoRegisterProcessOp //===----------------------------------------------------------------------===// diff --git a/lib/Dialect/AIE/Transforms/AIELocalizeLocks.cpp b/lib/Dialect/AIE/Transforms/AIELocalizeLocks.cpp index 332969d49b8..8f879f710bc 100644 --- a/lib/Dialect/AIE/Transforms/AIELocalizeLocks.cpp +++ b/lib/Dialect/AIE/Transforms/AIELocalizeLocks.cpp @@ -58,7 +58,7 @@ struct AIELocalizeLocksPass : 
AIELocalizeLocksBase { // it suffices to check if the parent of a UseLockOp is coreOp. if (llvm::none_of(lock.getResult().getUsers(), [&](Operation *user) { - return user->getParentOp() == coreOp; + return coreOp->isProperAncestor(user); })) continue; @@ -77,7 +77,7 @@ struct AIELocalizeLocksPass : AIELocalizeLocksBase { builder, builder.getUnknownLoc(), localLockIndex); lock.getResult().replaceUsesWithIf( coreLockIDValue, [&](OpOperand &opOperand) { - return opOperand.getOwner()->getParentOp() == coreOp; + return coreOp->isProperAncestor(opOperand.getOwner()); }); } } diff --git a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp index ad8a91a0d8c..9779c6bf74e 100644 --- a/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp +++ b/lib/Dialect/AIE/Transforms/AIEObjectFifoStatefulTransform.cpp @@ -2386,6 +2386,82 @@ struct AIEObjectFifoStatefulTransformPass }); if (res.wasInterrupted()) return signalPassFailure(); + + //===----------------------------------------------------------------===// + // Replace ObjectFifoGetLockOps with actual lock SSA values + //===----------------------------------------------------------------===// + res = coreOp.walk([&](ObjectFifoGetLockOp getLockOp) { + ObjectFifoCreateOp op = getLockOp.getObjectFifo(); + if (!op) { + getLockOp->emitOpError("cannot retrieve associated object FIFO"); + return WalkResult::interrupt(); + } + + ObjectFifoCreateOp target = op; + if (auto linkOp = getOptionalLinkOp(op)) + if (objFifoLinks.find(*linkOp) != objFifoLinks.end()) + target = objFifoLinks[*linkOp]; + + if (locksPerFifo[target].empty()) { + getLockOp->emitOpError( + "objectFifo has no locks (synchronization disabled?)"); + return WalkResult::interrupt(); + } + + auto port = getLockOp.getPort(); + auto dev = op->getParentOfType(); + LockOp acqLock, relLock; + + if (!dev.getTargetModel().hasProperty( + AIETargetModel::UsesSemaphoreLocks)) { + // AIE1: single lock 
per element, both acq and rel use the same lock + acqLock = locksPerFifo[target][0]; + relLock = locksPerFifo[target][0]; + } else { + // AIE2/AIE2P: dual-lock semantics + if (port == ObjectFifoPort::Produce) { + acqLock = locksPerFifo[target][0]; // prod_lock + relLock = locksPerFifo[target][1]; // cons_lock + } else { + acqLock = locksPerFifo[target][1]; // cons_lock + relLock = locksPerFifo[target][0]; // prod_lock + } + } + + getLockOp.getAcqLock().replaceAllUsesWith(acqLock.getResult()); + getLockOp.getRelLock().replaceAllUsesWith(relLock.getResult()); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) + return signalPassFailure(); + + //===----------------------------------------------------------------===// + // Replace ObjectFifoGetBufferOps with actual buffer references + //===----------------------------------------------------------------===// + res = coreOp.walk([&](ObjectFifoGetBufferOp getBufferOp) { + ObjectFifoCreateOp op = getBufferOp.getObjectFifo(); + if (!op) { + getBufferOp->emitOpError("cannot retrieve associated object FIFO"); + return WalkResult::interrupt(); + } + + ObjectFifoCreateOp target = op; + if (auto linkOp = getOptionalLinkOp(op)) + if (objFifoLinks.find(*linkOp) != objFifoLinks.end()) + target = objFifoLinks[*linkOp]; + + int index = getBufferOp.getIndex(); + if (index >= static_cast(buffersPerFifo[target].size())) { + getBufferOp->emitOpError("buffer index out of bounds"); + return WalkResult::interrupt(); + } + + getBufferOp.getOutput().replaceAllUsesWith( + buffersPerFifo[target][index].getBuffer()); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) + return signalPassFailure(); } //===------------------------------------------------------------------===// @@ -2395,7 +2471,8 @@ struct AIEObjectFifoStatefulTransformPass device.walk([&](Operation *op) { if (isa(op)) + ObjectFifoReleaseOp, ObjectFifoAllocateOp, ObjectFifoGetLockOp, + ObjectFifoGetBufferOp>(op)) opsToErase.insert(op); }); SmallVector 
sorted{opsToErase.begin(), opsToErase.end()}; diff --git a/programming_examples/basic/passthrough_kernel_c_objfifo/CMakeLists.txt b/programming_examples/basic/passthrough_kernel_c_objfifo/CMakeLists.txt new file mode 100644 index 00000000000..33efd6e242b --- /dev/null +++ b/programming_examples/basic/passthrough_kernel_c_objfifo/CMakeLists.txt @@ -0,0 +1,56 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Xilinx Inc. + +# parameters +# -DXRT_INC_DIR: Full path to src/runtime_src/core/include in XRT cloned repo +# -DXRT_LIB_DIR: Path to xrt_coreutil.lib +# -DTARGET_NAME: Target name to be built + +# cmake needs this line +cmake_minimum_required(VERSION 3.30) +set(CMAKE_CXX_STANDARD 23) +set(CMAKE_CXX_STANDARD_REQUIRED YES) + +include(../../common.cmake) + +find_program(WSL NAMES powershell.exe) + +set(IN1_SIZE 4096 CACHE STRING "in1 buffer size") +set(OUT_SIZE 4096 CACHE STRING "out buffer size") +set(TARGET_NAME test CACHE STRING "Target to be built") + +SET (ProjectName ${TARGET_NAME}) +SET (currentTarget ${TARGET_NAME}) + +if ( WSL ) + set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${CMAKE_BINARY_DIR}) + add_compile_options(/Zc:__cplusplus) +endif () + +project(${ProjectName}) + + +add_executable(${currentTarget} test.cpp) + +target_compile_definitions(${currentTarget} PUBLIC + IN1_SIZE=${IN1_SIZE} + OUT_SIZE=${OUT_SIZE} +) + +target_include_directories (${currentTarget} PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/../../utils + ${XRT_INC_DIR} +) + +target_link_directories(${currentTarget} PUBLIC + ${XRT_LIB_DIR} +) + +target_link_libraries(${currentTarget} PUBLIC + xrt_coreutil +) + +target_link_test_utils(${currentTarget}) diff --git a/programming_examples/basic/passthrough_kernel_c_objfifo/Makefile b/programming_examples/basic/passthrough_kernel_c_objfifo/Makefile new file mode 100644 index 
00000000000..d7db72b962e --- /dev/null +++ b/programming_examples/basic/passthrough_kernel_c_objfifo/Makefile @@ -0,0 +1,66 @@ +##===- Makefile -----------------------------------------------------------===## +# +# This file licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. +# +##===----------------------------------------------------------------------===## + +srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) + +include ${srcdir}/../../makefile-common + +devicename ?= $(if $(filter 1,$(NPU2)),npu2,npu) +targetname = passthrough_kernel_c_objfifo +in1_size = 4096 # in bytes +out_size = 4096 # in bytes, should always be equal to in1_size +CHESS ?= false + +data_size = $(strip ${in1_size}) +aie_py_src=passthrough_kernel_c_objfifo.py + +.PHONY: all clean + +all: build/final_${data_size}.xclbin + +build/kernel.o: ${srcdir}/kernel.cc + mkdir -p ${@D} +ifeq ($(devicename),npu) + cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2_FLAGS} \ + -I ${MLIR_AIE_DIR}/aie_runtime_lib/AIE2 -DBIT_WIDTH=8 -c $< -o ${@F} +else ifeq ($(devicename),npu2) + cd ${@D} && ${PEANO_INSTALL_DIR}/bin/clang++ ${PEANOWRAP2P_FLAGS} \ + -I ${MLIR_AIE_DIR}/aie_runtime_lib/AIE2P -DBIT_WIDTH=8 -c $< -o ${@F} +else + echo "Device type not supported" +endif + +build/aie2_lineBased_8b_${data_size}.mlir: ${srcdir}/${aie_py_src} + mkdir -p ${@D} + python3 $< -d ${devicename} -i1s ${in1_size} -os ${out_size} > $@ + +build/final_${data_size}.xclbin: build/aie2_lineBased_8b_${data_size}.mlir build/kernel.o + mkdir -p ${@D} + cd ${@D} && aiecc --aie-generate-xclbin --aie-generate-npu-insts \ + --no-xchesscc --no-xbridge \ + --xclbin-name=${@F} --npu-insts-name=insts_${data_size}.bin $(<:%=../%) + + +${targetname}_${data_size}.exe: ${srcdir}/test.cpp + rm -rf _build + mkdir -p _build + cd 
_build && ${powershell} 
cmake `${getwslpath} ${srcdir}` -DTARGET_NAME=${targetname} -DIN1_SIZE=${in1_size} -DOUT_SIZE=${out_size} + cd _build && ${powershell} cmake --build . --config Release +ifeq "${powershell}" "powershell.exe" + cp _build/${targetname}.exe $@ +else + cp _build/${targetname} $@ +endif + +run: ${targetname}_${data_size}.exe build/final_${data_size}.xclbin build/insts_${data_size}.bin + ${powershell} ./$< -x build/final_${data_size}.xclbin -i build/insts_${data_size}.bin -k MLIR_AIE + +clean: + rm -rf build _build ${targetname}*.exe diff --git a/programming_examples/basic/passthrough_kernel_c_objfifo/kernel.cc b/programming_examples/basic/passthrough_kernel_c_objfifo/kernel.cc new file mode 100644 index 00000000000..fff06164c14 --- /dev/null +++ b/programming_examples/basic/passthrough_kernel_c_objfifo/kernel.cc @@ -0,0 +1,50 @@ +//===- kernel.cc - Passthrough kernel using C ObjectFIFO API ----*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2024, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// Demonstrates using aie_objectfifo.h for kernel-managed synchronization. +// Lock IDs and buffer references are passed in from MLIR via +// aie.objectfifo.lock and aie.objectfifo.buffer. The kernel constructs +// objectfifo_t structs and uses objectfifo_get_buffer() for automatic +// buffer rotation. 
+// +//===----------------------------------------------------------------------===// + +#include + +#include "aie_objectfifo.h" + +extern "C" { + +void passThroughLine(int32_t *in_buf0, int32_t *in_buf1, int32_t *out_buf0, + int32_t *out_buf1, int64_t in_acq_lock, + int64_t in_rel_lock, int64_t out_acq_lock, + int64_t out_rel_lock) { + objectfifo_t of_in = {(int32_t)in_acq_lock, (int32_t)in_rel_lock, -1, 1, 2, + {in_buf0, in_buf1}}; + objectfifo_t of_out = {(int32_t)out_acq_lock, (int32_t)out_rel_lock, -1, 1, 2, + {out_buf0, out_buf1}}; + + for (int iter = 0; iter < 8; iter++) { + objectfifo_acquire(&of_in); + objectfifo_acquire(&of_out); + + int32_t *in = (int32_t *)objectfifo_get_buffer(&of_in, iter); + int32_t *out = (int32_t *)objectfifo_get_buffer(&of_out, iter); + + for (int i = 0; i < 1024; i++) { + out[i] = in[i]; + } + + objectfifo_release(&of_in); + objectfifo_release(&of_out); + } +} + +} // extern "C" diff --git a/programming_examples/basic/passthrough_kernel_c_objfifo/passthrough_kernel_c_objfifo.py b/programming_examples/basic/passthrough_kernel_c_objfifo/passthrough_kernel_c_objfifo.py new file mode 100644 index 00000000000..635611b0916 --- /dev/null +++ b/programming_examples/basic/passthrough_kernel_c_objfifo/passthrough_kernel_c_objfifo.py @@ -0,0 +1,138 @@ +# passthrough_kernel_c_objfifo/passthrough_kernel_c_objfifo.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates + +# Variant of passthrough_kernel that demonstrates the C ObjectFIFO API. +# Instead of compiler-managed acquire/release, this design passes lock IDs +# and buffer references to the C kernel via aie.objectfifo.lock and +# aie.objectfifo.buffer, letting the kernel call acquire/release directly +# using aie_objectfifo.h. 
+ +import numpy as np +import argparse +import sys + +from aie.dialects.aie import * +from aie.dialects.aiex import * +from aie.extras.context import mlir_mod_ctx +from aie.iron.controlflow import range_ + + +def my_passthrough_kernel(dev, in1_size, out_size): + in1_dtype = np.uint8 + out_dtype = np.uint8 + + N = in1_size // in1_dtype(0).nbytes + lineWidthInBytes = N // 4 # chop input in 4 sub-tensors + + assert ( + out_size == in1_size + ), "Output buffer size must be equal to input buffer size." + + @device(dev) + def device_body(): + # define types + vector_ty = np.ndarray[(N,), np.dtype[in1_dtype]] + line_ty = np.ndarray[(lineWidthInBytes,), np.dtype[in1_dtype]] + + # AIE Core Function declarations + passThroughLine = external_func( + "passThroughLine", + inputs=[ + line_ty, # in buffer 0 + line_ty, # in buffer 1 + line_ty, # out buffer 0 + line_ty, # out buffer 1 + T.index(), # in acq_lock + T.index(), # in rel_lock + T.index(), # out acq_lock + T.index(), # out rel_lock + ], + ) + + # Tile declarations + ShimTile = tile(0, 0) + ComputeTile2 = tile(0, 2) + + # AIE-array data movement with object fifos + of_in = object_fifo("in", ShimTile, ComputeTile2, 2, line_ty) + of_out = object_fifo("out", ComputeTile2, ShimTile, 2, line_ty) + + # Set up compute tiles + + # Compute tile 2 + @core(ComputeTile2, "kernel.o") + def core_body(): + # Pass both ping-pong buffers and lock IDs to C kernel + in_buf0 = of_in.get_buffer(0) + in_buf1 = of_in.get_buffer(1) + in_acq, in_rel = of_in.get_lock(ObjectFifoPort.Consume) + + out_buf0 = of_out.get_buffer(0) + out_buf1 = of_out.get_buffer(1) + out_acq, out_rel = of_out.get_lock(ObjectFifoPort.Produce) + + # C kernel owns the compute loop and buffer rotation + passThroughLine( + in_buf0, + in_buf1, + out_buf0, + out_buf1, + in_acq, + in_rel, + out_acq, + out_rel, + ) + + @runtime_sequence(vector_ty, vector_ty, vector_ty) + def sequence(inTensor, outTensor, notUsed): + in_task = shim_dma_single_bd_task( + of_in, inTensor, sizes=[1, 
1, 1, N], issue_token=True + ) + out_task = shim_dma_single_bd_task( + of_out, outTensor, sizes=[1, 1, 1, N], issue_token=True + ) + + dma_start_task(in_task, out_task) + dma_await_task(in_task, out_task) + + +if len(sys.argv) < 4: + raise ValueError("[ERROR] Need at least 4 arguments (dev, in1_size, out_size)") + + +p = argparse.ArgumentParser() +p.add_argument("-d", "--dev", required=True, dest="device", help="AIE Device") +p.add_argument( + "-i1s", "--in1_size", required=True, dest="in1_size", help="Input 1 size" +) +p.add_argument("-os", "--out_size", required=True, dest="out_size", help="Output size") +opts = p.parse_args(sys.argv[1:]) + +if opts.device == "npu": + dev = AIEDevice.npu1_1col +elif opts.device == "npu2": + dev = AIEDevice.npu2 +else: + raise ValueError("[ERROR] Device name {} is unknown".format(sys.argv[1])) +in1_size = int(opts.in1_size) +if in1_size % 64 != 0 or in1_size < 512: + print( + "In1 buffer size (" + + str(in1_size) + + ") must be a multiple of 64 and greater than or equal to 512" + ) + raise ValueError +out_size = int(opts.out_size) + +with mlir_mod_ctx() as ctx: + my_passthrough_kernel(dev, in1_size, out_size) + res = ctx.module.operation.verify() + if res == True: + print(ctx.module) + else: + print(res) diff --git a/programming_examples/basic/passthrough_kernel_c_objfifo/run_makefile.lit b/programming_examples/basic/passthrough_kernel_c_objfifo/run_makefile.lit new file mode 100644 index 00000000000..4e765c2b365 --- /dev/null +++ b/programming_examples/basic/passthrough_kernel_c_objfifo/run_makefile.lit @@ -0,0 +1,11 @@ +// (c) Copyright 2024 Advanced Micro Devices, Inc. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai_npu1, peano +// +// RUN: mkdir -p test_c_objfifo +// RUN: cd test_c_objfifo +// RUN: make -f %S/Makefile clean +// RUN: make -f %S/Makefile +// RUN: %run_on_npu1% make -f %S/Makefile run +// RUN: make -f %S/Makefile clean diff --git a/programming_examples/basic/passthrough_kernel_c_objfifo/test.cpp b/programming_examples/basic/passthrough_kernel_c_objfifo/test.cpp new file mode 100644 index 00000000000..4841543b34f --- /dev/null +++ b/programming_examples/basic/passthrough_kernel_c_objfifo/test.cpp @@ -0,0 +1,75 @@ +//===- test.cpp -------------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2023, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +#include "xrt_test_wrapper.h" +#include + +//***************************************************************************** +// Modify this section to customize buffer datatypes, initialization functions, +// and verify function. The other place to reconfigure your design is the +// Makefile. 
+//***************************************************************************** + +#ifndef DATATYPES_USING_DEFINED +#define DATATYPES_USING_DEFINED +// ------------------------------------------------------ +// Configure this to match your buffer data type +// ------------------------------------------------------ +using DATATYPE_IN1 = std::uint8_t; +using DATATYPE_OUT = std::uint8_t; +#endif + +// Initialize Input buffer 1 +void initialize_bufIn1(DATATYPE_IN1 *bufIn1, int SIZE) { + for (int i = 0; i < SIZE; i++) + bufIn1[i] = i; +} + +// Initialize Output buffer +void initialize_bufOut(DATATYPE_OUT *bufOut, int SIZE) { + memset(bufOut, 0, SIZE); +} + +// Functional correctness verifier +int verify_passthrough_kernel(DATATYPE_IN1 *bufIn1, DATATYPE_OUT *bufOut, + int SIZE, int verbosity) { + int errors = 0; + + for (int i = 0; i < SIZE; i++) { + int32_t ref = bufIn1[i]; + int32_t test = bufOut[i]; + if (test != ref) { + if (verbosity >= 1) + std::cout << "Error in output " << test << " != " << ref << std::endl; + errors++; + } else { + if (verbosity >= 1) + std::cout << "Correct output " << test << " == " << ref << std::endl; + } + } + return errors; +} + +//***************************************************************************** +// Should not need to modify below section +//***************************************************************************** + +int main(int argc, const char *argv[]) { + + constexpr int IN1_VOLUME = IN1_SIZE / sizeof(DATATYPE_IN1); + constexpr int OUT_VOLUME = OUT_SIZE / sizeof(DATATYPE_OUT); + + args myargs = parse_args(argc, argv); + + int res = setup_and_run_aie( + IN1_VOLUME, OUT_VOLUME, myargs); + return res; +} diff --git a/python/dialects/aie.py b/python/dialects/aie.py index cf854771ca2..2ffc79340e9 100644 --- a/python/dialects/aie.py +++ b/python/dialects/aie.py @@ -488,6 +488,51 @@ def set_aie_stream(self, stream_end, stream_port): self.attributes["aie_stream"] = int_stream_end self.attributes["aie_stream_port"] = 
int_stream_port + def get_lock(self, port): + """Get acquire and release lock IDs for this ObjectFIFO port. + + Returns the two lock IDs needed for C-side acquire/release operations. + On AIE2, these correspond to different physical locks. + + Args: + port: ObjectFifoPort.Produce or ObjectFifoPort.Consume + + Returns: + Tuple of (acq_lock, rel_lock) as index SSA values. + """ + op = ObjectFifoGetLockOp(port, self.sym_name.value) + return op.acq_lock, op.rel_lock + + def get_buffer(self, index=0): + """Get a buffer reference from this ObjectFIFO without acquiring. + + Returns a memref to the buffer at the given element index. + Use with get_lock() for C-side locking. + + Args: + index: Element index within the ObjectFIFO depth. Defaults to 0. + + Returns: + memref SSA value for the buffer. + """ + return ObjectFifoGetBufferOp(self.datatype, self.sym_name.value, index).output + + def get_depth(self): + """Get the ObjectFIFO depth as an index constant. + + Returns the depth (number of buffers) as an arith.constant index value, + suitable for passing to C kernels. + + Returns: + index SSA value for the depth. + """ + elem_num = self.elemNumber + if isinstance(elem_num, ArrayAttr): + depth_val = IntegerAttr(elem_num[0]).value + else: + depth_val = IntegerAttr(elem_num).value + return constant(depth_val, index=True) + # Create an aie objectFifo_link between input and output objectFifos. class object_fifo_link(ObjectFifoLinkOp): diff --git a/python/iron/dataflow/objectfifo.py b/python/iron/dataflow/objectfifo.py index 660b9a38710..6d0296f9672 100644 --- a/python/iron/dataflow/objectfifo.py +++ b/python/iron/dataflow/objectfifo.py @@ -407,6 +407,45 @@ def release( ) self._object_fifo._release(self._port, num_elem) + def get_lock(self): + """Get acquire and release lock IDs for this ObjectFIFO port. + + Returns the two lock IDs needed for C-side acquire/release operations. + On AIE2, these correspond to different physical locks (producer and + consumer locks). 
The returned values can be passed as index arguments + to precompiled C kernels. + + Returns: + Tuple of (acq_lock, rel_lock) as index SSA values. + """ + return self._object_fifo.op.get_lock(self._port) + + def get_buffer(self, index: int = 0): + """Get a buffer reference from this ObjectFIFO without acquiring. + + Returns a memref to the buffer at the given element index, without + performing lock acquisition. Use together with get_lock() for C-side + locking. + + Args: + index: Element index within the ObjectFIFO depth. Defaults to 0. + + Returns: + memref SSA value for the buffer. + """ + return self._object_fifo.op.get_buffer(index) + + def get_depth(self): + """Get the ObjectFIFO depth as an index constant. + + Returns the depth (number of buffers) as an arith.constant index + value, suitable for passing to C kernels. + + Returns: + index SSA value for the depth. + """ + return self._object_fifo.op.get_depth() + @property def name(self) -> str: """The name of the ObjectFifo""" diff --git a/test/lower-to-standard/objectfifo_lock_in_func_aie2.mlir b/test/lower-to-standard/objectfifo_lock_in_func_aie2.mlir new file mode 100644 index 00000000000..b12c9325034 --- /dev/null +++ b/test/lower-to-standard/objectfifo_lock_in_func_aie2.mlir @@ -0,0 +1,48 @@ +//===- objectfifo_lock_in_func_aie2.mlir ------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// Verify the full pipeline: objectfifo-stateful-transform resolves +// objectfifo.lock/buffer ops to concrete locks and buffers, then +// localize-locks + standard-lowering convert lock SSA values to constants +// suitable for passing to external C kernels. 
+ +// RUN: aie-opt --aie-objectFifo-stateful-transform --aie-localize-locks --aie-standard-lowering="tilecol=1 tilerow=2" %s | FileCheck --check-prefix=CHECK %s + +// After the full pipeline, the core function should call @kernel with: +// - buffer memref +// - localized lock constant indices (prod_lock and cons_lock) +// CHECK: module @test attributes {llvm.target_triple = "aie2"} { +// CHECK: func.func private @kernel(memref<256xi32>, index, index) +// CHECK: func.func @core_1_2() { +// CHECK-DAG: %c{{[0-9]+}} = arith.constant +// CHECK-DAG: %c{{[0-9]+}} = arith.constant +// CHECK: call @kernel +// CHECK: return +// CHECK: } + +module @test { + aie.device(xcve2302) { + %tile12 = aie.tile(1, 2) + %tile13 = aie.tile(1, 3) + + aie.objectfifo @of0(%tile12, {%tile13}, 2 : i32) : !aie.objectfifo<memref<256xi32>> + + func.func private @kernel(%buf: memref<256xi32>, + %acq_lock: index, %rel_lock: index) -> () + + %core12 = aie.core(%tile12) { + %buf = aie.objectfifo.buffer @of0 (0) : memref<256xi32> + %acq_lock, %rel_lock = aie.objectfifo.lock @of0 (Produce) : (index, index) + func.call @kernel(%buf, %acq_lock, %rel_lock) + : (memref<256xi32>, index, index) -> () + aie.end + } { link_with = "kernel.o" } + } +} diff --git a/test/lower-to-standard/useLock_in_func_external_aie2.mlir b/test/lower-to-standard/useLock_in_func_external_aie2.mlir new file mode 100644 index 00000000000..bf4f5179dfa --- /dev/null +++ b/test/lower-to-standard/useLock_in_func_external_aie2.mlir @@ -0,0 +1,46 @@ +//===- useLock_in_func_external_aie2.mlir -----------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +// Verify that lock SSA values passed to external (body-less) function calls +// get correctly localized to constant indices by AIELocalizeLocks, and that +// the standard lowering preserves these index arguments for linking with +// precompiled C kernels. + +// RUN: aie-opt --aie-localize-locks --aie-standard-lowering="tilecol=1 tilerow=3" %s | FileCheck --check-prefix=CHECK %s + +// CHECK: module @test attributes {llvm.target_triple = "aie2"} { +// CHECK: func.func private @func(memref<256xi32>, memref<256xi32>, index, index) +// CHECK: func.func @core_1_3() { +// CHECK: %c48 = arith.constant 48 : index +// CHECK: %c49 = arith.constant 49 : index +// CHECK-DAG: call @func({{.*}}, {{.*}}, %c48, %c49) : (memref<256xi32>, memref<256xi32>, index, index) -> () +// CHECK: return +// CHECK: } +// CHECK: } + +module @test { + aie.device(xcve2302) { + %tile13 = aie.tile(1, 3) + %buf_in = aie.buffer(%tile13) { sym_name = "a" } : memref<256xi32> + %buf_out = aie.buffer(%tile13) { sym_name = "b" } : memref<256xi32> + %prod_lock = aie.lock(%tile13, 0) { sym_name = "prod_lock", init = 1 : i32 } + %cons_lock = aie.lock(%tile13, 1) { sym_name = "cons_lock", init = 0 : i32 } + + // External function declaration (no body) — linked with precompiled C kernel + func.func private @func(%A: memref<256xi32>, %B: memref<256xi32>, + %acq_lock: index, %rel_lock: index) -> () + + %core13 = aie.core(%tile13) { + func.call @func(%buf_in, %buf_out, %prod_lock, %cons_lock) + : (memref<256xi32>, memref<256xi32>, index, index) -> () + aie.end + } { link_with = "kernel.o" } + } +} diff --git a/test/lower-to-standard/useLock_in_func_external_aie2p.mlir b/test/lower-to-standard/useLock_in_func_external_aie2p.mlir new file mode 100644 index 00000000000..f8843dbec1f --- /dev/null +++ b/test/lower-to-standard/useLock_in_func_external_aie2p.mlir @@ -0,0 +1,46 @@ +//===- useLock_in_func_external_aie2p.mlir 
----------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// Verify that lock SSA values passed to external (body-less) function calls +// get correctly localized to constant indices by AIELocalizeLocks, and that +// the standard lowering preserves these index arguments for linking with +// precompiled C kernels. + +// RUN: aie-opt --aie-localize-locks --aie-standard-lowering="tilecol=1 tilerow=3" %s | FileCheck --check-prefix=CHECK %s + +// CHECK: module @test attributes {llvm.target_triple = "aie2p"} { +// CHECK: func.func private @func(memref<256xi32>, memref<256xi32>, index, index) +// CHECK: func.func @core_1_3() { +// CHECK: %c48 = arith.constant 48 : index +// CHECK: %c49 = arith.constant 49 : index +// CHECK-DAG: call @func({{.*}}, {{.*}}, %c48, %c49) : (memref<256xi32>, memref<256xi32>, index, index) -> () +// CHECK: return +// CHECK: } +// CHECK: } + +module @test { + aie.device(npu2) { + %tile13 = aie.tile(1, 3) + %buf_in = aie.buffer(%tile13) { sym_name = "a" } : memref<256xi32> + %buf_out = aie.buffer(%tile13) { sym_name = "b" } : memref<256xi32> + %prod_lock = aie.lock(%tile13, 0) { sym_name = "prod_lock", init = 1 : i32 } + %cons_lock = aie.lock(%tile13, 1) { sym_name = "cons_lock", init = 0 : i32 } + + // External function declaration (no body) — linked with precompiled C kernel + func.func private @func(%A: memref<256xi32>, %B: memref<256xi32>, + %acq_lock: index, %rel_lock: index) -> () + + %core13 = aie.core(%tile13) { + func.call @func(%buf_in, %buf_out, %prod_lock, %cons_lock) + : (memref<256xi32>, memref<256xi32>, index, index) -> () + aie.end + } { link_with = "kernel.o" } + } +} diff --git 
a/test/npu-xrt/lock_pass_to_c/aie.mlir b/test/npu-xrt/lock_pass_to_c/aie.mlir new file mode 100644 index 00000000000..cdf4aae5be0 --- /dev/null +++ b/test/npu-xrt/lock_pass_to_c/aie.mlir @@ -0,0 +1,90 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// This test verifies that lock SSA values can be passed to precompiled C +// kernels as index arguments. The C kernel uses acquire_equal()/release() +// intrinsics directly with the localized lock IDs. + +module { + aie.device(NPUDEVICE) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + + %in_buff = aie.buffer(%tile_0_2) {sym_name = "in_buff"} : memref<1024xi32> + %out_buff = aie.buffer(%tile_0_2) {sym_name = "out_buff"} : memref<1024xi32> + + // Input locks: DMA writes data, core reads data + %in_prod_lock = aie.lock(%tile_0_2, 0) {init = 1 : i32, sym_name = "in_prod_lock"} + %in_cons_lock = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "in_cons_lock"} + + // Output locks: core writes data, DMA reads data + %out_prod_lock = aie.lock(%tile_0_2, 2) {init = 1 : i32, sym_name = "out_prod_lock"} + %out_cons_lock = aie.lock(%tile_0_2, 3) {init = 0 : i32, sym_name = "out_cons_lock"} + + aie.flow(%tile_0_0, DMA : 0, %tile_0_2, DMA : 0) + aie.flow(%tile_0_2, DMA : 0, %tile_0_0, DMA : 0) + + // External C function: receives buffers and lock IDs for both input/output + func.func private @scale_with_locks(%in: memref<1024xi32>, + %out: memref<1024xi32>, + %in_cons_lk: index, + %in_prod_lk: index, + %out_prod_lk: index, + %out_cons_lk: index) -> () + + %core_0_2 = aie.core(%tile_0_2) { + // Pass all lock IDs to C kernel + func.call 
@scale_with_locks(%in_buff, %out_buff, + %in_cons_lock, %in_prod_lock, + %out_prod_lock, %out_cons_lock) + : (memref<1024xi32>, memref<1024xi32>, index, index, index, index) -> () + aie.end + } { link_with = "kernel.o" } + + aie.runtime_sequence(%arg0: memref<1024xi32>, %arg1: memref<1024xi32>) { + // Configure shim DMA to send data to core tile + %t0 = aiex.dma_configure_task(%tile_0_0, MM2S, 0) { + aie.dma_bd(%arg0 : memref<1024xi32>, 0, 1024) {bd_id = 0 : i32} + aie.end + } + + // Configure core DMA to receive from shim into in_buff + %t1 = aiex.dma_configure_task(%tile_0_2, S2MM, 0) { + aie.use_lock(%in_prod_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%in_buff : memref<1024xi32>, 0, 1024) {bd_id = 0 : i32} + aie.use_lock(%in_cons_lock, Release, 1) + aie.end + } + + // Start input path + aiex.dma_start_task(%t0) + aiex.dma_start_task(%t1) + + // Configure core DMA to send out_buff to shim + %t2 = aiex.dma_configure_task(%tile_0_2, MM2S, 0) { + aie.use_lock(%out_cons_lock, AcquireGreaterEqual, 1) + aie.dma_bd(%out_buff : memref<1024xi32>, 0, 1024) {bd_id = 1 : i32} + aie.use_lock(%out_prod_lock, Release, 1) + aie.end + } + + // Configure shim DMA to receive output from core + %t3 = aiex.dma_configure_task(%tile_0_0, S2MM, 0) { + aie.dma_bd(%arg1 : memref<1024xi32>, 0, 1024) {bd_id = 1 : i32} + aie.end + } {issue_token = true} + + // Start output path + aiex.dma_start_task(%t2) + aiex.dma_start_task(%t3) + aiex.dma_await_task(%t3) + } + } +} diff --git a/test/npu-xrt/lock_pass_to_c/kernel.cc b/test/npu-xrt/lock_pass_to_c/kernel.cc new file mode 100644 index 00000000000..7e6d7531ba9 --- /dev/null +++ b/test/npu-xrt/lock_pass_to_c/kernel.cc @@ -0,0 +1,40 @@ +//===- kernel.cc ------------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// This kernel demonstrates receiving lock IDs as function arguments and using +// them to perform acquire/release operations in C code. The lock IDs are +// localized constant integers passed from MLIR. + +#include + +extern "C" { + +void scale_with_locks(int32_t *in, int32_t *out, int64_t in_cons_lock, + int64_t in_prod_lock, int64_t out_prod_lock, + int64_t out_cons_lock) { + // Acquire input consumer lock — wait for input data ready + acquire_equal(in_cons_lock, -1); + + // Acquire output producer lock — wait for output buffer free + acquire_equal(out_prod_lock, -1); + + // Scale each element by 3 + for (int i = 0; i < 1024; i++) { + out[i] = in[i] * 3; + } + + // Release input producer lock — signal input buffer free + release(in_prod_lock, 1); + + // Release output consumer lock — signal output data ready + release(out_cons_lock, 1); +} + +} // extern "C" diff --git a/test/npu-xrt/lock_pass_to_c/run_chess.lit b/test/npu-xrt/lock_pass_to_c/run_chess.lit new file mode 100644 index 00000000000..d464eaeed5a --- /dev/null +++ b/test/npu-xrt/lock_pass_to_c/run_chess.lit @@ -0,0 +1,14 @@ +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, valid_xchess_license +// +// Compile kernel with Chess for AIE2. NOTE(review): the build command below always targets aie2, yet this test also runs under %run_on_npu2% with npu2_1col; confirm whether an aie2p build is needed there. 
+// RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/kernel.cc -o ./kernel.o +// RUN: cp %S/aie.mlir aie_arch.mlir +// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir +// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.bin aie_arch.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags +// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin +// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin diff --git a/test/npu-xrt/lock_pass_to_c/run_peano.lit b/test/npu-xrt/lock_pass_to_c/run_peano.lit new file mode 100644 index 00000000000..23941666511 --- /dev/null +++ b/test/npu-xrt/lock_pass_to_c/run_peano.lit @@ -0,0 +1,15 @@ +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai, peano +// +// Compile kernel with Peano for the correct target architecture. 
+// RUN: cp %S/aie.mlir aie_arch.mlir +// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir +// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir +// RUN: %run_on_npu1% %PEANO_INSTALL_DIR/bin/clang++ -O2 -std=c++20 --target=aie2-none-unknown-elf -DNDEBUG -c %S/kernel.cc -o kernel.o +// RUN: %run_on_npu2% %PEANO_INSTALL_DIR/bin/clang++ -O2 -std=c++20 --target=aie2p-none-unknown-elf -DNDEBUG -c %S/kernel.cc -o kernel.o +// RUN: %python aiecc.py --no-aiesim --no-xchesscc --no-xbridge --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.bin aie_arch.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags +// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin +// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin diff --git a/test/npu-xrt/lock_pass_to_c/test.cpp b/test/npu-xrt/lock_pass_to_c/test.cpp new file mode 100644 index 00000000000..691010d826d --- /dev/null +++ b/test/npu-xrt/lock_pass_to_c/test.cpp @@ -0,0 +1,137 @@ +//===- test.cpp -------------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "cxxopts.hpp" +#include "test_utils.h" +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +constexpr int N = 1024; + +int main(int argc, const char *argv[]) { + cxxopts::Options options("lock_pass_to_c"); + test_utils::add_default_options(options); + + cxxopts::ParseResult vm; + test_utils::parse_options(argc, argv, options, vm); + + std::vector instr_v = + test_utils::load_instr_binary(vm["instr"].as()); + + int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << std::endl; + + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() + << std::endl; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() + << std::endl; + std::string Node = vm["kernel"].as(); + + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + + device.register_xclbin(xclbin); + + if (verbosity >= 1) + std::cout << "Getting hardware context." 
<< std::endl; + xrt::hw_context context(device, xclbin.get_uuid()); + + if (verbosity >= 1) + std::cout << "Getting handle to kernel:" << kernelName << std::endl; + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_in = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(3)); + auto bo_out = xrt::bo(device, N * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(4)); + + if (verbosity >= 1) + std::cout << "Writing data into buffer objects." << std::endl; + + int32_t *bufIn = bo_in.map(); + std::vector srcVec; + for (int i = 0; i < N; i++) + srcVec.push_back(i + 1); + memcpy(bufIn, srcVec.data(), (srcVec.size() * sizeof(uint32_t))); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + if (verbosity >= 1) + std::cout << "Running Kernel." << std::endl; + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in, bo_out); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + uint32_t *bufOut = bo_out.map(); + + int errors = 0; + + for (uint32_t i = 0; i < N; i++) { + // Kernel multiplies each element by 3 + uint32_t ref = (i + 1) * 3; + if (*(bufOut + i) != ref) { + errors++; + if (verbosity >= 1) { + std::cout << "Error at index " << i << ": expected " << ref << ", got " + << *(bufOut + i) << std::endl; + } + } + } + + if (!errors) { + std::cout << std::endl << "PASS!" << std::endl << std::endl; + return 0; + } else { + std::cout << std::endl + << errors << " mismatches." << std::endl + << std::endl; + std::cout << std::endl << "fail." 
<< std::endl << std::endl; + return 1; + } +} diff --git a/test/npu-xrt/objectfifo_lock_c_api/aie2.py b/test/npu-xrt/objectfifo_lock_c_api/aie2.py new file mode 100644 index 00000000000..67dd1db0ed2 --- /dev/null +++ b/test/npu-xrt/objectfifo_lock_c_api/aie2.py @@ -0,0 +1,103 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2026, Advanced Micro Devices, Inc. + +# REQUIRES: ryzen_ai, peano +# +# RUN: %run_on_npu1% %PEANO_INSTALL_DIR/bin/clang++ -O2 -std=c++20 --target=aie2-none-unknown-elf -DNDEBUG -I %aie_runtime_lib%/AIE2 -c %S/kernel.cc -o kernel.o +# RUN: %run_on_npu2% %PEANO_INSTALL_DIR/bin/clang++ -O2 -std=c++20 --target=aie2p-none-unknown-elf -DNDEBUG -I %aie_runtime_lib%/AIE2P -c %S/kernel.cc -o kernel.o +# RUN: %python %S/aie2.py > ./aie2.mlir +# RUN: %python aiecc.py --no-aiesim --no-xchesscc --no-xbridge --aie-generate-npu-insts --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie2.mlir +# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags +# RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin +# RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin + +import numpy as np +from aie.extras.context import mlir_mod_ctx + +from aie.dialects.aie import * +from aie.dialects.aiex import * + +N = 1024 +tile_ty = np.ndarray[(N,), np.dtype[np.int32]] + + +def design(): + # Build the AIE design inside an MLIR module context and print the module. + with mlir_mod_ctx() as ctx: + # NOTE(review): the device is fixed to npu1_1col, yet the lit commands above also build kernel.o for aie2p and run under %run_on_npu2%; confirm npu2 coverage is intended. + @device(AIEDevice.npu1_1col) + def device_body(): + # Define tiles + shim_tile = tile(0, 0) + compute_tile = tile(0, 2) + + # Define ObjectFIFOs with depth 2 for ping-pong buffering + of_in = object_fifo("of_in", shim_tile, compute_tile, 2, tile_ty) + of_out = object_fifo("of_out", compute_tile, shim_tile, 2, tile_ty) + + # External C function: ping-pong buffer pointers + lock 
IDs + scale_fn = external_func( + "scale_kernel", + inputs=[ + tile_ty, # in buffer 0 + tile_ty, # in buffer 1 + tile_ty, # out buffer 0 + tile_ty, # out buffer 1 + T.index(), # in acq_lock + T.index(), # in rel_lock + T.index(), # out acq_lock + T.index(), # out rel_lock + ], + ) + + @core(compute_tile, "kernel.o") + def core_body(): + # Pass both ping-pong buffers and lock IDs to C kernel + in_buf0 = of_in.get_buffer(0) + in_buf1 = of_in.get_buffer(1) + in_acq, in_rel = of_in.get_lock(ObjectFifoPort.Consume) + + out_buf0 = of_out.get_buffer(0) + out_buf1 = of_out.get_buffer(1) + out_acq, out_rel = of_out.get_lock(ObjectFifoPort.Produce) + + # C kernel owns the compute loop and buffer rotation + scale_fn( + in_buf0, + in_buf1, + out_buf0, + out_buf1, + in_acq, + in_rel, + out_acq, + out_rel, + ) + + @runtime_sequence( + np.ndarray[(N * 8,), np.dtype[np.int32]], + np.ndarray[(N * 8,), np.dtype[np.int32]], + ) + def sequence(inTensor, outTensor): + npu_dma_memcpy_nd( + metadata=of_out, + bd_id=1, + mem=outTensor, + sizes=[1, 1, 1, N * 8], + issue_token=True, + ) + npu_dma_memcpy_nd( + metadata=of_in, + bd_id=0, + mem=inTensor, + sizes=[1, 1, 1, N * 8], + ) + dma_wait(of_out) + + print(ctx.module) + + +design() diff --git a/test/npu-xrt/objectfifo_lock_c_api/kernel.cc b/test/npu-xrt/objectfifo_lock_c_api/kernel.cc new file mode 100644 index 00000000000..e0613c1515c --- /dev/null +++ b/test/npu-xrt/objectfifo_lock_c_api/kernel.cc @@ -0,0 +1,46 @@ +//===- kernel.cc ------------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// Demonstrates ObjectFIFO C API usage with depth-2 ping-pong buffering. 
+// The kernel receives buffer pointers and lock IDs, constructs objectfifo_t +// structs, and manages acquire/release and buffer rotation using +// aie_objectfifo.h. + +#include + +#include "aie_objectfifo.h" + +extern "C" { + +void scale_kernel(int32_t *in_buf0, int32_t *in_buf1, int32_t *out_buf0, + int32_t *out_buf1, int64_t in_acq_lock, int64_t in_rel_lock, + int64_t out_acq_lock, int64_t out_rel_lock) { + objectfifo_t of_in = {(int32_t)in_acq_lock, (int32_t)in_rel_lock, -1, 1, 2, + {in_buf0, in_buf1}}; + objectfifo_t of_out = {(int32_t)out_acq_lock, (int32_t)out_rel_lock, -1, 1, 2, + {out_buf0, out_buf1}}; + + for (int iter = 0; iter < 8; iter++) { + objectfifo_acquire(&of_in); + objectfifo_acquire(&of_out); + + int32_t *in = (int32_t *)objectfifo_get_buffer(&of_in, iter); + int32_t *out = (int32_t *)objectfifo_get_buffer(&of_out, iter); + + for (int i = 0; i < 1024; i++) { + out[i] = in[i] * 3; + } + + objectfifo_release(&of_in); + objectfifo_release(&of_out); + } +} + +} // extern "C" diff --git a/test/npu-xrt/objectfifo_lock_c_api/test.cpp b/test/npu-xrt/objectfifo_lock_c_api/test.cpp new file mode 100644 index 00000000000..c8a828834a1 --- /dev/null +++ b/test/npu-xrt/objectfifo_lock_c_api/test.cpp @@ -0,0 +1,119 @@ +//===- test.cpp -------------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2026, Advanced Micro Devices, Inc. 
+// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include + +#include "cxxopts.hpp" +#include "test_utils.h" +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +constexpr int N = 1024; +constexpr int NUM_ITER = 8; +constexpr int TOTAL = N * NUM_ITER; + +int main(int argc, const char *argv[]) { + cxxopts::Options options("objectfifo_lock_c_api"); + test_utils::add_default_options(options); + + cxxopts::ParseResult vm; + test_utils::parse_options(argc, argv, options, vm); + + std::vector instr_v = + test_utils::load_instr_binary(vm["instr"].as()); + + int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << std::endl; + + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() + << std::endl; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + std::string Node = vm["kernel"].as(); + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + device.register_xclbin(xclbin); + xrt::hw_context context(device, xclbin.get_uuid()); + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + auto bo_in = xrt::bo(device, TOTAL * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(3)); + auto bo_out = xrt::bo(device, TOTAL * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(4)); + + int32_t *bufIn = bo_in.map(); + for (int i = 0; i < TOTAL; i++) + bufIn[i] = i + 1; + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + + 
bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + bo_in.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + if (verbosity >= 1) + std::cout << "Running Kernel." << std::endl; + unsigned int opcode = 3; + auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in, bo_out); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + + uint32_t *bufOut = bo_out.map(); + + int errors = 0; + + for (uint32_t i = 0; i < TOTAL; i++) { + // Kernel multiplies each element by 3 + uint32_t ref = (i + 1) * 3; + if (*(bufOut + i) != ref) { + errors++; + if (verbosity >= 1) { + std::cout << "Error at index " << i << ": expected " << ref << ", got " + << *(bufOut + i) << std::endl; + } + } + } + + if (!errors) { + std::cout << std::endl << "PASS!" << std::endl << std::endl; + return 0; + } else { + std::cout << std::endl + << errors << " mismatches." << std::endl + << std::endl; + std::cout << std::endl << "fail." << std::endl << std::endl; + return 1; + } +} diff --git a/test/npu-xrt/objectfifo_lock_c_api_depth1/aie2.py b/test/npu-xrt/objectfifo_lock_c_api_depth1/aie2.py new file mode 100644 index 00000000000..b0022a8abe9 --- /dev/null +++ b/test/npu-xrt/objectfifo_lock_c_api_depth1/aie2.py @@ -0,0 +1,94 @@ +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# Copyright (C) 2026, Advanced Micro Devices, Inc. 
+
+# REQUIRES: ryzen_ai, peano
+#
+# RUN: %run_on_npu1% %PEANO_INSTALL_DIR/bin/clang++ -O2 -std=c++20 --target=aie2-none-unknown-elf -DNDEBUG -I %aie_runtime_lib%/AIE2 -c %S/kernel.cc -o kernel.o
+# RUN: %python %S/aie2.py > ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --no-xchesscc --no-xbridge --aie-generate-npu-insts --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie2.mlir
+# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin
+
+# ObjectFIFO lock C API test with depth 1 (single buffering).
+
+import numpy as np
+from aie.extras.context import mlir_mod_ctx
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+
+N = 1024  # elements per tile; kernel.cc hard-codes the same value
+NUM_ITER = 8  # tiles streamed; kernel.cc loops 8 times, test.cpp uses NUM_ITER = 8
+tile_ty = np.ndarray[(N,), np.dtype[np.int32]]
+
+
+def design():
+
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.npu1_1col)
+        def device_body():
+            shim_tile = tile(0, 0)
+            compute_tile = tile(0, 2)
+
+            # Depth 1: single buffer, no ping-pong
+            of_in = object_fifo("of_in", shim_tile, compute_tile, 1, tile_ty)
+            of_out = object_fifo("of_out", compute_tile, shim_tile, 1, tile_ty)
+
+            scale_fn = external_func(
+                "scale_kernel",
+                inputs=[
+                    tile_ty,  # in buffer 0
+                    tile_ty,  # out buffer 0
+                    T.index(),  # in acq_lock
+                    T.index(),  # in rel_lock
+                    T.index(),  # out acq_lock
+                    T.index(),  # out rel_lock
+                ],
+            )
+
+            @core(compute_tile, "kernel.o")
+            def core_body():
+                in_buf0 = of_in.get_buffer(0)
+                in_acq, in_rel = of_in.get_lock(ObjectFifoPort.Consume)
+
+                out_buf0 = of_out.get_buffer(0)
+                out_acq, out_rel = of_out.get_lock(ObjectFifoPort.Produce)
+
+                scale_fn(
+                    in_buf0,
+                    out_buf0,
+                    in_acq,
+                    in_rel,
+                    out_acq,
+                    out_rel,
+                )
+
+            @runtime_sequence(
+                np.ndarray[(N * NUM_ITER,), np.dtype[np.int32]],
+                np.ndarray[(N * NUM_ITER,), np.dtype[np.int32]],
+            )
+            def sequence(inTensor, outTensor):
+                npu_dma_memcpy_nd(
+                    metadata=of_out,
+                    bd_id=1,
+                    mem=outTensor,
+                    sizes=[1, 1, 1, N * NUM_ITER],
+                    issue_token=True,
+                )
+                npu_dma_memcpy_nd(
+                    metadata=of_in,
+                    bd_id=0,
+                    mem=inTensor,
+                    sizes=[1, 1, 1, N * NUM_ITER],
+                )
+                dma_wait(of_out)
+
+        print(ctx.module)
+
+
+design()
diff --git a/test/npu-xrt/objectfifo_lock_c_api_depth1/kernel.cc b/test/npu-xrt/objectfifo_lock_c_api_depth1/kernel.cc
new file mode 100644
index 00000000000..b6f586656b3
--- /dev/null
+++ b/test/npu-xrt/objectfifo_lock_c_api_depth1/kernel.cc
@@ -0,0 +1,44 @@
+//===- kernel.cc ------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// ObjectFIFO C API test with depth-1 (single buffer, no ping-pong).
+
+#include <stdint.h>
+
+#include "aie_objectfifo.h"
+
+extern "C" {
+
+void scale_kernel(int32_t *in_buf0, int32_t *out_buf0, int64_t in_acq_lock,
+                  int64_t in_rel_lock, int64_t out_acq_lock,
+                  int64_t out_rel_lock) {
+  objectfifo_t of_in = {
+      (int32_t)in_acq_lock, (int32_t)in_rel_lock, -1, 1, 1, {in_buf0}};
+  objectfifo_t of_out = {
+      (int32_t)out_acq_lock, (int32_t)out_rel_lock, -1, 1, 1, {out_buf0}};
+
+  for (int iter = 0; iter < 8; iter++) { // 8 == NUM_ITER in aie2.py
+    objectfifo_acquire(&of_in);
+    objectfifo_acquire(&of_out);
+
+    // Depth 1: objectfifo_get_buffer always returns buffer 0
+    int32_t *in = (int32_t *)objectfifo_get_buffer(&of_in, iter);
+    int32_t *out = (int32_t *)objectfifo_get_buffer(&of_out, iter);
+
+    for (int i = 0; i < 1024; i++) { // 1024 == N in aie2.py
+      out[i] = in[i] * 3;
+    }
+
+    objectfifo_release(&of_in);
+    objectfifo_release(&of_out);
+  }
+}
+
+} // extern "C"
diff --git a/test/npu-xrt/objectfifo_lock_c_api_depth1/test.cpp b/test/npu-xrt/objectfifo_lock_c_api_depth1/test.cpp
new file mode 100644
index 00000000000..8fc3abea3bf
---
/dev/null
+++ b/test/npu-xrt/objectfifo_lock_c_api_depth1/test.cpp
@@ -0,0 +1,122 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "cxxopts.hpp"
+#include "test_utils.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int N = 1024;
+constexpr int NUM_ITER = 8; // must match NUM_ITER in aie2.py
+constexpr int TOTAL = N * NUM_ITER;
+
+int main(int argc, const char *argv[]) {
+  cxxopts::Options options("objectfifo_lock_c_api_depth1");
+  test_utils::add_default_options(options);
+
+  cxxopts::ParseResult vm;
+  test_utils::parse_options(argc, argv, options, vm);
+
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_binary(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << std::endl;
+
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>()
+              << std::endl;
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  std::string Node = vm["kernel"].as<std::string>();
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel_it = std::find_if(xkernels.begin(), xkernels.end(),
+                                 [Node](xrt::xclbin::kernel &k) {
+                                   auto name = k.get_name();
+                                   return name.rfind(Node, 0) == 0;
+                                 });
+  if (xkernel_it == xkernels.end()) { // never dereference end()
+    std::cerr << "Kernel '" << Node << "' not found in xclbin." << std::endl;
+    return 1;
+  }
+  auto kernelName = xkernel_it->get_name();
+
+  device.register_xclbin(xclbin);
+  xrt::hw_context context(device, xclbin.get_uuid());
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_in = xrt::bo(device, TOTAL * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY,
+                       kernel.group_id(3));
+  auto bo_out = xrt::bo(device, TOTAL * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY,
+                        kernel.group_id(4));
+
+  int32_t *bufIn = bo_in.map<int32_t *>();
+  for (int i = 0; i < TOTAL; i++)
+    bufIn[i] = i + 1;
+
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_in.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel." << std::endl;
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in, bo_out);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+  int errors = 0;
+
+  for (uint32_t i = 0; i < TOTAL; i++) {
+    uint32_t ref = (i + 1) * 3; // input is i + 1, kernel multiplies by 3
+    if (*(bufOut + i) != ref) {
+      errors++;
+      if (verbosity >= 1) {
+        std::cout << "Error at index " << i << ": expected " << ref << ", got "
+                  << *(bufOut + i) << std::endl;
+      }
+    }
+  }
+
+  if (!errors) {
+    std::cout << std::endl << "PASS!" << std::endl << std::endl;
+    return 0;
+  } else {
+    std::cout << std::endl
+              << errors << " mismatches." << std::endl
+              << std::endl;
+    std::cout << std::endl << "fail." << std::endl << std::endl;
+    return 1;
+  }
+}
diff --git a/test/npu-xrt/objectfifo_lock_c_api_depth3/aie2.py b/test/npu-xrt/objectfifo_lock_c_api_depth3/aie2.py
new file mode 100644
index 00000000000..53d88afc55f
--- /dev/null
+++ b/test/npu-xrt/objectfifo_lock_c_api_depth3/aie2.py
@@ -0,0 +1,106 @@
+#
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+
+# REQUIRES: ryzen_ai, peano
+#
+# RUN: %run_on_npu1% %PEANO_INSTALL_DIR/bin/clang++ -O2 -std=c++20 --target=aie2-none-unknown-elf -DNDEBUG -I %aie_runtime_lib%/AIE2 -c %S/kernel.cc -o kernel.o
+# RUN: %python %S/aie2.py > ./aie2.mlir
+# RUN: %python aiecc.py --no-aiesim --no-xchesscc --no-xbridge --aie-generate-npu-insts --aie-generate-xclbin --no-compile-host --xclbin-name=aie.xclbin --npu-insts-name=insts.bin ./aie2.mlir
+# RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+# RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin
+
+# ObjectFIFO lock C API test with depth 3 (triple buffering).
+
+import numpy as np
+from aie.extras.context import mlir_mod_ctx
+
+from aie.dialects.aie import *
+from aie.dialects.aiex import *
+
+N = 1024  # elements per tile; kernel.cc hard-codes the same value
+NUM_ITER = 9  # multiple of 3 for clean triple-buffer cycling
+tile_ty = np.ndarray[(N,), np.dtype[np.int32]]
+
+
+def design():
+
+    with mlir_mod_ctx() as ctx:
+
+        @device(AIEDevice.npu1_1col)
+        def device_body():
+            shim_tile = tile(0, 0)
+            compute_tile = tile(0, 2)
+
+            # Depth 3: triple buffering
+            of_in = object_fifo("of_in", shim_tile, compute_tile, 3, tile_ty)
+            of_out = object_fifo("of_out", compute_tile, shim_tile, 3, tile_ty)
+
+            scale_fn = external_func(
+                "scale_kernel",
+                inputs=[
+                    tile_ty,  # in buffer 0
+                    tile_ty,  # in buffer 1
+                    tile_ty,  # in buffer 2
+                    tile_ty,  # out buffer 0
+                    tile_ty,  # out buffer 1
+                    tile_ty,  # out buffer 2
+                    T.index(),  # in acq_lock
+                    T.index(),  # in rel_lock
+                    T.index(),  # out acq_lock
+                    T.index(),  # out rel_lock
+                ],
+            )
+
+            @core(compute_tile, "kernel.o")
+            def core_body():
+                in_buf0 = of_in.get_buffer(0)
+                in_buf1 = of_in.get_buffer(1)
+                in_buf2 = of_in.get_buffer(2)
+                in_acq, in_rel = of_in.get_lock(ObjectFifoPort.Consume)
+
+                out_buf0 = of_out.get_buffer(0)
+                out_buf1 = of_out.get_buffer(1)
+                out_buf2 = of_out.get_buffer(2)
+                out_acq, out_rel = of_out.get_lock(ObjectFifoPort.Produce)
+
+                scale_fn(
+                    in_buf0,
+                    in_buf1,
+                    in_buf2,
+                    out_buf0,
+                    out_buf1,
+                    out_buf2,
+                    in_acq,
+                    in_rel,
+                    out_acq,
+                    out_rel,
+                )
+
+            @runtime_sequence(
+                np.ndarray[(N * NUM_ITER,), np.dtype[np.int32]],
+                np.ndarray[(N * NUM_ITER,), np.dtype[np.int32]],
+            )
+            def sequence(inTensor, outTensor):
+                npu_dma_memcpy_nd(
+                    metadata=of_out,
+                    bd_id=1,
+                    mem=outTensor,
+                    sizes=[1, 1, 1, N * NUM_ITER],
+                    issue_token=True,
+                )
+                npu_dma_memcpy_nd(
+                    metadata=of_in,
+                    bd_id=0,
+                    mem=inTensor,
+                    sizes=[1, 1, 1, N * NUM_ITER],
+                )
+                dma_wait(of_out)
+
+        print(ctx.module)
+
+
+design()
diff --git a/test/npu-xrt/objectfifo_lock_c_api_depth3/kernel.cc b/test/npu-xrt/objectfifo_lock_c_api_depth3/kernel.cc
new file mode 100644
index 00000000000..2b3367c234c
--- /dev/null
+++ b/test/npu-xrt/objectfifo_lock_c_api_depth3/kernel.cc
@@ -0,0 +1,48 @@
+//===- kernel.cc ------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// ObjectFIFO C API test with depth-3 (triple buffering).
+// Uses objectfifo_get_buffer() for automatic buffer rotation across 3 buffers.
+
+#include <stdint.h>
+
+#include "aie_objectfifo.h"
+
+extern "C" {
+
+void scale_kernel(int32_t *in_buf0, int32_t *in_buf1, int32_t *in_buf2,
+                  int32_t *out_buf0, int32_t *out_buf1, int32_t *out_buf2,
+                  int64_t in_acq_lock, int64_t in_rel_lock,
+                  int64_t out_acq_lock, int64_t out_rel_lock) {
+  objectfifo_t of_in = {
+      (int32_t)in_acq_lock, (int32_t)in_rel_lock, -1, 1, 3,
+      {in_buf0, in_buf1, in_buf2}};
+  objectfifo_t of_out = {
+      (int32_t)out_acq_lock, (int32_t)out_rel_lock, -1, 1, 3,
+      {out_buf0, out_buf1, out_buf2}};
+
+  for (int iter = 0; iter < 9; iter++) { // 9 == NUM_ITER in aie2.py
+    objectfifo_acquire(&of_in);
+    objectfifo_acquire(&of_out);
+
+    // objectfifo_get_buffer cycles through 0, 1, 2, 0, 1, 2, ...
+    int32_t *in = (int32_t *)objectfifo_get_buffer(&of_in, iter);
+    int32_t *out = (int32_t *)objectfifo_get_buffer(&of_out, iter);
+
+    for (int i = 0; i < 1024; i++) { // 1024 == N in aie2.py
+      out[i] = in[i] * 3;
+    }
+
+    objectfifo_release(&of_in);
+    objectfifo_release(&of_out);
+  }
+}
+
+} // extern "C"
diff --git a/test/npu-xrt/objectfifo_lock_c_api_depth3/test.cpp b/test/npu-xrt/objectfifo_lock_c_api_depth3/test.cpp
new file mode 100644
index 00000000000..5c149352930
--- /dev/null
+++ b/test/npu-xrt/objectfifo_lock_c_api_depth3/test.cpp
@@ -0,0 +1,122 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "cxxopts.hpp"
+#include "test_utils.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int N = 1024;
+constexpr int NUM_ITER = 9; // must match NUM_ITER in aie2.py
+constexpr int TOTAL = N * NUM_ITER;
+
+int main(int argc, const char *argv[]) {
+  cxxopts::Options options("objectfifo_lock_c_api_depth3");
+  test_utils::add_default_options(options);
+
+  cxxopts::ParseResult vm;
+  test_utils::parse_options(argc, argv, options, vm);
+
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_binary(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << std::endl;
+
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>()
+              << std::endl;
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  std::string Node = vm["kernel"].as<std::string>();
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel_it = std::find_if(xkernels.begin(), xkernels.end(),
+                                 [Node](xrt::xclbin::kernel &k) {
+                                   auto name = k.get_name();
+                                   return name.rfind(Node, 0) == 0;
+                                 });
+  if (xkernel_it == xkernels.end()) { // never dereference end()
+    std::cerr << "Kernel '" << Node << "' not found in xclbin." << std::endl;
+    return 1;
+  }
+  auto kernelName = xkernel_it->get_name();
+
+  device.register_xclbin(xclbin);
+  xrt::hw_context context(device, xclbin.get_uuid());
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_in = xrt::bo(device, TOTAL * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY,
+                       kernel.group_id(3));
+  auto bo_out = xrt::bo(device, TOTAL * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY,
+                        kernel.group_id(4));
+
+  int32_t *bufIn = bo_in.map<int32_t *>();
+  for (int i = 0; i < TOTAL; i++)
+    bufIn[i] = i + 1;
+
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_in.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel." << std::endl;
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_in, bo_out);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+  int errors = 0;
+
+  for (uint32_t i = 0; i < TOTAL; i++) {
+    uint32_t ref = (i + 1) * 3; // input is i + 1, kernel multiplies by 3
+    if (*(bufOut + i) != ref) {
+      errors++;
+      if (verbosity >= 1) {
+        std::cout << "Error at index " << i << ": expected " << ref << ", got "
+                  << *(bufOut + i) << std::endl;
+      }
+    }
+  }
+
+  if (!errors) {
+    std::cout << std::endl << "PASS!" << std::endl << std::endl;
+    return 0;
+  } else {
+    std::cout << std::endl
+              << errors << " mismatches." << std::endl
+              << std::endl;
+    std::cout << std::endl << "fail." << std::endl << std::endl;
+    return 1;
+  }
+}
diff --git a/test/objectFifo-stateful-transform/base/objectfifo_lock_buffer_aie2.mlir b/test/objectFifo-stateful-transform/base/objectfifo_lock_buffer_aie2.mlir
new file mode 100644
index 00000000000..a90c34097f0
--- /dev/null
+++ b/test/objectFifo-stateful-transform/base/objectfifo_lock_buffer_aie2.mlir
@@ -0,0 +1,51 @@
+//===- objectfifo_lock_buffer_aie2.mlir -------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Verify that aie.objectfifo.lock and aie.objectfifo.buffer ops are correctly
+// lowered by the stateful transform on AIE2.
The lock op should resolve to
+// the producer/consumer lock SSA values based on port direction.
+
+// RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s
+
+// CHECK: module @test_objectfifo_lock {
+// CHECK:   aie.device(xcve2302) {
+// CHECK:     %[[TILE0:.*]] = aie.tile(1, 2)
+// CHECK:     %[[TILE1:.*]] = aie.tile(1, 3)
+// CHECK:     %[[BUF0:.*]] = aie.buffer(%[[TILE0]]) {sym_name = "of0_buff_0"} : memref<256xi32>
+// CHECK:     %[[BUF1:.*]] = aie.buffer(%[[TILE0]]) {sym_name = "of0_buff_1"} : memref<256xi32>
+// CHECK:     %[[PROD_LOCK:.*]] = aie.lock(%[[TILE0]], 0) {init = 2 : i32, sym_name = "of0_prod_lock_0"}
+// CHECK:     %[[CONS_LOCK:.*]] = aie.lock(%[[TILE0]], 1) {init = 0 : i32, sym_name = "of0_cons_lock_0"}
+
+// For Produce port on AIE2:
+//   acq_lock = prod_lock (lock[0])
+//   rel_lock = cons_lock (lock[1])
+// CHECK:     %[[CORE:.*]] = aie.core(%[[TILE0]]) {
+// CHECK:       func.call @kernel(%[[BUF0]], %[[PROD_LOCK]], %[[CONS_LOCK]])
+// CHECK:       aie.end
+
+module @test_objectfifo_lock {
+  aie.device(xcve2302) {
+    %tile12 = aie.tile(1, 2)
+    %tile13 = aie.tile(1, 3)
+
+    aie.objectfifo @of0(%tile12, {%tile13}, 2 : i32) : !aie.objectfifo<memref<256xi32>>
+
+    func.func private @kernel(%buf: memref<256xi32>,
+                              %acq_lock: index, %rel_lock: index) -> ()
+
+    %core12 = aie.core(%tile12) {
+      %buf = aie.objectfifo.buffer @of0 (0) : memref<256xi32>
+      %acq_lock, %rel_lock = aie.objectfifo.lock @of0 (Produce) : (index, index)
+      func.call @kernel(%buf, %acq_lock, %rel_lock)
+          : (memref<256xi32>, index, index) -> ()
+      aie.end
+    } { link_with = "kernel.o" }
+  }
+}
diff --git a/test/objectFifo-stateful-transform/base/objectfifo_lock_buffer_depth1_aie2.mlir b/test/objectFifo-stateful-transform/base/objectfifo_lock_buffer_depth1_aie2.mlir
new file mode 100644
index 00000000000..8edaa860474
--- /dev/null
+++ b/test/objectFifo-stateful-transform/base/objectfifo_lock_buffer_depth1_aie2.mlir
@@ -0,0 +1,50 @@
+//===- objectfifo_lock_buffer_depth1_aie2.mlir -------------------*- MLIR -*-===//
+//
+// This file
is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Verify that aie.objectfifo.lock and aie.objectfifo.buffer ops work correctly
+// with depth-1 ObjectFIFOs (single buffer, no ping-pong).
+
+// RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s
+
+// CHECK: module @test_objectfifo_lock_depth1 {
+// CHECK:   aie.device(xcve2302) {
+// CHECK:     %[[TILE0:.*]] = aie.tile(1, 2)
+// CHECK:     %[[TILE1:.*]] = aie.tile(1, 3)
+// CHECK:     %[[BUF0:.*]] = aie.buffer(%[[TILE0]]) {sym_name = "of0_buff_0"} : memref<256xi32>
+// CHECK:     %[[PROD_LOCK:.*]] = aie.lock(%[[TILE0]], 0) {init = 1 : i32, sym_name = "of0_prod_lock_0"}
+// CHECK:     %[[CONS_LOCK:.*]] = aie.lock(%[[TILE0]], 1) {init = 0 : i32, sym_name = "of0_cons_lock_0"}
+
+// For Produce port on AIE2 with depth 1:
+//   acq_lock = prod_lock (lock[0])
+//   rel_lock = cons_lock (lock[1])
+//   Only one buffer exists (buf_0)
+// CHECK:     %[[CORE:.*]] = aie.core(%[[TILE0]]) {
+// CHECK:       func.call @kernel(%[[BUF0]], %[[PROD_LOCK]], %[[CONS_LOCK]])
+// CHECK:       aie.end
+
+module @test_objectfifo_lock_depth1 {
+  aie.device(xcve2302) {
+    %tile12 = aie.tile(1, 2)
+    %tile13 = aie.tile(1, 3)
+
+    aie.objectfifo @of0(%tile12, {%tile13}, 1 : i32) : !aie.objectfifo<memref<256xi32>>
+
+    func.func private @kernel(%buf: memref<256xi32>,
+                              %acq_lock: index, %rel_lock: index) -> ()
+
+    %core12 = aie.core(%tile12) {
+      %buf = aie.objectfifo.buffer @of0 (0) : memref<256xi32>
+      %acq_lock, %rel_lock = aie.objectfifo.lock @of0 (Produce) : (index, index)
+      func.call @kernel(%buf, %acq_lock, %rel_lock)
+          : (memref<256xi32>, index, index) -> ()
+      aie.end
+    } { link_with = "kernel.o" }
+  }
+}
diff --git a/test/objectFifo-stateful-transform/base/objectfifo_lock_buffer_depth3_aie2.mlir
b/test/objectFifo-stateful-transform/base/objectfifo_lock_buffer_depth3_aie2.mlir
new file mode 100644
index 00000000000..034b129057b
--- /dev/null
+++ b/test/objectFifo-stateful-transform/base/objectfifo_lock_buffer_depth3_aie2.mlir
@@ -0,0 +1,73 @@
+//===- objectfifo_lock_buffer_depth3_aie2.mlir -------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Verify that aie.objectfifo.lock and aie.objectfifo.buffer ops work correctly
+// with depth-3 ObjectFIFOs (triple buffering).
+
+// RUN: aie-opt --aie-objectFifo-stateful-transform %s | FileCheck %s
+
+// CHECK: module @test_objectfifo_lock_depth3 {
+// CHECK:   aie.device(xcve2302) {
+// CHECK:     %[[TILE0:.*]] = aie.tile(1, 2)
+// CHECK:     %[[TILE1:.*]] = aie.tile(1, 3)
+// CHECK:     %[[BUF0:.*]] = aie.buffer(%[[TILE0]]) {sym_name = "of0_buff_0"} : memref<256xi32>
+// CHECK:     %[[BUF1:.*]] = aie.buffer(%[[TILE0]]) {sym_name = "of0_buff_1"} : memref<256xi32>
+// CHECK:     %[[BUF2:.*]] = aie.buffer(%[[TILE0]]) {sym_name = "of0_buff_2"} : memref<256xi32>
+// CHECK:     %[[PROD_LOCK:.*]] = aie.lock(%[[TILE0]], 0) {init = 3 : i32, sym_name = "of0_prod_lock_0"}
+// CHECK:     %[[CONS_LOCK:.*]] = aie.lock(%[[TILE0]], 1) {init = 0 : i32, sym_name = "of0_cons_lock_0"}
+
+// For Produce port on AIE2 with depth 3:
+//   acq_lock = prod_lock, rel_lock = cons_lock
+//   Three buffers: buf_0, buf_1, buf_2
+// CHECK:     %[[CORE:.*]] = aie.core(%[[TILE0]]) {
+// CHECK:       func.call @kernel(%[[BUF0]], %[[BUF1]], %[[BUF2]], %[[PROD_LOCK]], %[[CONS_LOCK]])
+// CHECK:       aie.end
+
+// Consume port on the consumer tile:
+//   acq_lock = cons_lock, rel_lock = prod_lock
+// CHECK:     %[[CORE1:.*]] = aie.core(%[[TILE1]]) {
+// CHECK:       func.call @consumer(%[[BUF0]], %[[BUF1]], %[[BUF2]], %[[CONS_LOCK]], %[[PROD_LOCK]])
+// CHECK:       aie.end
+
+module @test_objectfifo_lock_depth3 {
+  aie.device(xcve2302) {
+    %tile12 = aie.tile(1, 2)
+    %tile13 = aie.tile(1, 3)
+
+    aie.objectfifo @of0(%tile12, {%tile13}, 3 : i32) : !aie.objectfifo<memref<256xi32>>
+
+    func.func private @kernel(%buf0: memref<256xi32>, %buf1: memref<256xi32>,
+                              %buf2: memref<256xi32>,
+                              %acq_lock: index, %rel_lock: index) -> ()
+    func.func private @consumer(%buf0: memref<256xi32>, %buf1: memref<256xi32>,
+                                %buf2: memref<256xi32>,
+                                %acq_lock: index, %rel_lock: index) -> ()
+
+    %core12 = aie.core(%tile12) {
+      %buf0 = aie.objectfifo.buffer @of0 (0) : memref<256xi32>
+      %buf1 = aie.objectfifo.buffer @of0 (1) : memref<256xi32>
+      %buf2 = aie.objectfifo.buffer @of0 (2) : memref<256xi32>
+      %acq_lock, %rel_lock = aie.objectfifo.lock @of0 (Produce) : (index, index)
+      func.call @kernel(%buf0, %buf1, %buf2, %acq_lock, %rel_lock)
+          : (memref<256xi32>, memref<256xi32>, memref<256xi32>, index, index) -> ()
+      aie.end
+    } { link_with = "kernel.o" }
+
+    %core13 = aie.core(%tile13) {
+      %buf0 = aie.objectfifo.buffer @of0 (0) : memref<256xi32>
+      %buf1 = aie.objectfifo.buffer @of0 (1) : memref<256xi32>
+      %buf2 = aie.objectfifo.buffer @of0 (2) : memref<256xi32>
+      %acq_lock, %rel_lock = aie.objectfifo.lock @of0 (Consume) : (index, index)
+      func.call @consumer(%buf0, %buf1, %buf2, %acq_lock, %rel_lock)
+          : (memref<256xi32>, memref<256xi32>, memref<256xi32>, index, index) -> ()
+      aie.end
+    } { link_with = "consumer.o" }
+  }
+}