diff --git a/.github/workflows/ci_build.yml b/.github/workflows/ci_build.yml index 5843f686..eb6e48cc 100644 --- a/.github/workflows/ci_build.yml +++ b/.github/workflows/ci_build.yml @@ -34,9 +34,13 @@ on: types: - 'published' +concurrency: + group: '${{ github.workflow }}-${{ github.event.pull_request.head.label || github.head_ref || github.ref }}' + cancel-in-progress: ${{ !contains(github.ref, 'release/')}} + jobs: build: - name: Build for Python${{ matrix.python-version }} + name: Build with Python${{ matrix.python-version }} runs-on: ubuntu-latest defaults: run: diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake index 3c081fce..9be06e9b 100644 --- a/cmake/Dependencies.cmake +++ b/cmake/Dependencies.cmake @@ -1,5 +1,6 @@ # ---[ Googletest if(BUILD_TEST) + enable_testing() add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/googletest) include_directories(BEFORE SYSTEM ${PROJECT_SOURCE_DIR}/third_party/googletest/googletest/include) include_directories(BEFORE SYSTEM ${PROJECT_SOURCE_DIR}/third_party/googletest/googlemock/include) diff --git a/csrc/core/.keep b/csrc/core/.keep deleted file mode 100644 index e69de29b..00000000 diff --git a/csrc/core/PrivateUse1Guard.h b/csrc/core/PrivateUse1Guard.h new file mode 100644 index 00000000..066e641a --- /dev/null +++ b/csrc/core/PrivateUse1Guard.h @@ -0,0 +1,146 @@ +#pragma once + +#include +#include +#include + +#include "csrc/core/impl/PrivateUse1GuardImpl.h" + +namespace c10_backend { + +// This code is kind of boilerplatey. See Note [Whither the DeviceGuard +// boilerplate] + +/// A variant of DeviceGuard that is specialized for PrivateUse1. It accepts +/// integer indices (interpreting them as PrivateUse1 devices) and is a little +/// more efficient than DeviceGuard (it compiles to straight line +/// guard-impl setDevice/getDevice calls); however, it can only be used +/// from code that links against PrivateUse1 directly. 
+template +struct PrivateUse1Guard { + /// No default constructor; see Note [Omitted default constructor from RAII] + explicit PrivateUse1Guard() = delete; + + /// Set the current PrivateUse1 device to the passed device index. + explicit PrivateUse1Guard(c10::DeviceIndex device_index) + : guard_(device_index) {} + + /// Sets the current PrivateUse1 device to the passed device. Errors if the + /// passed device is not a PrivateUse1 device. + explicit PrivateUse1Guard(c10::Device device) : guard_(device) {} + + // Copy is not allowed + PrivateUse1Guard(const PrivateUse1Guard&) = delete; + PrivateUse1Guard& operator=(const PrivateUse1Guard&) = delete; + + // Move is not allowed (there is no uninitialized state) + PrivateUse1Guard(PrivateUse1Guard&& other) = delete; + PrivateUse1Guard& operator=(PrivateUse1Guard&& other) = delete; + + /// Sets the PrivateUse1 device to the given device. Errors if the given + /// device is not a PrivateUse1 device. + void set_device(c10::Device device) { + guard_.set_device(device); + } + + /// Sets the PrivateUse1 device to the given device. Errors if the given + /// device is not a PrivateUse1 device. (This method is provided for + /// uniformity with DeviceGuard). + void reset_device(c10::Device device) { + guard_.reset_device(device); + } + + /// Sets the PrivateUse1 device to the given device index. + void set_index(c10::DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set upon construction of the guard + c10::Device original_device() const { + return guard_.original_device(); + } + + /// Returns the last device that was set via `set_device`, if any, otherwise + /// the device passed during construction. + c10::Device current_device() const { + return guard_.current_device(); + } + + private: + /// The guard for the current device. + c10::impl::InlineDeviceGuard guard_; +}; + +/// A variant of OptionalDeviceGuard that is specialized for PrivateUse1. 
See +/// PrivateUse1Guard for when you can use this. +template +struct OptionalPrivateUse1Guard { + /// Create an uninitialized OptionalPrivateUse1Guard. + explicit OptionalPrivateUse1Guard() : guard_() {} + + /// Set the current PrivateUse1 device to the passed Device, if it is not + /// nullopt. + explicit OptionalPrivateUse1Guard(std::optional device_opt) + : guard_(device_opt) {} + + /// Set the current PrivateUse1 device to the passed device index, if it is + /// not nullopt + explicit OptionalPrivateUse1Guard( + std::optional device_index_opt) + : guard_(device_index_opt) {} + + // Copy is not allowed + OptionalPrivateUse1Guard(const OptionalPrivateUse1Guard&) = delete; + OptionalPrivateUse1Guard& operator=(const OptionalPrivateUse1Guard&) = delete; + + // See Note [Move construction for RAII guards is tricky] + OptionalPrivateUse1Guard(OptionalPrivateUse1Guard&& other) = delete; + // See Note [Move assignment for RAII guards is tricky] + OptionalPrivateUse1Guard& operator=(OptionalPrivateUse1Guard&& other) = + delete; + + /// Sets the PrivateUse1 device to the given device, initializing the guard if + /// it is not already initialized. Errors if the given device is not a + /// PrivateUse1 device. + void set_device(c10::Device device) { + guard_.set_device(device); + } + + /// Sets the PrivateUse1 device to the given device, initializing the guard if + /// it is not already initialized. Errors if the given device is not a + /// PrivateUse1 device. (This method is provided for uniformity with + /// OptionalDeviceGuard). + void reset_device(c10::Device device) { + guard_.reset_device(device); + } + + /// Sets the PrivateUse1 device to the given device index, initializing the + /// guard if it is not already initialized. + void set_index(c10::DeviceIndex device_index) { + guard_.set_index(device_index); + } + + /// Returns the device that was set immediately prior to initialization of the + /// guard, or nullopt if the guard is uninitialized. 
+ std::optional original_device() const { + return guard_.original_device(); + } + + /// Returns the most recent device that was set using this device guard, + /// either from construction, or via set_device, if the guard is initialized, + /// or nullopt if the guard is uninitialized. + std::optional current_device() const { + return guard_.current_device(); + } + + /// Restore the original PrivateUse1 device, resetting this guard to + /// uninitialized state. + void reset() { + guard_.reset(); + } + + private: + c10::impl::InlineOptionalDeviceGuard guard_; +}; + +} // namespace c10_backend diff --git a/csrc/core/impl/PrivateUse1GuardImpl.h b/csrc/core/impl/PrivateUse1GuardImpl.h new file mode 100644 index 00000000..37c1be5f --- /dev/null +++ b/csrc/core/impl/PrivateUse1GuardImpl.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include +#include + +namespace c10_backend::impl { + +/** + * All classes which inherit from PrivateUse1GuardImpl should be declared + * 'final'. + */ +struct PrivateUse1GuardImpl : public c10::impl::DeviceGuardImplInterface { + static constexpr c10::DeviceType static_type = c10::DeviceType::PrivateUse1; + + PrivateUse1GuardImpl() = default; + + explicit PrivateUse1GuardImpl(c10::DeviceType t) { + TORCH_INTERNAL_ASSERT(t == c10::DeviceType::PrivateUse1); + } + + c10::DeviceType type() const final { + return c10::DeviceType::PrivateUse1; + } +}; + +} // namespace c10_backend::impl diff --git a/csrc/npu/CachingAllocatorHelper.h b/csrc/npu/CachingAllocatorHelper.h index 11811705..b695aeee 100644 --- a/csrc/npu/CachingAllocatorHelper.h +++ b/csrc/npu/CachingAllocatorHelper.h @@ -129,7 +129,7 @@ ExpandableSegment* createExpandableSegment( size_t size); // Wraps the insert event function -void insertEventWrapper(int device, std::function insertEventFn); +void insertEventWrapper(c10::DeviceIndex device, std::function insertEventFn); // Returns the current stream for the given device void* getCurrentStream(c10::DeviceIndex); diff --git 
a/csrc/npu/NPUCachingAllocator.cpp b/csrc/npu/NPUCachingAllocator.cpp index 8a3b2071..99132b22 100644 --- a/csrc/npu/NPUCachingAllocator.cpp +++ b/csrc/npu/NPUCachingAllocator.cpp @@ -128,7 +128,7 @@ struct BlockPool { }; struct Block { - int device; + c10::DeviceIndex device; void* stream; // allocation stream stream_set stream_uses; // streams on which the block was used size_t size; // block size in bytes @@ -155,7 +155,7 @@ struct Block { // memory out from our cache. std::shared_ptr context_when_segment_allocated; - Block(int device, void* stream, size_t size, BlockPool* pool, void* ptr) + Block(c10::DeviceIndex device, void* stream, size_t size, BlockPool* pool, void* ptr) : device(device), stream(stream), stream_uses(), @@ -170,7 +170,7 @@ struct Block { gc_count(0) {} // constructor for search key - Block(int device, void* stream, size_t size) + Block(c10::DeviceIndex device, void* stream, size_t size) : device(device), stream(stream), stream_uses(), @@ -244,7 +244,7 @@ static std::string format_size(uint64_t size) { struct AllocParams { AllocParams( - int device, + c10::DeviceIndex device, size_t size, void* stream, BlockPool* pool, @@ -256,7 +256,7 @@ struct AllocParams { block(nullptr), err(ACL_ERROR_NONE) {} - int device() const { + c10::DeviceIndex device() const { return search_key.device; } void* stream() const { @@ -641,7 +641,7 @@ class DeviceCachingAllocator { // All public methods (except the above) acquire the allocator mutex. // Thus, do not call a public method from another public method. - Block* malloc(int device, size_t orig_size, void* stream) { + Block* malloc(c10::DeviceIndex device, size_t orig_size, void* stream) { // done outside the lock because we don't know what locks the recorder needs // to have... 
auto context = maybeGatherContext(RecordContext::STATE); @@ -2012,7 +2012,7 @@ class NpuCachingAllocator : public NPUAllocator { return !device_allocator.empty(); } /** allocates a block which is safe to use from the provided stream */ - void malloc(void** devPtr, int device, size_t size, void* stream) { + void malloc(void** devPtr, c10::DeviceIndex device, size_t size, void* stream) { Block* block = device_allocator[device]->malloc(device, size, stream); add_allocated_block(block); *devPtr = static_cast(block->ptr); @@ -2060,7 +2060,7 @@ class NpuCachingAllocator : public NPUAllocator { } bool isHistoryEnabled() override { - int device = 0; + c10::DeviceIndex device = 0; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); return device_allocator[device]->isHistoryEnabled(); } @@ -2154,7 +2154,7 @@ class NpuCachingAllocator : public NPUAllocator { } c10::DataPtr allocate(size_t size) override { - int device = 0; + c10::DeviceIndex device = 0; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); void* devPtr = nullptr; void (*deleteFunc)(void*) = &local_raw_delete; @@ -2215,7 +2215,7 @@ class NpuCachingAllocator : public NPUAllocator { if (nbytes == 0) { return nullptr; } - int device = 0; + c10::DeviceIndex device = 0; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); void* r = nullptr; malloc(&r, device, nbytes, getCurrentStream(device)); @@ -2226,7 +2226,7 @@ class NpuCachingAllocator : public NPUAllocator { if (nbytes == 0) { return nullptr; } - int device; + c10::DeviceIndex device; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); void* r = nullptr; malloc(&r, device, nbytes, stream); diff --git a/csrc/npu/NPUFunctions.cpp b/csrc/npu/NPUFunctions.cpp index f04dd180..3b96cfda 100644 --- a/csrc/npu/NPUFunctions.cpp +++ b/csrc/npu/NPUFunctions.cpp @@ -1,5 +1,4 @@ #include "csrc/npu/NPUFunctions.h" -#include #include #include #include "csrc/npu/NPUStream.h" @@ -7,40 +6,91 @@ namespace c10_npu { -static uint32_t dev_count = 0; -static thread_local int local_device = -1; -static 
std::unordered_map used_devices; +static thread_local c10::DeviceIndex local_device = -1; +// TODO: remove used_devices +static std::unordered_map used_devices; std::mutex mtx; +int device_count_impl() { + int count = 0; + NPU_CHECK_ERROR(GetDeviceCount(&count)); + return count; +} + c10::DeviceIndex device_count() noexcept { // initialize number of devices only once - if (dev_count == 0) { - aclError error = aclrtGetDeviceCount(&dev_count); - if (error != ACL_ERROR_NONE) { - ASCEND_LOGE("get device count of NPU failed"); + static int count = []() { + try { + auto result = device_count_impl(); + TORCH_INTERNAL_ASSERT( + result <= std::numeric_limits::max(), + "Too many NPU devices, DeviceIndex overflowed"); + return result; + } catch (const c10::Error& ex) { + // We don't want to fail, but still log the warning + // msg() returns the message without the stack trace + TORCH_WARN("NPU initialization: ", ex.msg()); return 0; } - return static_cast(dev_count); - } - return static_cast(dev_count); + }(); + return static_cast(count); } c10::DeviceIndex device_count_ensure_non_zero() { - unsigned int count = 0; - - NPU_CHECK_ERROR(aclrtGetDeviceCount(&count)); + // Call the implementation every time to throw the exception + int count = device_count_impl(); + // Zero npus doesn't produce a warning in `device_count` but we fail here TORCH_CHECK(count, "No NPUs are available", PTA_ERROR(ErrCode::UNAVAIL)); - + TORCH_INTERNAL_ASSERT( + count <= std::numeric_limits::max(), + "Too many NPU devices, DeviceIndex overflowed"); return static_cast(count); } -aclError GetDevice(int32_t* device) { +c10::DeviceIndex current_device() { + c10::DeviceIndex cur_device = -1; + NPU_CHECK_ERROR(c10_npu::GetDevice(&cur_device)); + return cur_device; +} + +void set_device(c10::DeviceIndex device) { + NPU_CHECK_ERROR(c10_npu::SetDevice(device)); +} + +void device_synchronize() { + NPU_CHECK_ERROR(aclrtSynchronizeDevice()); +} + +// this function has to be called from callers performing npu 
synchronizing +// operations, to raise proper error or warning +void warn_or_error_on_sync() { + if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_ERROR) { + TORCH_CHECK( + false, "called a synchronizing NPU operation", PTA_ERROR(ErrCode::ACL)); + } else if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_WARN) { + TORCH_NPU_WARN("called a synchronizing NPU operation"); + } +} + +// Wrappers for raw AscendCL device management functions +aclError GetDeviceCount(int* dev_count) { + return aclrtGetDeviceCount(reinterpret_cast(dev_count)); +} + +aclError GetDevice(c10::DeviceIndex* device) { if (local_device >= 0) { *device = local_device; return ACL_ERROR_NONE; } - aclError err = aclrtGetDevice(device); + int tmp_device = -1; + auto err = aclrtGetDevice(&tmp_device); if (err == ACL_ERROR_NONE) { + TORCH_INTERNAL_ASSERT( + tmp_device >= 0 && + tmp_device <= std::numeric_limits::max(), + "aclrtGetDevice returns invalid device ", + tmp_device); + *device = static_cast(tmp_device); local_device = *device; } else if ( err == ACL_ERROR_RT_CONTEXT_NULL && aclrtSetDevice(0) == ACL_ERROR_NONE) { @@ -57,19 +107,6 @@ aclError GetDevice(int32_t* device) { return err; } -inline bool has_set_pthread_affinity() { - int core_nums = sysconf(_SC_NPROCESSORS_ONLN); - int count = 0; - - cpu_set_t mask; - pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask); - for (int i = 0; i < core_nums; i++) { - count += CPU_ISSET(i, &mask); - } - - return count != core_nums; -} - aclError SetDevice(c10::DeviceIndex device) { TORCH_CHECK( device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE)); @@ -78,26 +115,6 @@ aclError SetDevice(c10::DeviceIndex device) { return ACL_ERROR_NONE; } - static const bool set_pthread_affinity = has_set_pthread_affinity(); - if (!set_pthread_affinity) { - uint32_t bind_conf = c10_npu::option::OptionsManager::GetBindCpuConf(); - // bind_conf=1, bind cores averagely based on device_id - if (bind_conf == 1) { - int core_nums = 
sysconf(_SC_NPROCESSORS_ONLN); - int device_nums = device_count_ensure_non_zero(); - int block_size = (core_nums + device_nums - 1) / device_nums; - int start_core = device * block_size; - int end_core = std::min((device + 1) * block_size, core_nums); - - cpu_set_t mask; - CPU_ZERO(&mask); - for (int i = start_core; i < end_core; i++) { - CPU_SET(i, &mask); - } - pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask); - } - } - aclError err = aclrtSetDevice(device); if (err == ACL_ERROR_NONE) { local_device = device; @@ -111,6 +128,15 @@ aclError SetDevice(c10::DeviceIndex device) { return err; } +aclrtContext GetDeviceContext(c10::DeviceIndex device) { + if (used_devices.find(device) == used_devices.end()) { + ASCEND_LOGE( + "NPU device %d has not been initialized! Can not get context", device); + return nullptr; + } + return used_devices[device]; +} + aclError ResetUsedDevices() { for (const auto it : used_devices) { aclError err = aclrtResetDevice(it.first); @@ -123,7 +149,7 @@ aclError ResetUsedDevices() { } aclError DestroyUsedStreams() { - int32_t cur_device = 0; + c10::DeviceIndex cur_device = 0; NPU_CHECK_ERROR(GetDevice(&cur_device)); for (const auto it : used_devices) { NPU_CHECK_ERROR(SetDevice(it.first)); @@ -138,7 +164,7 @@ aclError DestroyUsedStreams() { } aclError SynchronizeUsedDevices() { - int32_t cur_device = 0; + c10::DeviceIndex cur_device = 0; NPU_CHECK_ERROR(GetDevice(&cur_device)); for (const auto it : used_devices) { NPU_CHECK_ERROR(SetDevice(it.first)); @@ -151,37 +177,4 @@ aclError SynchronizeUsedDevices() { return ACL_ERROR_NONE; } -aclrtContext GetDeviceContext(int32_t device) { - if (used_devices.find(device) == used_devices.end()) { - ASCEND_LOGE( - "NPU device %d has been initialized! 
Can not get context", device); - return nullptr; - } - return used_devices[device]; -} - -c10::DeviceIndex current_device() { - int cur_device = 0; - NPU_CHECK_ERROR(c10_npu::GetDevice(&cur_device)); - return static_cast(cur_device); -} - -void set_device(c10::DeviceIndex device) { - NPU_CHECK_ERROR(c10_npu::SetDevice(device)); -} - -void device_synchronize() { - NPU_CHECK_ERROR(aclrtSynchronizeDevice()); -} - -int ExchangeDevice(int device) { - NPU_CHECK_ERROR(SetDevice(device)); - - return device; -} - -int GetLocalDevice() { - return local_device; -} - } // namespace c10_npu diff --git a/csrc/npu/NPUFunctions.h b/csrc/npu/NPUFunctions.h index 18cad7f0..575e14c4 100644 --- a/csrc/npu/NPUFunctions.h +++ b/csrc/npu/NPUFunctions.h @@ -1,11 +1,11 @@ #pragma once -// This header provides C++ wrappers around commonly used CUDA API functions. +// This header provides C++ wrappers around commonly used AscendCL API functions. // The benefit of using C++ here is that we can raise an exception in the // event of an error, rather than explicitly pass around error codes. This // leads to more natural APIs. // -// The naming convention used here matches the naming convention of torch.cuda +// The naming convention used here matches the naming convention of torch.npu #include #include @@ -19,56 +19,34 @@ namespace c10_npu { C10_NPU_API c10::DeviceIndex device_count() noexcept; +// Version of device_count that throws if no devices are detected C10_NPU_API c10::DeviceIndex device_count_ensure_non_zero(); -/** - * @ingroup torch_npu - * @brief get device id from local thread cache preferentially for performance. - * If the thread cache has not been initialized, it will get from ACL interface: - * aclrtGetDevice, and initialize the local thread cache. - * If the context is empty, it will set device 0. - * - * @param device [IN] device id - * @retval ACL_ERROR_NONE The function is successfully executed. 
- * @retval OtherValues Failure - */ -C10_NPU_API aclError GetDevice(int32_t* device); - -/** - * @ingroup torch_npu - * @brief set device id by ACL interface: aclrtSetDevice, - * and update the local thread cache - * - * @param device [IN] device id - * @retval ACL_ERROR_NONE The function is successfully executed. - * @retval OtherValues Failure - */ -C10_NPU_API aclError SetDevice(c10::DeviceIndex device); +C10_NPU_API c10::DeviceIndex current_device(); -/** - * @ingroup torch_npu - * @brief reset all device id by ACL interface: aclrtResetDevice. - * - * @retval ACL_ERROR_NONE The function is successfully executed. - * @retval OtherValues Failure - */ -aclError ResetUsedDevices(); +C10_NPU_API void set_device(c10::DeviceIndex device); -aclError DestroyUsedStreams(); +C10_NPU_API void device_synchronize(); -aclError SynchronizeUsedDevices(); +// this function has to be called from callers performing npu synchronizing +// operations, to raise proper error or warning +C10_NPU_API void warn_or_error_on_sync(); -aclrtContext GetDeviceContext(int32_t device); +// Raw AscendCL device management functions +C10_NPU_API aclError GetDeviceCount(int* dev_count); -C10_NPU_API c10::DeviceIndex current_device(); +C10_NPU_API aclError GetDevice(c10::DeviceIndex* device); -C10_NPU_API void set_device(c10::DeviceIndex device); +C10_NPU_API aclError SetDevice(c10::DeviceIndex device); -C10_NPU_API void device_synchronize(); +C10_NPU_API aclrtContext GetDeviceContext(c10::DeviceIndex device); + +// TODO: remove the following three functions +aclError ResetUsedDevices(); -C10_NPU_API int ExchangeDevice(int device); +aclError DestroyUsedStreams(); -int GetLocalDevice(); +aclError SynchronizeUsedDevices(); enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR }; @@ -76,8 +54,8 @@ enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR }; // through this global state to determine the synchronization debug mode class WarningState { public: - void set_sync_debug_mode(SyncDebugMode 
level) { - sync_debug_mode = level; + void set_sync_debug_mode(SyncDebugMode l) { + sync_debug_mode = l; } SyncDebugMode get_sync_debug_mode() { @@ -88,20 +66,9 @@ class WarningState { SyncDebugMode sync_debug_mode = SyncDebugMode::L_DISABLED; }; -C10_NPU_API inline WarningState& warning_state() { +C10_NPU_API __inline__ WarningState& warning_state() { static WarningState warning_state_; return warning_state_; } -// this function has to be called from callers performing npu synchronizing -// operations, to raise proper error or warning -C10_NPU_API inline void warn_or_error_on_sync() { - if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_ERROR) { - TORCH_CHECK( - false, "called a synchronizing NPU operation", PTA_ERROR(ErrCode::ACL)); - } else if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_WARN) { - TORCH_NPU_WARN("called a synchronizing NPU operation"); - } -} - } // namespace c10_npu diff --git a/csrc/npu/NPUGuardImpl.cpp b/csrc/npu/NPUGuardImpl.cpp index d120a8ce..bfc123f5 100644 --- a/csrc/npu/NPUGuardImpl.cpp +++ b/csrc/npu/NPUGuardImpl.cpp @@ -9,8 +9,6 @@ namespace c10_npu { namespace impl { -constexpr c10::DeviceType NPUGuardImpl::static_type; - C10_REGISTER_GUARD_IMPL(PrivateUse1, NPUGuardImpl); #define REGISTER_PRIVATEUSE1_BACKEND(name) \ diff --git a/csrc/npu/NPUGuardImpl.h b/csrc/npu/NPUGuardImpl.h index 4b7d9954..61550e65 100644 --- a/csrc/npu/NPUGuardImpl.h +++ b/csrc/npu/NPUGuardImpl.h @@ -8,29 +8,25 @@ #include #include #include "csrc/aten/generated/NPUNativeFunctions.h" -#include "npu/core/NPUException.h" +#include "csrc/core/impl/PrivateUse1GuardImpl.h" #include "csrc/npu/NPUFunctions.h" #include "csrc/npu/NPUStream.h" +#include "npu/core/NPUException.h" #include "npu/core/interface/AsyncTaskQueueInterface.h" #include "npu/core/sys_ctrl/npu_sys_ctrl.h" namespace c10_npu { namespace impl { +struct NPUGuardImpl final : public c10_backend::impl::PrivateUse1GuardImpl { + NPUGuardImpl() = default; -struct NPUGuardImpl final : 
public c10::impl::DeviceGuardImplInterface { - static constexpr c10::DeviceType static_type = c10::DeviceType::PrivateUse1; - - NPUGuardImpl() {} explicit NPUGuardImpl(c10::DeviceType t) { TORCH_INTERNAL_ASSERT( - t == c10::DeviceType::PrivateUse1, - "DeviceType must be NPU. Actual DeviceType is: ", + t == static_type, + "DeviceType must be 'c10::DeviceType::PrivateUse1'. Actual DeviceType is: ", t, PTA_ERROR(ErrCode::PARAM)); } - c10::DeviceType type() const override { - return c10::DeviceType::PrivateUse1; - } c10::Device exchangeDevice(c10::Device d) const override { TORCH_INTERNAL_ASSERT( d.type() == c10::DeviceType::PrivateUse1, @@ -44,14 +40,14 @@ struct NPUGuardImpl final : public c10::impl::DeviceGuardImplInterface { return old_device; } c10::Device getDevice() const override { - int device = 0; + c10::DeviceIndex device = 0; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); return c10::Device(c10::DeviceType::PrivateUse1, device); } void setDevice(c10::Device d) const override { TORCH_INTERNAL_ASSERT( d.type() == c10::DeviceType::PrivateUse1, - "DeviceType must be NPU. Actual DeviceType is: ", + "DeviceType must be 'c10::DeviceType::PrivateUse1'. Actual DeviceType is: ", d.type(), PTA_ERROR(ErrCode::PARAM)); NPU_CHECK_ERROR(c10_npu::SetDevice(d.index())); diff --git a/csrc/npu/NPUStream.cpp b/csrc/npu/NPUStream.cpp index e653fea2..361bf0d2 100644 --- a/csrc/npu/NPUStream.cpp +++ b/csrc/npu/NPUStream.cpp @@ -143,7 +143,7 @@ static void initGlobalStreamState() { "). 
Increase that and recompile.", PTA_ERROR(ErrCode::VALUE)); - int device_id = 0; + c10::DeviceIndex device_id = 0; auto ret = c10_npu::GetDevice(&device_id); if (ret != ACL_ERROR_NONE) { ASCEND_LOGE("Device has not been set"); diff --git a/csrc/npu/THNPUCachingHostAllocator.cpp b/csrc/npu/THNPUCachingHostAllocator.cpp index a3deb5e5..d6247394 100644 --- a/csrc/npu/THNPUCachingHostAllocator.cpp +++ b/csrc/npu/THNPUCachingHostAllocator.cpp @@ -246,7 +246,7 @@ struct HostAllocator { aclError insertEvents(Block& block) { aclError err = ACL_ERROR_NONE; - int prev_device = 0; + c10::DeviceIndex prev_device = 0; err = c10_npu::GetDevice(&prev_device); if (err != ACL_ERROR_NONE) { return err; @@ -338,9 +338,7 @@ at::Allocator* getTHNPUCachingHostAllocator() { c10::Allocator* getPinnedMemoryAllocator() { C10_LOG_API_USAGE_ONCE("aten.init.npu"); - c10_npu::NpuSysCtrl::SysStatus status = - c10_npu::NpuSysCtrl::GetInstance().Initialize(); - if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + if (!c10_npu::NpuSysCtrl::IsInitializeSuccess()) { ASCEND_LOGE("Npu init fail."); } return getTHNPUCachingHostAllocator(); diff --git a/npu/core/CachingAllocatorHelper.cpp b/npu/core/CachingAllocatorHelper.cpp index 9fe60e8b..086ce80e 100644 --- a/npu/core/CachingAllocatorHelper.cpp +++ b/npu/core/CachingAllocatorHelper.cpp @@ -177,7 +177,7 @@ ExpandableSegment* createExpandableSegment( return new NPUExpandableSegment(device, stream, size); } -void insertEventWrapper(int device, std::function fn) { +void insertEventWrapper(c10::DeviceIndex device, std::function fn) { aclrtContext compiler_ctx = aclrtContext(); aclError ret_ctx = aclrtGetCurrentContext(&compiler_ctx); NPU_CHECK_ERROR(aclrtSetCurrentContext(c10_npu::GetDeviceContext(device))); diff --git a/npu/core/DeviceUtils.h b/npu/core/DeviceUtils.h index 243398df..129c2d4d 100644 --- a/npu/core/DeviceUtils.h +++ b/npu/core/DeviceUtils.h @@ -46,9 +46,7 @@ inline c10::DeviceType get_npu_device_type() { inline void 
maybe_initialize_npu(const at::TensorOptions& options) { if (torch_npu::utils::is_npu(options)) { - c10_npu::NpuSysCtrl::SysStatus status = - c10_npu::NpuSysCtrl::GetInstance().Initialize(options.device().index()); - if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + if (!c10_npu::NpuSysCtrl::IsInitializeSuccess(options.device().index())) { TORCH_CHECK( false, "npu device ", @@ -61,9 +59,7 @@ inline void maybe_initialize_npu(const at::TensorOptions& options) { inline void maybe_initialize_npu(const at::Device& device) { if (torch_npu::utils::is_npu(device)) { - c10_npu::NpuSysCtrl::SysStatus status = - c10_npu::NpuSysCtrl::GetInstance().Initialize(device.index()); - if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) { + if (!c10_npu::NpuSysCtrl::IsInitializeSuccess(device.index())) { TORCH_CHECK( false, "npu device ", diff --git a/npu/core/NPUException.cpp b/npu/core/NPUException.cpp index 2474c7c4..13fdcc07 100644 --- a/npu/core/NPUException.cpp +++ b/npu/core/NPUException.cpp @@ -38,7 +38,7 @@ void warn_(const ::c10::Warning& warning) { std::string formatErrorCode(SubModule submodule, ErrCode errorCode) { std::ostringstream oss; - int deviceIndex = -1; + c10::DeviceIndex deviceIndex = -1; c10_npu::GetDevice(&deviceIndex); auto rank_id = c10_npu::option::OptionsManager::GetRankId(); oss << "\n[ERROR] " << getCurrentTimestamp() << " (PID:" << getpid() diff --git a/npu/core/NPUGuard.h b/npu/core/NPUGuard.h index 816eb5fc..83fbc892 100644 --- a/npu/core/NPUGuard.h +++ b/npu/core/NPUGuard.h @@ -4,6 +4,7 @@ #include #include #include "npu/core/NPUMacros.h" +#include "csrc/core/PrivateUse1Guard.h" #include "csrc/npu/NPUGuardImpl.h" #include @@ -18,16 +19,9 @@ namespace c10_npu { /// more efficient than DeviceGuard (it compiles to straight line /// NPUSetDevice/NPUGetDevice calls); however, it can only be used /// from code that links against NPU directly. 
-struct NPUGuard { - /// No default constructor; see Note [Omitted default constructor from RAII] - explicit NPUGuard() = delete; - - /// Set the current NPU device to the passed device index. - explicit NPUGuard(c10::DeviceIndex device_index) : guard_(device_index) {} - - /// Sets the current NPU device to the passed device. Errors if the passed - /// device is not a NPU device. - explicit NPUGuard(c10::Device device) : guard_(device) {} +struct NPUGuard : public c10_backend::PrivateUse1Guard { + using PrivateUse1Guard = c10_backend::PrivateUse1Guard; + using PrivateUse1Guard::PrivateUse1Guard; // Copy is not allowed NPUGuard(const NPUGuard&) = delete; @@ -36,55 +30,15 @@ struct NPUGuard { // Move is not allowed (there is no uninitialized state) NPUGuard(NPUGuard&& other) = delete; NPUGuard& operator=(NPUGuard&& other) = delete; - - /// Sets the NPU device to the given device. Errors if the given device - /// is not a NPU device. - void set_device(c10::Device device) { - guard_.set_device(device); - } - - /// Sets the NPU device to the given device. Errors if the given device - /// is not a NPU device. (This method is provided for uniformity with - /// DeviceGuard). - void reset_device(c10::Device device) { - guard_.reset_device(device); - } - - /// Sets the NPU device to the given device index. - void set_index(c10::DeviceIndex device_index) { - guard_.set_index(device_index); - } - - /// Returns the device that was set upon construction of the guard - c10::Device original_device() const { - return guard_.original_device(); - } - - /// Returns the last device that was set via `set_device`, if any, otherwise - /// the device passed during construction. - c10::Device current_device() const { - return guard_.current_device(); - } - - private: - /// The guard for the current device. - c10::impl::InlineDeviceGuard guard_; }; /// A variant of OptionalDeviceGuard that is specialized for NPU. See /// NPUGuard for when you can use this. 
-struct OptionalNPUGuard { - /// Create an uninitialized OptionalNPUGuard. - explicit OptionalNPUGuard() : guard_() {} - - /// Set the current NPU device to the passed Device, if it is not nullopt. - explicit OptionalNPUGuard(c10::optional device_opt) - : guard_(device_opt) {} - - /// Set the current NPU device to the passed device index, if it is not - /// nullopt - explicit OptionalNPUGuard(c10::optional device_index_opt) - : guard_(device_index_opt) {} +struct OptionalNPUGuard + : public c10_backend::OptionalPrivateUse1Guard { + using OptionalPrivateUse1Guard = + c10_backend::OptionalPrivateUse1Guard; + using OptionalPrivateUse1Guard::OptionalPrivateUse1Guard; // Copy is not allowed OptionalNPUGuard(const OptionalNPUGuard&) = delete; @@ -92,51 +46,8 @@ struct OptionalNPUGuard { // See Note [Move construction for RAII guards is tricky] OptionalNPUGuard(OptionalNPUGuard&& other) = delete; - // See Note [Move assignment for RAII guards is tricky] OptionalNPUGuard& operator=(OptionalNPUGuard&& other) = delete; - - /// Sets the NPU device to the given device, initializing the guard if it - /// is not already initialized. Errors if the given device is not a NPU - /// device. - void set_device(c10::Device device) { - guard_.set_device(device); - } - - /// Sets the NPU device to the given device, initializing the guard if it is - /// not already initialized. Errors if the given device is not a NPU device. - /// (This method is provided for uniformity with OptionalDeviceGuard). - void reset_device(c10::Device device) { - guard_.reset_device(device); - } - - /// Sets the NPU device to the given device index, initializing the guard if - /// it is not already initialized. - void set_index(c10::DeviceIndex device_index) { - guard_.set_index(device_index); - } - - /// Returns the device that was set immediately prior to initialization of the - /// guard, or nullopt if the guard is uninitialized. 
- c10::optional original_device() const { - return guard_.original_device(); - } - - /// Returns the most recent device that was set using this device guard, - /// either from construction, or via set_device, if the guard is initialized, - /// or nullopt if the guard is uninitialized. - c10::optional current_device() const { - return guard_.current_device(); - } - - /// Restore the original NPU device, resetting this guard to uninitialized - /// state. - void reset() { - guard_.reset(); - } - - private: - c10::impl::InlineOptionalDeviceGuard guard_; }; /// A variant of StreamGuard that is specialized for NPU. See NPUGuard diff --git a/npu/core/sys_ctrl/npu_sys_ctrl.cpp b/npu/core/sys_ctrl/npu_sys_ctrl.cpp index a26cc060..2a094ad4 100644 --- a/npu/core/sys_ctrl/npu_sys_ctrl.cpp +++ b/npu/core/sys_ctrl/npu_sys_ctrl.cpp @@ -150,6 +150,16 @@ void initCachingAllocator() { ASCEND_LOGD("Npu caching allocator initialize successfully"); } +bool NpuSysCtrl::IsInitializeSuccess(int device_id) { + SysStatus status = GetInstance().Initialize(device_id); + return status == SysStatus::INIT_SUCC; +} + +bool NpuSysCtrl::IsFinalizeSuccess() { + SysStatus status = GetInstance().Finalize(); + return status == SysStatus::FINALIZE_SUCC; +} + // Environment Initialize, return Status: SUCCESS, FAILED NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) { if (init_flag_) { diff --git a/npu/core/sys_ctrl/npu_sys_ctrl.h b/npu/core/sys_ctrl/npu_sys_ctrl.h index 08446231..27d595dd 100644 --- a/npu/core/sys_ctrl/npu_sys_ctrl.h +++ b/npu/core/sys_ctrl/npu_sys_ctrl.h @@ -37,6 +37,10 @@ class NpuSysCtrl { // Get NpuSysCtrl singleton instance C10_NPU_API static NpuSysCtrl& GetInstance(); + C10_NPU_API static bool IsInitializeSuccess(int device_id = -1); + + C10_NPU_API static bool IsFinalizeSuccess(); + // Environment Initialize, return SysStatus SysStatus Initialize(int device_id = -1); diff --git a/npu/framework/OpCommand.cpp b/npu/framework/OpCommand.cpp index d184f2f9..f4e63b7e 100644 
--- a/npu/framework/OpCommand.cpp +++ b/npu/framework/OpCommand.cpp @@ -270,7 +270,7 @@ at::Tensor OpCommand::CopyHostToDevice( at::Tensor OpCommand::CopyHostToDevice(const at::Tensor& cpuTensor) { at::Tensor cpuPinMemTensor = cpuTensor.pin_memory(); - int deviceIndex = 0; + c10::DeviceIndex deviceIndex = 0; NPU_CHECK_ERROR(c10_npu::GetDevice(&deviceIndex)); auto tensor = cpuPinMemTensor.to( c10::Device(c10::DeviceType::PrivateUse1, deviceIndex), diff --git a/npu/framework/utils/CalcuOpUtil.cpp b/npu/framework/utils/CalcuOpUtil.cpp index 9fc8da98..bb5a0084 100644 --- a/npu/framework/utils/CalcuOpUtil.cpp +++ b/npu/framework/utils/CalcuOpUtil.cpp @@ -186,7 +186,7 @@ at::Tensor CalcuOpUtil::CopyScalarToDevice( at::Tensor CalcuOpUtil::CopyTensorHostToDevice(const at::Tensor& cpu_tensor) { at::Tensor cpuPinMemTensor = cpu_tensor.pin_memory(); - int deviceIndex = 0; + c10::DeviceIndex deviceIndex = 0; NPU_CHECK_ERROR(c10_npu::GetDevice(&deviceIndex)); return cpuPinMemTensor.to( c10::Device(c10::DeviceType::PrivateUse1, deviceIndex), diff --git a/npu/framework/utils/NpuUtils.cpp b/npu/framework/utils/NpuUtils.cpp index 2878493b..fc9fb52b 100644 --- a/npu/framework/utils/NpuUtils.cpp +++ b/npu/framework/utils/NpuUtils.cpp @@ -256,7 +256,7 @@ at::Tensor NpuUtils::format_contiguous_add_copy_optimize( bool NpuUtils::IsOomError(aclError ret, int index) { if (ret == ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED) { - int deviceId = 0; + c10::DeviceIndex deviceId = 0; // free devcie cached memory when return value of the first op execution is // oom if (index == 1) { diff --git a/test/cpp/core/CMakeLists.txt b/test/cpp/core/CMakeLists.txt index dade077c..f86b855e 100644 --- a/test/cpp/core/CMakeLists.txt +++ b/test/cpp/core/CMakeLists.txt @@ -1,9 +1,11 @@ if(BUILD_TEST) set(TORCH_BACKEND_CORE_TEST_SOURCES - ${PROJECT_SOURCE_DIR}/test/cpp/common/main.cpp) + ${PROJECT_SOURCE_DIR}/test/cpp/common/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp) add_executable(test_core 
${TORCH_BACKEND_CORE_TEST_SOURCES}) target_link_libraries(test_core PRIVATE torch_npu gtest_main gtest) + add_test(NAME test_core COMMAND $) endif() if(INSTALL_TEST) diff --git a/test/cpp/core/device.cpp b/test/cpp/core/device.cpp new file mode 100644 index 00000000..5ec9c54d --- /dev/null +++ b/test/cpp/core/device.cpp @@ -0,0 +1,9 @@ +#include + +#include "csrc/core/impl/PrivateUse1GuardImpl.h" + +TEST(DeviceGuardTest, StaticType) { + EXPECT_EQ( + c10_backend::impl::PrivateUse1GuardImpl::static_type, + c10::DeviceType::PrivateUse1); +} diff --git a/torch_npu/csrc/Module.cpp b/torch_npu/csrc/Module.cpp index 0e4f2e48..9facb8d6 100644 --- a/torch_npu/csrc/Module.cpp +++ b/torch_npu/csrc/Module.cpp @@ -63,9 +63,7 @@ PyObject* THPModule_npu_shutdown(PyObject* /* unused */) { } ASCEND_LOGI("NPU shutdown NpuSysCtrl Finalize."); - c10_npu::NpuSysCtrl::SysStatus status = - c10_npu::NpuSysCtrl::GetInstance().Finalize(); - if (status != c10_npu::NpuSysCtrl::SysStatus::FINALIZE_SUCC) { + if (!c10_npu::NpuSysCtrl::IsFinalizeSuccess()) { ASCEND_LOGE("NPU shutdown failed."); } else { ASCEND_LOGI("NPU shutdown success."); diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp index 4e192110..35fd94dc 100644 --- a/torch_npu/csrc/npu/Module.cpp +++ b/torch_npu/csrc/npu/Module.cpp @@ -3,15 +3,11 @@ #include #include -#include #include #include #include #include -#include -#include #include -#include #include #include #include @@ -30,11 +26,9 @@ #include "npu/core/NpuVariables.h" #include "npu/core/register/OptionRegister.h" #include "npu/core/sys_ctrl/npu_sys_ctrl.h" -#include "npu/framework/StorageDescHelper.h" #include "npu/acl/include/acl/acl.h" #include "torch_npu/csrc/npu/Module.h" #include "torch_npu/csrc/npu/NPUPluggableAllocator.h" -#include "torch_npu/csrc/npu/Stream.h" #include "torch_npu/csrc/npu/memory_snapshot.h" struct NPUDeviceProp { @@ -107,7 +101,7 @@ void RegisterNPUDeviceMemories(PyObject* module) { .def_readonly("free_memory", 
&NPUDeviceMem::freeMem); } -NPUDeviceMem* GetDeviceMemories(int64_t deviceid) { +NPUDeviceMem* GetDeviceMemories(c10::DeviceIndex deviceid) { c10_npu::NPUGuard guard(deviceid); size_t device_free; size_t device_total; @@ -121,7 +115,7 @@ void BindGetDeviceMemories(PyObject* module) { auto m = py::handle(module).cast(); m.def( "_npu_getDeviceMemories", - [](int deviceid) -> NPUDeviceMem* { return GetDeviceMemories(deviceid); }, + [](c10::DeviceIndex deviceid) -> NPUDeviceMem* { return GetDeviceMemories(deviceid); }, py::return_value_policy::reference); } @@ -230,9 +224,9 @@ static PyObject* THNPModule_initExtension(PyObject* self, PyObject* noargs) { throw python_error(); } }; - auto num_npus = c10_npu::device_count(); + c10::DeviceIndex num_npus = c10_npu::device_count(); auto default_npu_generators = PyTuple_New(static_cast(num_npus)); - for (int i = 0; i < num_npus; i++) { + for (c10::DeviceIndex i = 0; i < num_npus; i++) { auto gen = at_npu::detail::getDefaultNPUGenerator(i); auto cast_gen = (THPGenerator*)THPGenerator_initDefaultGenerator(gen); // This reference is meant to be given away, so no need to incref here. 
@@ -259,13 +253,13 @@ void THNPModule_setDevice(int device) { PyObject* THNPModule_setDevice_wrap(PyObject* self, PyObject* arg) { HANDLE_TH_ERRORS - int device = THPUtils_unpackLong(arg); + c10::DeviceIndex device = THPUtils_unpackDeviceIndex(arg); { pybind11::gil_scoped_release no_gil; at::globalContext().lazyInitPrivateUse1(); } - int pre_device = 0; + c10::DeviceIndex pre_device = 0; auto ret = c10_npu::GetDevice(&pre_device); if (ret != ACL_ERROR_NONE) { NPU_CHECK_ERROR(c10_npu::SetDevice(device)); @@ -279,22 +273,16 @@ PyObject* THNPModule_setDevice_wrap(PyObject* self, PyObject* arg) { PyObject* THNPModule_getDevice_wrap(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS - int device; + c10::DeviceIndex device; torch::utils::device_lazy_init(at::kPrivateUse1); NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); - return PyLong_FromLong(device); + return THPUtils_packInt32(device); END_HANDLE_TH_ERRORS } PyObject* THNPModule_getDeviceCount_wrap(PyObject* self, PyObject* noargs) { HANDLE_TH_ERRORS - return PyLong_FromLong(c10_npu::device_count()); - END_HANDLE_TH_ERRORS -} - -PyObject* THNPModule_getLocalDevice_wrap(PyObject* self, PyObject* noargs) { - HANDLE_TH_ERRORS - return PyLong_FromLong(c10_npu::GetLocalDevice()); + return THPUtils_packInt32(c10_npu::device_count()); END_HANDLE_TH_ERRORS } @@ -308,53 +296,14 @@ PyObject* THNPModule_npuCanDeviceAccessPeer_wrap( throw torch::TypeError( "Pybind failed to parse parameters." 
+ PTA_ERROR(ErrCode::TYPE)); } - int32_t device_id = THPUtils_unpackInt(value_1); - int32_t peer_device_id = THPUtils_unpackInt(value_2); + c10::DeviceIndex device_id = THPUtils_unpackDeviceIndex(value_1); + c10::DeviceIndex peer_device_id = THPUtils_unpackDeviceIndex(value_2); auto can_access_peer = c10_npu::acl::can_device_access_peer(device_id, peer_device_id); return PyBool_FromLong(can_access_peer); END_HANDLE_TH_ERRORS } -PyObject* THNPModule_getDeviceUtilizationRate_wrap( - PyObject* self, - PyObject* device_index) { - HANDLE_TH_ERRORS - TORCH_CHECK( - THPUtils_checkLong(device_index), - "invalid argument to getDeviceUtilizationRate", - PTA_ERROR(ErrCode::VALUE)); - int32_t device = static_cast(THPUtils_unpackUInt32(device_index)); - aclrtUtilizationInfo util_info; - util_info.cubeUtilization = 0; - util_info.vectorUtilization = 0; - util_info.utilizationExtend = nullptr; - NPU_CHECK_ERROR( - c10_npu::acl::AclrtGetDeviceUtilizationRate(device, &util_info)); - int32_t cube = util_info.cubeUtilization; - int32_t vector = util_info.vectorUtilization; - int32_t util_rate = 0; - // 如果vector和cube谁支持,就返回谁的使用率,如果都支持计算(vector*1+cube*1)/2 - if (cube == DEVICE_UTILIZATION_NOT_SUPPORT && - vector != DEVICE_UTILIZATION_NOT_SUPPORT) { - util_rate = vector; - } else if ( - cube != DEVICE_UTILIZATION_NOT_SUPPORT && - vector == DEVICE_UTILIZATION_NOT_SUPPORT) { - util_rate = cube; - } else if ( - cube != DEVICE_UTILIZATION_NOT_SUPPORT && - vector != DEVICE_UTILIZATION_NOT_SUPPORT) { - util_rate = (cube + vector) / 2; - } - TORCH_CHECK( - util_rate <= 100 && util_rate >= 0, - "invalid result to util_rate", - PTA_ERROR(ErrCode::VALUE)); - return PyLong_FromLong(util_rate); - END_HANDLE_TH_ERRORS -} - PyObject* THNPModule_getCurrentStream_wrap( PyObject* /* unused */, PyObject* device_index) { @@ -363,7 +312,7 @@ PyObject* THNPModule_getCurrentStream_wrap( THPUtils_checkLong(device_index), "invalid argument to getCurrentStream", PTA_ERROR(ErrCode::PARAM)); - int64_t device = 
THPUtils_unpackLong(device_index); + c10::DeviceIndex device = THPUtils_unpackDeviceIndex(device_index); auto stream = c10_npu::getCurrentNPUStream(device); PyObject* output_tuple = PyTuple_New(3); PyTuple_SetItem( @@ -388,7 +337,7 @@ PyObject* THNPModule_getDefaultStream_wrap( THPUtils_checkLong(device_index), "invalid argument to getDefaultStream", PTA_ERROR(ErrCode::PARAM)); - int64_t device = THPUtils_unpackLong(device_index); + c10::DeviceIndex device = THPUtils_unpackDeviceIndex(device_index); auto stream = c10_npu::getDefaultNPUStream(device); PyObject* output_tuple = PyTuple_New(3); PyTuple_SetItem( @@ -428,9 +377,11 @@ PyObject* THNPModule_setStream_wrap( } auto stream = c10_npu::NPUStream::unpack3( - stream_id, device_index, static_cast(device_type)); + stream_id, + static_cast(device_index), + static_cast(device_type)); - int device; + c10::DeviceIndex device; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); if (device != stream.device_index()) { THNPModule_setDevice(stream.device_index()); @@ -1064,10 +1015,6 @@ static struct PyMethodDef THNPModule_methods[] = { (PyCFunction)THNPModule_getDevice_wrap, METH_NOARGS, nullptr}, - {"_npu_getLocalDevice", - (PyCFunction)THNPModule_getLocalDevice_wrap, - METH_NOARGS, - nullptr}, {"_npu_getDeviceCount", (PyCFunction)THNPModule_getDeviceCount_wrap, METH_NOARGS, @@ -1076,10 +1023,6 @@ static struct PyMethodDef THNPModule_methods[] = { (PyCFunction)THNPModule_npuCanDeviceAccessPeer_wrap, METH_VARARGS, nullptr}, - {"_npu_getDeviceUtilizationRate", - (PyCFunction)THNPModule_getDeviceUtilizationRate_wrap, - METH_O, - nullptr}, {"_npu_getCurrentStream", (PyCFunction)THNPModule_getCurrentStream_wrap, METH_O, diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp index 7d5764b1..4f1122f1 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp @@ -8,7 +8,7 @@ namespace torch::npu::NPUPluggableAllocator { -int device_count 
= 0; +c10::DeviceIndex device_count = 0; void custom_raw_deleter(void* ptr); @@ -17,7 +17,7 @@ _AllocationMetadata::_AllocationMetadata() _AllocationMetadata::_AllocationMetadata( size_t size, - int device_idx, + c10::DeviceIndex device_idx, aclrtStream stream) : size(size), device_idx(device_idx), stream(stream) {} @@ -70,7 +70,7 @@ void NPUPluggableAllocator::set_erase_stream_fn( void* NPUPluggableAllocator::malloc( size_t size, - int device, + c10::DeviceIndex device, aclrtStream stream) { void* r = alloc_fn_(size, device, stream); { @@ -81,7 +81,7 @@ void* NPUPluggableAllocator::malloc( } c10::DataPtr NPUPluggableAllocator::allocate(size_t size) { - int device = -1; + c10::DeviceIndex device = -1; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); aclrtStream stream = c10_npu::getCurrentNPUStreamNoWait(device); void* r = this->malloc(size, device, stream); @@ -89,8 +89,7 @@ c10::DataPtr NPUPluggableAllocator::allocate(size_t size) { r, r, raw_deleter(), - c10::Device( - c10::DeviceType::PrivateUse1, static_cast(device))}; + c10::Device(c10::DeviceType::PrivateUse1, device)}; return data_ptr; } @@ -99,7 +98,7 @@ c10::DeleterFnPtr NPUPluggableAllocator::raw_deleter() const { } void* NPUPluggableAllocator::raw_alloc(size_t nbytes) { - int device = -1; + c10::DeviceIndex device = -1; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); aclrtStream stream = c10_npu::getCurrentNPUStreamNoWait(device); return malloc(nbytes, device, stream); @@ -108,14 +107,14 @@ void* NPUPluggableAllocator::raw_alloc(size_t nbytes) { void* NPUPluggableAllocator::raw_alloc_with_stream( size_t nbytes, aclrtStream stream) { - int device = -1; + c10::DeviceIndex device = -1; NPU_CHECK_ERROR(c10_npu::GetDevice(&device)); return malloc(nbytes, device, stream); } void NPUPluggableAllocator::raw_delete(void* ptr) { aclrtStream stream{}; - int device_idx = -1; + c10::DeviceIndex device_idx = -1; size_t size = 0; { const std::lock_guard lock(allocator_mutex_); diff --git 
a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h index 49ec440d..44e1f727 100644 --- a/torch_npu/csrc/npu/NPUPluggableAllocator.h +++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h @@ -23,9 +23,9 @@ void changeCurrentAllocator( struct _AllocationMetadata { _AllocationMetadata(); - _AllocationMetadata(size_t size, int device_idx, aclrtStream stream); + _AllocationMetadata(size_t size, c10::DeviceIndex device_idx, aclrtStream stream); size_t size; - int device_idx; + c10::DeviceIndex device_idx; aclrtStream stream; }; @@ -46,7 +46,7 @@ struct NPUPluggableAllocator std::function record_stream_fn); void set_erase_stream_fn( std::function erase_stream_fn); - void* malloc(size_t size, int device, aclrtStream stream); + void* malloc(size_t size, c10::DeviceIndex device, aclrtStream stream); c10::DataPtr allocate(size_t size) override; c10::DeleterFnPtr raw_deleter() const override; @@ -79,8 +79,8 @@ struct NPUPluggableAllocator c10_npu::NPUCachingAllocator::OutOfMemoryObserver observer) override; protected: - std::function alloc_fn_; - std::function free_fn_; + std::function alloc_fn_; + std::function free_fn_; std::function init_fn_; std::function reset_fn_; std::function memory_fraction_fn_; diff --git a/torch_npu/csrc/npu/Stream.cpp b/torch_npu/csrc/npu/Stream.cpp index eeccec88..aa082b39 100644 --- a/torch_npu/csrc/npu/Stream.cpp +++ b/torch_npu/csrc/npu/Stream.cpp @@ -19,7 +19,7 @@ static PyObject* THNPStream_pynew( PyObject* kwargs) { HANDLE_TH_ERRORS - int current_device; + c10::DeviceIndex current_device; NPU_CHECK_ERROR(c10_npu::GetDevice(¤t_device)); int priority = 0; diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py index f76cdbe2..741b6911 100644 --- a/torch_npu/npu/__init__.py +++ b/torch_npu/npu/__init__.py @@ -21,7 +21,6 @@ "set_sync_debug_mode", "get_sync_debug_mode", "init_dump", - "utilization", "finalize_dump", "manual_seed", "manual_seed_all", @@ -111,7 +110,6 @@ get_sync_debug_mode, init_dump, 
is_bf16_supported, - utilization, finalize_dump, set_dump, get_npu_overflow_flag, diff --git a/torch_npu/npu/device.py b/torch_npu/npu/device.py index de71a8d0..841a649a 100644 --- a/torch_npu/npu/device.py +++ b/torch_npu/npu/device.py @@ -10,7 +10,7 @@ from torch_npu.utils.error_code import ErrCode, pta_error __all__ = ["synchronize", "device_count", "can_device_access_peer", "set_device", "current_device", "get_device_name", - "get_device_properties", "mem_get_info", "get_device_capability", "utilization", "device", "device_of", + "get_device_properties", "mem_get_info", "get_device_capability", "device", "device_of", "_get_device_index"] @@ -87,16 +87,6 @@ def get_device_capability(device=None): return None -def utilization(device=None): - r"""Query the comprehensive utilization rate of device - """ - device_id = _get_device_index(device, optional=True) - if device_id < 0 or device_id >= device_count(): - raise AssertionError("Invalid device id" + pta_error(ErrCode.VALUE)) - torch_npu.npu._lazy_init() - return torch_npu._C._npu_getDeviceUtilizationRate(device_id) - - class device(object): r"""Context-manager that changes the selected device. 
diff --git a/torch_npu/npu/utils.py b/torch_npu/npu/utils.py index 8399c096..a2c5b03c 100644 --- a/torch_npu/npu/utils.py +++ b/torch_npu/npu/utils.py @@ -17,14 +17,13 @@ get_device_properties, get_device_capability, _get_device_index, - utilization, device, device_of, mem_get_info, ) __all__ = ["synchronize", "device_count", "can_device_access_peer", "set_device", "current_device", "get_device_name", - "get_device_properties", "mem_get_info", "get_device_capability", "utilization", "device", "device_of", + "get_device_properties", "mem_get_info", "get_device_capability", "device", "device_of", "stream", "set_stream", "current_stream", "default_stream", "set_sync_debug_mode", "get_sync_debug_mode", "init_dump", "set_dump", "finalize_dump", "get_soc_version", "is_support_inf_nan", "is_bf16_supported", "get_npu_overflow_flag", "npu_check_overflow", "clear_npu_overflow_flag", "current_blas_handle"] diff --git a/torch_npu/utils/error_code.py b/torch_npu/utils/error_code.py index 23a8a4be..61dde1e7 100644 --- a/torch_npu/utils/error_code.py +++ b/torch_npu/utils/error_code.py @@ -45,13 +45,6 @@ def msg(self): def _format_error_msg(submodule, error_code): - def get_device_id(): - try: - import torch_npu._C - return torch_npu._C._npu_getLocalDevice() - except Exception: - return -1 - def get_rank_id(): try: import torch.distributed as dist @@ -60,12 +53,11 @@ def get_rank_id(): except Exception: return -1 - error_msg = "\n[ERROR] {time} (PID:{pid}, Device:{device}, RankID:{rank}) {error_code} {submodule_name} {error_code_msg}" + error_msg = "\n[ERROR] {time} (PID:{pid}, RankID:{rank}) {error_code} {submodule_name} {error_code_msg}" return error_msg.format( time=time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()), pid=os.getpid(), - device=get_device_id(), rank=get_rank_id(), error_code="ERR{:0>2d}{:0>3d}".format(submodule.value, error_code.code,), submodule_name=submodule.name,