Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Device][Backend] refactor: extract methods and deviceguard #4

Merged
merged 16 commits into from
Jul 17, 2024
4 changes: 4 additions & 0 deletions .github/workflows/ci_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,10 @@ on:
types:
- 'published'

concurrency:
group: '${{ github.workflow }}-${{ github.event.pull_request.head.label || github.head_ref || github.ref }}'
cancel-in-progress: ${{ !contains(github.ref, 'release/')}}

jobs:
build:
name: Build for Python${{ matrix.python-version }}
Expand Down
Empty file removed csrc/core/.keep
Empty file.
149 changes: 149 additions & 0 deletions csrc/core/PrivateUse1Guard.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
#pragma once

#include <c10/core/DeviceType.h>
#include <c10/core/impl/InlineDeviceGuard.h>
#include <c10/core/impl/InlineStreamGuard.h>

#include "csrc/core/impl/PrivateUse1GuardImpl.h"

namespace c10_backend {

// This code is kind of boilerplatey. See Note [Whither the DeviceGuard
// boilerplate]

/// A DeviceGuard variant specialized for PrivateUse1, parameterized on the
/// guard implementation type `T`. It accepts plain integer indices
/// (interpreting them as PrivateUse1 devices) and forwards every operation to
/// an embedded c10::impl::InlineDeviceGuard<T>. It can only be used from code
/// that links against the PrivateUse1 backend directly.
template <typename T>
struct PrivateUse1Guard {
  /// A guard is always bound to a device, so default construction is
  /// disabled; see Note [Omitted default constructor from RAII]
  explicit PrivateUse1Guard() = delete;

  /// Makes the PrivateUse1 device with the given index the current device.
  explicit PrivateUse1Guard(c10::DeviceIndex device_index)
      : guard_(device_index) {}

  /// Makes the given device the current PrivateUse1 device. Errors if the
  /// passed device is not a PrivateUse1 device.
  explicit PrivateUse1Guard(c10::Device device) : guard_(device) {}

  // Not copyable.
  PrivateUse1Guard(const PrivateUse1Guard&) = delete;
  PrivateUse1Guard& operator=(const PrivateUse1Guard&) = delete;

  // Not movable either (there is no uninitialized state to move into).
  PrivateUse1Guard(PrivateUse1Guard&&) = delete;
  PrivateUse1Guard& operator=(PrivateUse1Guard&&) = delete;

  /// Switches the PrivateUse1 device to `device`. Errors if the given device
  /// is not a PrivateUse1 device.
  void set_device(c10::Device device) {
    guard_.set_device(device);
  }

  /// Same contract as set_device: switches the PrivateUse1 device to
  /// `device` and errors if it is not a PrivateUse1 device. (Provided for
  /// uniformity with DeviceGuard.)
  void reset_device(c10::Device device) {
    guard_.reset_device(device);
  }

  /// Switches the PrivateUse1 device to the one with the given index.
  void set_index(c10::DeviceIndex device_index) {
    guard_.set_index(device_index);
  }

  /// The device that was set upon construction of the guard.
  c10::Device original_device() const {
    return guard_.original_device();
  }

  /// The last device that was set via `set_device`, if any; otherwise the
  /// device passed during construction.
  c10::Device current_device() const {
    return guard_.current_device();
  }

 private:
  /// Underlying inline guard that every call above is forwarded to.
  c10::impl::InlineDeviceGuard<T> guard_;
};

/// An OptionalDeviceGuard variant specialized for PrivateUse1, parameterized
/// on the guard implementation type `T`. All operations are forwarded to an
/// embedded c10::impl::InlineOptionalDeviceGuard<T>. See PrivateUse1Guard for
/// when you can use this.
template <typename T>
struct OptionalPrivateUse1Guard {
  /// Creates an uninitialized OptionalPrivateUse1Guard.
  explicit OptionalPrivateUse1Guard() : guard_() {}

  /// Makes the passed Device the current PrivateUse1 device, if it is not
  /// nullopt.
  explicit OptionalPrivateUse1Guard(std::optional<c10::Device> device_opt)
      : guard_(device_opt) {}

  /// Makes the device with the passed index the current PrivateUse1 device,
  /// if it is not nullopt.
  explicit OptionalPrivateUse1Guard(
      std::optional<c10::DeviceIndex> device_index_opt)
      : guard_(device_index_opt) {}

  // Not copyable.
  OptionalPrivateUse1Guard(const OptionalPrivateUse1Guard&) = delete;
  OptionalPrivateUse1Guard& operator=(const OptionalPrivateUse1Guard&) = delete;

  // Not movable; see Note [Move construction for RAII guards is tricky] and
  // Note [Move assignment for RAII guards is tricky]
  OptionalPrivateUse1Guard(OptionalPrivateUse1Guard&&) = delete;
  OptionalPrivateUse1Guard& operator=(OptionalPrivateUse1Guard&&) = delete;

  /// Switches the PrivateUse1 device to `device`, initializing the guard if
  /// it is not already initialized. Errors if the given device is not a
  /// PrivateUse1 device.
  void set_device(c10::Device device) {
    guard_.set_device(device);
  }

  /// Same contract as set_device: switches the PrivateUse1 device to
  /// `device`, initializing the guard if needed, and errors if the device is
  /// not a PrivateUse1 device. (Provided for uniformity with
  /// OptionalDeviceGuard.)
  void reset_device(c10::Device device) {
    guard_.reset_device(device);
  }

  /// Switches the PrivateUse1 device to the one with the given index,
  /// initializing the guard if it is not already initialized.
  void set_index(c10::DeviceIndex device_index) {
    guard_.set_index(device_index);
  }

  /// The device that was set immediately prior to initialization of the
  /// guard, or nullopt if the guard is uninitialized.
  std::optional<c10::Device> original_device() const {
    return guard_.original_device();
  }

  /// The most recent device set through this guard (at construction or via
  /// set_device) if the guard is initialized, or nullopt if it is not.
  std::optional<c10::Device> current_device() const {
    return guard_.current_device();
  }

  /// Restores the original PrivateUse1 device and returns this guard to the
  /// uninitialized state.
  void reset() {
    guard_.reset();
  }

 private:
  /// Underlying inline optional guard that every call above is forwarded to.
  c10::impl::InlineOptionalDeviceGuard<T> guard_;
};

} // namespace c10_backend
27 changes: 27 additions & 0 deletions csrc/core/impl/PrivateUse1GuardImpl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#pragma once

#include <c10/core/DeviceType.h>
#include <c10/core/impl/InlineDeviceGuard.h>
#include <c10/macros/Macros.h>

namespace c10_backend::impl {

/// Base for PrivateUse1 device-guard implementations: it fixes the device
/// type reported through c10::impl::DeviceGuardImplInterface to
/// c10::DeviceType::PrivateUse1 and leaves the rest of the interface to the
/// derived class.
///
/// All classes which inherit from PrivateUse1GuardImpl should be declared
/// 'final'.
struct PrivateUse1GuardImpl : public c10::impl::DeviceGuardImplInterface {
  /// Device type served by this family of guard implementations.
  static constexpr c10::DeviceType static_type = c10::DeviceType::PrivateUse1;

  PrivateUse1GuardImpl() = default;

  /// Constructs the impl for device type `t`; asserts that `t` is
  /// PrivateUse1.
  explicit PrivateUse1GuardImpl(c10::DeviceType t) {
    TORCH_INTERNAL_ASSERT(t == c10::DeviceType::PrivateUse1);
  }

  /// Always reports PrivateUse1; marked final so derived classes cannot
  /// change it.
  c10::DeviceType type() const final {
    return static_type;
  }
};

} // namespace c10_backend::impl
36 changes: 30 additions & 6 deletions csrc/npu/NPUFunctions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <mutex>
#include <unordered_map>
#include "csrc/npu/NPUStream.h"
#include "npu/core/NPUDevice.h"
#include "npu/core/register/OptionsManager.h"

namespace c10_npu {
Expand All @@ -15,12 +16,7 @@ std::mutex mtx;
// Returns the number of NPU devices as a c10::DeviceIndex. While the cached
// dev_count is still 0 the device runtime is queried; otherwise the cached
// value is returned directly.
// NOTE(review): this span comes from a rendered diff, so the removed inline
// aclrtGetDeviceCount path and the added device_count_impl call appear
// together. Presumably only the device_count_impl return remains inside the
// `if` after the change — confirm against the merged file.
c10::DeviceIndex device_count() noexcept {
// initialize number of devices only once
if (dev_count == 0) {
aclError error = aclrtGetDeviceCount(&dev_count);
if (error != ACL_ERROR_NONE) {
ASCEND_LOGE("get device count of NPU failed");
return 0;
}
return static_cast<c10::DeviceIndex>(dev_count);
return c10_npu::acl::device_count_impl(&dev_count);
}
// dev_count is already non-zero: reuse it without querying again.
return static_cast<c10::DeviceIndex>(dev_count);
}
Expand Down Expand Up @@ -184,4 +180,32 @@ int GetLocalDevice() {
return local_device;
}

int32_t GetDeviceUtilizationRate(int32_t device) {
FFFrog marked this conversation as resolved.
Show resolved Hide resolved
aclrtUtilizationInfo util_info;
util_info.cubeUtilization = 0;
util_info.vectorUtilization = 0;
util_info.utilizationExtend = nullptr;
NPU_CHECK_ERROR(
c10_npu::acl::AclrtGetDeviceUtilizationRate(device, &util_info));
int32_t cube = util_info.cubeUtilization;
int32_t vector = util_info.vectorUtilization;
int32_t util_rate = 0;

// If either vector or cube supports, return its utilization rate.
// If both support, return the average rate: (vector + cube) / 2.
if (cube == DEVICE_UTILIZATION_NOT_SUPPORT &&
vector != DEVICE_UTILIZATION_NOT_SUPPORT) {
util_rate = vector;
} else if (
cube != DEVICE_UTILIZATION_NOT_SUPPORT &&
vector == DEVICE_UTILIZATION_NOT_SUPPORT) {
util_rate = cube;
} else if (
cube != DEVICE_UTILIZATION_NOT_SUPPORT &&
vector != DEVICE_UTILIZATION_NOT_SUPPORT) {
util_rate = (cube + vector) / 2;
}
return util_rate;
}

} // namespace c10_npu
4 changes: 3 additions & 1 deletion csrc/npu/NPUFunctions.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,9 @@ C10_NPU_API void device_synchronize();

C10_NPU_API int ExchangeDevice(int device);

int GetLocalDevice();
C10_NPU_API int GetLocalDevice();

C10_NPU_API int32_t GetDeviceUtilizationRate(int32_t device);

enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR };

Expand Down
2 changes: 0 additions & 2 deletions csrc/npu/NPUGuardImpl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ namespace c10_npu {

namespace impl {

constexpr c10::DeviceType NPUGuardImpl::static_type;

C10_REGISTER_GUARD_IMPL(PrivateUse1, NPUGuardImpl);

#define REGISTER_PRIVATEUSE1_BACKEND(name) \
Expand Down
16 changes: 6 additions & 10 deletions csrc/npu/NPUGuardImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,25 @@
#include <npu/acl/include/acl/acl_base.h>
#include <npu/acl/include/acl/acl_rt.h>
#include "csrc/aten/generated/NPUNativeFunctions.h"
#include "npu/core/NPUException.h"
#include "csrc/core/impl/PrivateUse1GuardImpl.h"
#include "csrc/npu/NPUFunctions.h"
#include "csrc/npu/NPUStream.h"
#include "npu/core/NPUException.h"
#include "npu/core/interface/AsyncTaskQueueInterface.h"
#include "npu/core/sys_ctrl/npu_sys_ctrl.h"

namespace c10_npu {
namespace impl {
struct NPUGuardImpl final : public c10_backend::impl::PrivateUse1GuardImpl {
NPUGuardImpl() = default;

struct NPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
static constexpr c10::DeviceType static_type = c10::DeviceType::PrivateUse1;

NPUGuardImpl() {}
explicit NPUGuardImpl(c10::DeviceType t) {
TORCH_INTERNAL_ASSERT(
t == c10::DeviceType::PrivateUse1,
"DeviceType must be NPU. Actual DeviceType is: ",
t == static_type,
"DeviceType must be 'c10::DeviceType::PrivateUse1'. Actual DeviceType is: ",
t,
PTA_ERROR(ErrCode::PARAM));
}
c10::DeviceType type() const override {
return c10::DeviceType::PrivateUse1;
}
c10::Device exchangeDevice(c10::Device d) const override {
FFFrog marked this conversation as resolved.
Show resolved Hide resolved
TORCH_INTERNAL_ASSERT(
d.type() == c10::DeviceType::PrivateUse1,
Expand Down
4 changes: 1 addition & 3 deletions csrc/npu/THNPUCachingHostAllocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -353,9 +353,7 @@ at::Allocator* getTHNPUCachingHostAllocator() {

c10::Allocator* getPinnedMemoryAllocator() {
C10_LOG_API_USAGE_ONCE("aten.init.npu");
c10_npu::NpuSysCtrl::SysStatus status =
c10_npu::NpuSysCtrl::GetInstance().Initialize();
if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) {
if (!c10_npu::NpuSysCtrl::IsInitializeSuccess()) {
ASCEND_LOGE("Npu init fail.");
}
return getTHNPUCachingHostAllocator();
Expand Down
8 changes: 2 additions & 6 deletions npu/core/DeviceUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,7 @@ inline c10::DeviceType get_npu_device_type() {

inline void maybe_initialize_npu(const at::TensorOptions& options) {
if (torch_npu::utils::is_npu(options)) {
c10_npu::NpuSysCtrl::SysStatus status =
c10_npu::NpuSysCtrl::GetInstance().Initialize(options.device().index());
if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) {
if (!c10_npu::NpuSysCtrl::IsInitializeSuccess(options.device().index())) {
TORCH_CHECK(
false,
"npu device ",
Expand All @@ -63,9 +61,7 @@ inline void maybe_initialize_npu(const at::TensorOptions& options) {

inline void maybe_initialize_npu(const at::Device& device) {
if (torch_npu::utils::is_npu(device)) {
c10_npu::NpuSysCtrl::SysStatus status =
c10_npu::NpuSysCtrl::GetInstance().Initialize(device.index());
if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) {
if (!c10_npu::NpuSysCtrl::IsInitializeSuccess(device.index())) {
TORCH_CHECK(
false,
"npu device ",
Expand Down
16 changes: 16 additions & 0 deletions npu/core/NPUDevice.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#include "NPUDevice.h"
#include "npu/core/npu_log.h"

namespace c10_npu::acl {

/// Asks the ACL runtime for the device count via aclrtGetDeviceCount, which
/// writes its result through `count`.
///
/// @param count Output location handed to aclrtGetDeviceCount.
/// @return `*count` converted to c10::DeviceIndex when the call reports
///         ACL_ERROR_NONE; otherwise 0, after logging the failure.
c10::DeviceIndex device_count_impl(uint32_t* count) noexcept {
  const aclError status = aclrtGetDeviceCount(count);
  if (status == ACL_ERROR_NONE) {
    return static_cast<c10::DeviceIndex>(*count);
  }

  ASCEND_LOGE("get device count of NPU failed");
  return 0;
}

} // namespace c10_npu::acl
10 changes: 10 additions & 0 deletions npu/core/NPUDevice.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#pragma once

#include <c10/core/Device.h>
#include <npu/acl/include/acl/acl.h>

namespace c10_npu::acl {

/// Queries the number of NPU devices through aclrtGetDeviceCount, storing the
/// raw result in `*count`.
///
/// @param count Output location passed to aclrtGetDeviceCount.
/// @return `*count` converted to c10::DeviceIndex on success; 0 (after logging
///         an error) when the ACL call does not return ACL_ERROR_NONE.
c10::DeviceIndex device_count_impl(uint32_t* count) noexcept;

} // namespace c10_npu::acl
Loading