diff --git a/.github/workflows/ci_build.yml b/.github/workflows/ci_build.yml
index 5843f686..eb6e48cc 100644
--- a/.github/workflows/ci_build.yml
+++ b/.github/workflows/ci_build.yml
@@ -34,9 +34,13 @@ on:
     types:
       - 'published'
 
+concurrency:
+  group: '${{ github.workflow }}-${{ github.event.pull_request.head.label || github.head_ref || github.ref }}'
+  cancel-in-progress: ${{ !contains(github.ref, 'release/')}}
+
 jobs:
   build:
-    name: Build for Python${{ matrix.python-version }}
+    name: Build with Python${{ matrix.python-version }}
     runs-on: ubuntu-latest
     defaults:
       run:
diff --git a/cmake/Dependencies.cmake b/cmake/Dependencies.cmake
index 3c081fce..9be06e9b 100644
--- a/cmake/Dependencies.cmake
+++ b/cmake/Dependencies.cmake
@@ -1,5 +1,6 @@
 # ---[ Googletest
 if(BUILD_TEST)
+  enable_testing()
   add_subdirectory(${PROJECT_SOURCE_DIR}/third_party/googletest)
   include_directories(BEFORE SYSTEM ${PROJECT_SOURCE_DIR}/third_party/googletest/googletest/include)
   include_directories(BEFORE SYSTEM ${PROJECT_SOURCE_DIR}/third_party/googletest/googlemock/include)
diff --git a/csrc/core/.keep b/csrc/core/.keep
deleted file mode 100644
index e69de29b..00000000
diff --git a/csrc/core/PrivateUse1Guard.h b/csrc/core/PrivateUse1Guard.h
new file mode 100644
index 00000000..066e641a
--- /dev/null
+++ b/csrc/core/PrivateUse1Guard.h
@@ -0,0 +1,146 @@
+#pragma once
+
+#include <c10/core/DeviceType.h>
+#include <c10/core/impl/InlineDeviceGuard.h>
+#include <c10/core/impl/InlineStreamGuard.h>
+
+#include "csrc/core/impl/PrivateUse1GuardImpl.h"
+
+namespace c10_backend {
+
+// This code is kind of boilerplatey.  See Note [Whither the DeviceGuard
+// boilerplate]
+
+/// A variant of DeviceGuard that is specialized for PrivateUse1.  It accepts
+/// integer indices (interpreting them as PrivateUse1 devices) and is a little
+/// more efficient than DeviceGuard (it compiles to straight line set/get
+/// device calls on the guard implementation `T`); however, it can only be
+/// used from code that links against the PrivateUse1 backend directly.
+template <typename T>
+struct PrivateUse1Guard {
+  /// No default constructor; see Note [Omitted default constructor from RAII]
+  explicit PrivateUse1Guard() = delete;
+
+  /// Set the current PrivateUse1 device to the passed device index.
+  explicit PrivateUse1Guard(c10::DeviceIndex device_index)
+      : guard_(device_index) {}
+
+  /// Sets the current PrivateUse1 device to the passed device.  Errors if the
+  /// passed device is not a PrivateUse1 device.
+  explicit PrivateUse1Guard(c10::Device device) : guard_(device) {}
+
+  // Copy is not allowed
+  PrivateUse1Guard(const PrivateUse1Guard&) = delete;
+  PrivateUse1Guard& operator=(const PrivateUse1Guard&) = delete;
+
+  // Move is not allowed (there is no uninitialized state)
+  PrivateUse1Guard(PrivateUse1Guard&& other) = delete;
+  PrivateUse1Guard& operator=(PrivateUse1Guard&& other) = delete;
+
+  /// Sets the PrivateUse1 device to the given device.  Errors if the given
+  /// device is not a PrivateUse1 device.
+  void set_device(c10::Device device) {
+    guard_.set_device(device);
+  }
+
+  /// Sets the PrivateUse1 device to the given device.  Errors if the given
+  /// device is not a PrivateUse1 device.  (This method is provided for
+  /// uniformity with DeviceGuard).
+  void reset_device(c10::Device device) {
+    guard_.reset_device(device);
+  }
+
+  /// Sets the PrivateUse1 device to the given device index.
+  void set_index(c10::DeviceIndex device_index) {
+    guard_.set_index(device_index);
+  }
+
+  /// Returns the device that was set upon construction of the guard.
+  c10::Device original_device() const {
+    return guard_.original_device();
+  }
+
+  /// Returns the last device that was set via `set_device`, if any, otherwise
+  /// the device passed during construction.
+  c10::Device current_device() const {
+    return guard_.current_device();
+  }
+
+ private:
+  /// The underlying inline guard; every method above forwards to it.
+  c10::impl::InlineDeviceGuard<T> guard_;
+};
+
+/// A variant of OptionalDeviceGuard that is specialized for PrivateUse1 and
+/// parameterized on guard impl `T`.  See PrivateUse1Guard for when to use it.
+template <typename T>
+struct OptionalPrivateUse1Guard {
+  /// Create an uninitialized OptionalPrivateUse1Guard.
+  explicit OptionalPrivateUse1Guard() : guard_() {}
+
+  /// Set the current PrivateUse1 device to the passed Device, if it is not
+  /// nullopt.
+  explicit OptionalPrivateUse1Guard(std::optional<c10::Device> device_opt)
+      : guard_(device_opt) {}
+
+  /// Set the current PrivateUse1 device to the passed device index, if it is
+  /// not nullopt.
+  explicit OptionalPrivateUse1Guard(
+      std::optional<c10::DeviceIndex> device_index_opt)
+      : guard_(device_index_opt) {}
+
+  // Copy is not allowed
+  OptionalPrivateUse1Guard(const OptionalPrivateUse1Guard&) = delete;
+  OptionalPrivateUse1Guard& operator=(const OptionalPrivateUse1Guard&) = delete;
+
+  // See Note [Move construction for RAII guards is tricky]
+  OptionalPrivateUse1Guard(OptionalPrivateUse1Guard&& other) = delete;
+  // See Note [Move assignment for RAII guards is tricky]
+  OptionalPrivateUse1Guard& operator=(OptionalPrivateUse1Guard&& other) =
+      delete;
+
+  /// Sets the PrivateUse1 device to the given device, initializing the guard if
+  /// it is not already initialized.  Errors if the given device is not a
+  /// PrivateUse1 device.
+  void set_device(c10::Device device) {
+    guard_.set_device(device);
+  }
+
+  /// Sets the PrivateUse1 device to the given device, initializing the guard if
+  /// it is not already initialized.  Errors if the given device is not a
+  /// PrivateUse1 device. (This method is provided for uniformity with
+  /// OptionalDeviceGuard).
+  void reset_device(c10::Device device) {
+    guard_.reset_device(device);
+  }
+
+  /// Sets the PrivateUse1 device to the given device index, initializing the
+  /// guard if it is not already initialized.
+  void set_index(c10::DeviceIndex device_index) {
+    guard_.set_index(device_index);
+  }
+
+  /// Returns the device that was set immediately prior to initialization of the
+  /// guard, or nullopt if the guard is uninitialized.
+  std::optional<c10::Device> original_device() const {
+    return guard_.original_device();
+  }
+
+  /// Returns the most recent device that was set using this device guard,
+  /// either from construction, or via set_device, if the guard is initialized,
+  /// or nullopt if the guard is uninitialized.
+  std::optional<c10::Device> current_device() const {
+    return guard_.current_device();
+  }
+
+  /// Restore the original PrivateUse1 device, resetting this guard to
+  /// uninitialized state.
+  void reset() {
+    guard_.reset();
+  }
+
+ private:
+  c10::impl::InlineOptionalDeviceGuard<T> guard_;
+};
+
+} // namespace c10_backend
diff --git a/csrc/core/impl/PrivateUse1GuardImpl.h b/csrc/core/impl/PrivateUse1GuardImpl.h
new file mode 100644
index 00000000..37c1be5f
--- /dev/null
+++ b/csrc/core/impl/PrivateUse1GuardImpl.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <c10/core/DeviceType.h>
+#include <c10/core/impl/InlineDeviceGuard.h>
+#include <c10/macros/Macros.h>
+
+namespace c10_backend::impl {
+
+/**
+ * DeviceGuardImplInterface base pinned to DeviceType::PrivateUse1.  Classes
+ * which inherit from PrivateUse1GuardImpl should be declared 'final'.
+ */
+struct PrivateUse1GuardImpl : public c10::impl::DeviceGuardImplInterface {
+  static constexpr c10::DeviceType static_type = c10::DeviceType::PrivateUse1;
+
+  PrivateUse1GuardImpl() = default;
+
+  explicit PrivateUse1GuardImpl(c10::DeviceType t) {
+    TORCH_INTERNAL_ASSERT(t == c10::DeviceType::PrivateUse1);
+  }
+
+  c10::DeviceType type() const final {
+    return c10::DeviceType::PrivateUse1;
+  }
+};
+
+} // namespace c10_backend::impl
diff --git a/csrc/npu/CachingAllocatorHelper.h b/csrc/npu/CachingAllocatorHelper.h
index 11811705..b695aeee 100644
--- a/csrc/npu/CachingAllocatorHelper.h
+++ b/csrc/npu/CachingAllocatorHelper.h
@@ -129,7 +129,7 @@ ExpandableSegment* createExpandableSegment(
     size_t size);
 
 // Wraps the insert event function
-void insertEventWrapper(int device, std::function<void()> insertEventFn);
+void insertEventWrapper(c10::DeviceIndex device, std::function<void()> insertEventFn);
 
 // Returns the current stream for the given device
 void* getCurrentStream(c10::DeviceIndex);
diff --git a/csrc/npu/NPUCachingAllocator.cpp b/csrc/npu/NPUCachingAllocator.cpp
index 8a3b2071..99132b22 100644
--- a/csrc/npu/NPUCachingAllocator.cpp
+++ b/csrc/npu/NPUCachingAllocator.cpp
@@ -128,7 +128,7 @@ struct BlockPool {
 };
 
 struct Block {
-  int device;
+  c10::DeviceIndex device;
   void* stream; // allocation stream
   stream_set stream_uses; // streams on which the block was used
   size_t size; // block size in bytes
@@ -155,7 +155,7 @@ struct Block {
   // memory out from our cache.
   std::shared_ptr<c10::GatheredContext> context_when_segment_allocated;
 
-  Block(int device, void* stream, size_t size, BlockPool* pool, void* ptr)
+  Block(c10::DeviceIndex device, void* stream, size_t size, BlockPool* pool, void* ptr)
       : device(device),
         stream(stream),
         stream_uses(),
@@ -170,7 +170,7 @@ struct Block {
         gc_count(0) {}
 
   // constructor for search key
-  Block(int device, void* stream, size_t size)
+  Block(c10::DeviceIndex device, void* stream, size_t size)
       : device(device),
         stream(stream),
         stream_uses(),
@@ -244,7 +244,7 @@ static std::string format_size(uint64_t size) {
 
 struct AllocParams {
   AllocParams(
-      int device,
+      c10::DeviceIndex device,
       size_t size,
       void* stream,
       BlockPool* pool,
@@ -256,7 +256,7 @@ struct AllocParams {
         block(nullptr),
         err(ACL_ERROR_NONE) {}
 
-  int device() const {
+  c10::DeviceIndex device() const {
     return search_key.device;
   }
   void* stream() const {
@@ -641,7 +641,7 @@ class DeviceCachingAllocator {
   // All public methods (except the above) acquire the allocator mutex.
   // Thus, do not call a public method from another public method.
 
-  Block* malloc(int device, size_t orig_size, void* stream) {
+  Block* malloc(c10::DeviceIndex device, size_t orig_size, void* stream) {
     // done outside the lock because we don't know what locks the recorder needs
     // to have...
     auto context = maybeGatherContext(RecordContext::STATE);
@@ -2012,7 +2012,7 @@ class NpuCachingAllocator : public NPUAllocator {
     return !device_allocator.empty();
   }
   /** allocates a block which is safe to use from the provided stream */
-  void malloc(void** devPtr, int device, size_t size, void* stream) {
+  void malloc(void** devPtr, c10::DeviceIndex device, size_t size, void* stream) {
     Block* block = device_allocator[device]->malloc(device, size, stream);
     add_allocated_block(block);
     *devPtr = static_cast<void*>(block->ptr);
@@ -2060,7 +2060,7 @@ class NpuCachingAllocator : public NPUAllocator {
   }
 
   bool isHistoryEnabled() override {
-    int device = 0;
+    c10::DeviceIndex device = 0;
     NPU_CHECK_ERROR(c10_npu::GetDevice(&device));
     return device_allocator[device]->isHistoryEnabled();
   }
@@ -2154,7 +2154,7 @@ class NpuCachingAllocator : public NPUAllocator {
   }
 
   c10::DataPtr allocate(size_t size) override {
-    int device = 0;
+    c10::DeviceIndex device = 0;
     NPU_CHECK_ERROR(c10_npu::GetDevice(&device));
     void* devPtr = nullptr;
     void (*deleteFunc)(void*) = &local_raw_delete;
@@ -2215,7 +2215,7 @@ class NpuCachingAllocator : public NPUAllocator {
     if (nbytes == 0) {
       return nullptr;
     }
-    int device = 0;
+    c10::DeviceIndex device = 0;
     NPU_CHECK_ERROR(c10_npu::GetDevice(&device));
     void* r = nullptr;
     malloc(&r, device, nbytes, getCurrentStream(device));
@@ -2226,7 +2226,7 @@ class NpuCachingAllocator : public NPUAllocator {
     if (nbytes == 0) {
       return nullptr;
     }
-    int device;
+    c10::DeviceIndex device;
     NPU_CHECK_ERROR(c10_npu::GetDevice(&device));
     void* r = nullptr;
     malloc(&r, device, nbytes, stream);
diff --git a/csrc/npu/NPUFunctions.cpp b/csrc/npu/NPUFunctions.cpp
index f04dd180..3b96cfda 100644
--- a/csrc/npu/NPUFunctions.cpp
+++ b/csrc/npu/NPUFunctions.cpp
@@ -1,5 +1,4 @@
 #include "csrc/npu/NPUFunctions.h"
-#include <unistd.h>
 #include <mutex>
 #include <unordered_map>
 #include "csrc/npu/NPUStream.h"
@@ -7,40 +6,91 @@
 
 namespace c10_npu {
 
-static uint32_t dev_count = 0;
-static thread_local int local_device = -1;
-static std::unordered_map<int8_t, aclrtContext> used_devices;
+static thread_local c10::DeviceIndex local_device = -1;
+// TODO: remove used_devices
+static std::unordered_map<c10::DeviceIndex, aclrtContext> used_devices;
 std::mutex mtx;
 
+int device_count_impl() {
+  int count = 0;
+  NPU_CHECK_ERROR(GetDeviceCount(&count));
+  return count;
+}
+
 c10::DeviceIndex device_count() noexcept {
   // initialize number of devices only once
-  if (dev_count == 0) {
-    aclError error = aclrtGetDeviceCount(&dev_count);
-    if (error != ACL_ERROR_NONE) {
-      ASCEND_LOGE("get device count of NPU failed");
+  static int count = []() {
+    try {
+      auto result = device_count_impl();
+      TORCH_INTERNAL_ASSERT(
+          result <= std::numeric_limits<c10::DeviceIndex>::max(),
+          "Too many NPU devices, DeviceIndex overflowed");
+      return result;
+    } catch (const c10::Error& ex) {
+      // We don't want to fail, but still log the warning
+      // msg() returns the message without the stack trace
+      TORCH_WARN("NPU initialization: ", ex.msg());
       return 0;
     }
-    return static_cast<c10::DeviceIndex>(dev_count);
-  }
-  return static_cast<c10::DeviceIndex>(dev_count);
+  }();
+  return static_cast<c10::DeviceIndex>(count);
 }
 
 c10::DeviceIndex device_count_ensure_non_zero() {
-  unsigned int count = 0;
-
-  NPU_CHECK_ERROR(aclrtGetDeviceCount(&count));
+  // Call the implementation every time to throw the exception
+  int count = device_count_impl();
+  // Zero npus doesn't produce a warning in `device_count` but we fail here
   TORCH_CHECK(count, "No NPUs are available", PTA_ERROR(ErrCode::UNAVAIL));
-
+  TORCH_INTERNAL_ASSERT(
+      count <= std::numeric_limits<c10::DeviceIndex>::max(),
+      "Too many NPU devices, DeviceIndex overflowed");
   return static_cast<c10::DeviceIndex>(count);
 }
 
-aclError GetDevice(int32_t* device) {
+c10::DeviceIndex current_device() {
+  c10::DeviceIndex cur_device = -1;
+  NPU_CHECK_ERROR(c10_npu::GetDevice(&cur_device));
+  return cur_device;
+}
+
+void set_device(c10::DeviceIndex device) {
+  NPU_CHECK_ERROR(c10_npu::SetDevice(device));
+}
+
+void device_synchronize() {
+  NPU_CHECK_ERROR(aclrtSynchronizeDevice());
+}
+
+// Call from NPU-synchronizing code: errors or warns per the sync debug mode.
+void warn_or_error_on_sync() {
+  const SyncDebugMode mode = warning_state().get_sync_debug_mode();
+  if (mode == SyncDebugMode::L_ERROR) {
+    TORCH_CHECK(
+        false, "called a synchronizing NPU operation", PTA_ERROR(ErrCode::ACL));
+  } else if (mode == SyncDebugMode::L_WARN) {
+    TORCH_NPU_WARN("called a synchronizing NPU operation");
+  }
+}
+
+// Wrappers for raw AscendCL device management functions
+aclError GetDeviceCount(int* dev_count) {
+  return aclrtGetDeviceCount(reinterpret_cast<uint32_t*>(dev_count));
+}
+
+aclError GetDevice(c10::DeviceIndex* device) {
   if (local_device >= 0) {
     *device = local_device;
     return ACL_ERROR_NONE;
   }
-  aclError err = aclrtGetDevice(device);
+  int tmp_device = -1;
+  auto err = aclrtGetDevice(&tmp_device);
   if (err == ACL_ERROR_NONE) {
+    TORCH_INTERNAL_ASSERT(
+        tmp_device >= 0 &&
+            tmp_device <= std::numeric_limits<c10::DeviceIndex>::max(),
+        "aclrtGetDevice returns invalid device ",
+        tmp_device);
+    *device = static_cast<c10::DeviceIndex>(tmp_device);
     local_device = *device;
   } else if (
       err == ACL_ERROR_RT_CONTEXT_NULL && aclrtSetDevice(0) == ACL_ERROR_NONE) {
@@ -57,19 +107,6 @@ aclError GetDevice(int32_t* device) {
   return err;
 }
 
-inline bool has_set_pthread_affinity() {
-  int core_nums = sysconf(_SC_NPROCESSORS_ONLN);
-  int count = 0;
-
-  cpu_set_t mask;
-  pthread_getaffinity_np(pthread_self(), sizeof(mask), &mask);
-  for (int i = 0; i < core_nums; i++) {
-    count += CPU_ISSET(i, &mask);
-  }
-
-  return count != core_nums;
-}
-
 aclError SetDevice(c10::DeviceIndex device) {
   TORCH_CHECK(
       device >= 0, "device id must be positive!", PTA_ERROR(ErrCode::VALUE));
@@ -78,26 +115,6 @@ aclError SetDevice(c10::DeviceIndex device) {
     return ACL_ERROR_NONE;
   }
 
-  static const bool set_pthread_affinity = has_set_pthread_affinity();
-  if (!set_pthread_affinity) {
-    uint32_t bind_conf = c10_npu::option::OptionsManager::GetBindCpuConf();
-    // bind_conf=1, bind cores averagely based on device_id
-    if (bind_conf == 1) {
-      int core_nums = sysconf(_SC_NPROCESSORS_ONLN);
-      int device_nums = device_count_ensure_non_zero();
-      int block_size = (core_nums + device_nums - 1) / device_nums;
-      int start_core = device * block_size;
-      int end_core = std::min((device + 1) * block_size, core_nums);
-
-      cpu_set_t mask;
-      CPU_ZERO(&mask);
-      for (int i = start_core; i < end_core; i++) {
-        CPU_SET(i, &mask);
-      }
-      pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
-    }
-  }
-
   aclError err = aclrtSetDevice(device);
   if (err == ACL_ERROR_NONE) {
     local_device = device;
@@ -111,6 +128,15 @@ aclError SetDevice(c10::DeviceIndex device) {
   return err;
 }
 
+// Returns the context recorded for `device`; logs and returns nullptr if none.
+aclrtContext GetDeviceContext(c10::DeviceIndex device) {
+  const auto it = used_devices.find(device);
+  if (it != used_devices.end()) return it->second;
+  ASCEND_LOGE(
+      "NPU device %d has not been initialized! Can not get context", device);
+  return nullptr;
+}
+
 aclError ResetUsedDevices() {
   for (const auto it : used_devices) {
     aclError err = aclrtResetDevice(it.first);
@@ -123,7 +149,7 @@ aclError ResetUsedDevices() {
 }
 
 aclError DestroyUsedStreams() {
-  int32_t cur_device = 0;
+  c10::DeviceIndex cur_device = 0;
   NPU_CHECK_ERROR(GetDevice(&cur_device));
   for (const auto it : used_devices) {
     NPU_CHECK_ERROR(SetDevice(it.first));
@@ -138,7 +164,7 @@ aclError DestroyUsedStreams() {
 }
 
 aclError SynchronizeUsedDevices() {
-  int32_t cur_device = 0;
+  c10::DeviceIndex cur_device = 0;
   NPU_CHECK_ERROR(GetDevice(&cur_device));
   for (const auto it : used_devices) {
     NPU_CHECK_ERROR(SetDevice(it.first));
@@ -151,37 +177,4 @@ aclError SynchronizeUsedDevices() {
   return ACL_ERROR_NONE;
 }
 
-aclrtContext GetDeviceContext(int32_t device) {
-  if (used_devices.find(device) == used_devices.end()) {
-    ASCEND_LOGE(
-        "NPU device %d has been initialized! Can not get context", device);
-    return nullptr;
-  }
-  return used_devices[device];
-}
-
-c10::DeviceIndex current_device() {
-  int cur_device = 0;
-  NPU_CHECK_ERROR(c10_npu::GetDevice(&cur_device));
-  return static_cast<c10::DeviceIndex>(cur_device);
-}
-
-void set_device(c10::DeviceIndex device) {
-  NPU_CHECK_ERROR(c10_npu::SetDevice(device));
-}
-
-void device_synchronize() {
-  NPU_CHECK_ERROR(aclrtSynchronizeDevice());
-}
-
-int ExchangeDevice(int device) {
-  NPU_CHECK_ERROR(SetDevice(device));
-
-  return device;
-}
-
-int GetLocalDevice() {
-  return local_device;
-}
-
 } // namespace c10_npu
diff --git a/csrc/npu/NPUFunctions.h b/csrc/npu/NPUFunctions.h
index 18cad7f0..575e14c4 100644
--- a/csrc/npu/NPUFunctions.h
+++ b/csrc/npu/NPUFunctions.h
@@ -1,11 +1,11 @@
 #pragma once
 
-// This header provides C++ wrappers around commonly used CUDA API functions.
+// This header provides C++ wrappers around commonly used AscendCL API functions.
 // The benefit of using C++ here is that we can raise an exception in the
 // event of an error, rather than explicitly pass around error codes.  This
 // leads to more natural APIs.
 //
-// The naming convention used here matches the naming convention of torch.cuda
+// The naming convention used here matches the naming convention of torch.npu
 
 #include <c10/core/Device.h>
 #include <c10/macros/Macros.h>
@@ -19,56 +19,34 @@ namespace c10_npu {
 
 C10_NPU_API c10::DeviceIndex device_count() noexcept;
 
+// Version of device_count that throws if no devices are detected
 C10_NPU_API c10::DeviceIndex device_count_ensure_non_zero();
 
-/**
- * @ingroup torch_npu
- * @brief get device id from local thread cache preferentially for performance.
- * If the thread cache has not been initialized, it will get from ACL interface:
- * aclrtGetDevice, and initialize the local thread cache.
- * If the context is empty, it will set device 0.
- *
- * @param device [IN]           device id
- * @retval ACL_ERROR_NONE The function is successfully executed.
- * @retval OtherValues Failure
- */
-C10_NPU_API aclError GetDevice(int32_t* device);
-
-/**
- * @ingroup torch_npu
- * @brief set device id by ACL interface: aclrtSetDevice,
- * and update the local thread cache
- *
- * @param device [IN]           device id
- * @retval ACL_ERROR_NONE The function is successfully executed.
- * @retval OtherValues Failure
- */
-C10_NPU_API aclError SetDevice(c10::DeviceIndex device);
+C10_NPU_API c10::DeviceIndex current_device();
 
-/**
- * @ingroup torch_npu
- * @brief reset all device id by ACL interface: aclrtResetDevice.
- *
- * @retval ACL_ERROR_NONE The function is successfully executed.
- * @retval OtherValues Failure
- */
-aclError ResetUsedDevices();
+C10_NPU_API void set_device(c10::DeviceIndex device);
 
-aclError DestroyUsedStreams();
+C10_NPU_API void device_synchronize();
 
-aclError SynchronizeUsedDevices();
+// this function has to be called from callers performing npu synchronizing
+// operations, to raise proper error or warning
+C10_NPU_API void warn_or_error_on_sync();
 
-aclrtContext GetDeviceContext(int32_t device);
+// Raw AscendCL device management functions
+C10_NPU_API aclError GetDeviceCount(int* dev_count);
 
-C10_NPU_API c10::DeviceIndex current_device();
+C10_NPU_API aclError GetDevice(c10::DeviceIndex* device);
 
-C10_NPU_API void set_device(c10::DeviceIndex device);
+C10_NPU_API aclError SetDevice(c10::DeviceIndex device);
 
-C10_NPU_API void device_synchronize();
+C10_NPU_API aclrtContext GetDeviceContext(c10::DeviceIndex device);
+
+// TODO: remove the following three functions
+aclError ResetUsedDevices();
 
-C10_NPU_API int ExchangeDevice(int device);
+aclError DestroyUsedStreams();
 
-int GetLocalDevice();
+aclError SynchronizeUsedDevices();
 
 enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR };
 
@@ -76,8 +54,8 @@ enum class SyncDebugMode { L_DISABLED = 0, L_WARN, L_ERROR };
 // through this global state to determine the synchronization debug mode
 class WarningState {
  public:
-  void set_sync_debug_mode(SyncDebugMode level) {
-    sync_debug_mode = level;
+  void set_sync_debug_mode(SyncDebugMode l) {
+    sync_debug_mode = l;
   }
 
   SyncDebugMode get_sync_debug_mode() {
@@ -88,20 +66,9 @@ class WarningState {
   SyncDebugMode sync_debug_mode = SyncDebugMode::L_DISABLED;
 };
 
-C10_NPU_API inline WarningState& warning_state() {
+C10_NPU_API __inline__ WarningState& warning_state() {
   static WarningState warning_state_;
   return warning_state_;
 }
 
-// this function has to be called from callers performing npu synchronizing
-// operations, to raise proper error or warning
-C10_NPU_API inline void warn_or_error_on_sync() {
-  if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_ERROR) {
-    TORCH_CHECK(
-        false, "called a synchronizing NPU operation", PTA_ERROR(ErrCode::ACL));
-  } else if (warning_state().get_sync_debug_mode() == SyncDebugMode::L_WARN) {
-    TORCH_NPU_WARN("called a synchronizing NPU operation");
-  }
-}
-
 } // namespace c10_npu
diff --git a/csrc/npu/NPUGuardImpl.cpp b/csrc/npu/NPUGuardImpl.cpp
index d120a8ce..bfc123f5 100644
--- a/csrc/npu/NPUGuardImpl.cpp
+++ b/csrc/npu/NPUGuardImpl.cpp
@@ -9,8 +9,6 @@ namespace c10_npu {
 
 namespace impl {
 
-constexpr c10::DeviceType NPUGuardImpl::static_type;
-
 C10_REGISTER_GUARD_IMPL(PrivateUse1, NPUGuardImpl);
 
 #define REGISTER_PRIVATEUSE1_BACKEND(name)                                \
diff --git a/csrc/npu/NPUGuardImpl.h b/csrc/npu/NPUGuardImpl.h
index 4b7d9954..61550e65 100644
--- a/csrc/npu/NPUGuardImpl.h
+++ b/csrc/npu/NPUGuardImpl.h
@@ -8,29 +8,25 @@
 #include <npu/acl/include/acl/acl_base.h>
 #include <npu/acl/include/acl/acl_rt.h>
 #include "csrc/aten/generated/NPUNativeFunctions.h"
-#include "npu/core/NPUException.h"
+#include "csrc/core/impl/PrivateUse1GuardImpl.h"
 #include "csrc/npu/NPUFunctions.h"
 #include "csrc/npu/NPUStream.h"
+#include "npu/core/NPUException.h"
 #include "npu/core/interface/AsyncTaskQueueInterface.h"
 #include "npu/core/sys_ctrl/npu_sys_ctrl.h"
 
 namespace c10_npu {
 namespace impl {
+struct NPUGuardImpl final : public c10_backend::impl::PrivateUse1GuardImpl {
+  NPUGuardImpl() = default;
 
-struct NPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
-  static constexpr c10::DeviceType static_type = c10::DeviceType::PrivateUse1;
-
-  NPUGuardImpl() {}
   explicit NPUGuardImpl(c10::DeviceType t) {
     TORCH_INTERNAL_ASSERT(
-        t == c10::DeviceType::PrivateUse1,
-        "DeviceType must be NPU. Actual DeviceType is: ",
+        t == static_type,
+        "DeviceType must be 'c10::DeviceType::PrivateUse1'. Actual DeviceType is: ",
         t,
         PTA_ERROR(ErrCode::PARAM));
   }
-  c10::DeviceType type() const override {
-    return c10::DeviceType::PrivateUse1;
-  }
   c10::Device exchangeDevice(c10::Device d) const override {
     TORCH_INTERNAL_ASSERT(
         d.type() == c10::DeviceType::PrivateUse1,
@@ -44,14 +40,14 @@ struct NPUGuardImpl final : public c10::impl::DeviceGuardImplInterface {
     return old_device;
   }
   c10::Device getDevice() const override {
-    int device = 0;
+    c10::DeviceIndex device = 0;
     NPU_CHECK_ERROR(c10_npu::GetDevice(&device));
     return c10::Device(c10::DeviceType::PrivateUse1, device);
   }
   void setDevice(c10::Device d) const override {
     TORCH_INTERNAL_ASSERT(
         d.type() == c10::DeviceType::PrivateUse1,
-        "DeviceType must be NPU. Actual DeviceType is: ",
+        "DeviceType must be 'c10::DeviceType::PrivateUse1'. Actual DeviceType is: ",
         d.type(),
         PTA_ERROR(ErrCode::PARAM));
     NPU_CHECK_ERROR(c10_npu::SetDevice(d.index()));
diff --git a/csrc/npu/NPUStream.cpp b/csrc/npu/NPUStream.cpp
index e653fea2..361bf0d2 100644
--- a/csrc/npu/NPUStream.cpp
+++ b/csrc/npu/NPUStream.cpp
@@ -143,7 +143,7 @@ static void initGlobalStreamState() {
       "). Increase that and recompile.",
       PTA_ERROR(ErrCode::VALUE));
 
-  int device_id = 0;
+  c10::DeviceIndex device_id = 0;
   auto ret = c10_npu::GetDevice(&device_id);
   if (ret != ACL_ERROR_NONE) {
     ASCEND_LOGE("Device has not been set");
diff --git a/csrc/npu/THNPUCachingHostAllocator.cpp b/csrc/npu/THNPUCachingHostAllocator.cpp
index a3deb5e5..d6247394 100644
--- a/csrc/npu/THNPUCachingHostAllocator.cpp
+++ b/csrc/npu/THNPUCachingHostAllocator.cpp
@@ -246,7 +246,7 @@ struct HostAllocator {
   aclError insertEvents(Block& block) {
     aclError err = ACL_ERROR_NONE;
 
-    int prev_device = 0;
+    c10::DeviceIndex prev_device = 0;
     err = c10_npu::GetDevice(&prev_device);
     if (err != ACL_ERROR_NONE) {
       return err;
@@ -338,9 +338,7 @@ at::Allocator* getTHNPUCachingHostAllocator() {
 
 c10::Allocator* getPinnedMemoryAllocator() {
   C10_LOG_API_USAGE_ONCE("aten.init.npu");
-  c10_npu::NpuSysCtrl::SysStatus status =
-      c10_npu::NpuSysCtrl::GetInstance().Initialize();
-  if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) {
+  if (!c10_npu::NpuSysCtrl::IsInitializeSuccess()) {
     ASCEND_LOGE("Npu init fail.");
   }
   return getTHNPUCachingHostAllocator();
diff --git a/npu/core/CachingAllocatorHelper.cpp b/npu/core/CachingAllocatorHelper.cpp
index 9fe60e8b..086ce80e 100644
--- a/npu/core/CachingAllocatorHelper.cpp
+++ b/npu/core/CachingAllocatorHelper.cpp
@@ -177,7 +177,7 @@ ExpandableSegment* createExpandableSegment(
   return new NPUExpandableSegment(device, stream, size);
 }
 
-void insertEventWrapper(int device, std::function<void()> fn) {
+void insertEventWrapper(c10::DeviceIndex device, std::function<void()> fn) {
   aclrtContext compiler_ctx = aclrtContext();
   aclError ret_ctx = aclrtGetCurrentContext(&compiler_ctx);
   NPU_CHECK_ERROR(aclrtSetCurrentContext(c10_npu::GetDeviceContext(device)));
diff --git a/npu/core/DeviceUtils.h b/npu/core/DeviceUtils.h
index 243398df..129c2d4d 100644
--- a/npu/core/DeviceUtils.h
+++ b/npu/core/DeviceUtils.h
@@ -46,9 +46,7 @@ inline c10::DeviceType get_npu_device_type() {
 
 inline void maybe_initialize_npu(const at::TensorOptions& options) {
   if (torch_npu::utils::is_npu(options)) {
-    c10_npu::NpuSysCtrl::SysStatus status =
-        c10_npu::NpuSysCtrl::GetInstance().Initialize(options.device().index());
-    if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) {
+    if (!c10_npu::NpuSysCtrl::IsInitializeSuccess(options.device().index())) {
       TORCH_CHECK(
           false,
           "npu device ",
@@ -61,9 +59,7 @@ inline void maybe_initialize_npu(const at::TensorOptions& options) {
 
 inline void maybe_initialize_npu(const at::Device& device) {
   if (torch_npu::utils::is_npu(device)) {
-    c10_npu::NpuSysCtrl::SysStatus status =
-        c10_npu::NpuSysCtrl::GetInstance().Initialize(device.index());
-    if (status != c10_npu::NpuSysCtrl::SysStatus::INIT_SUCC) {
+    if (!c10_npu::NpuSysCtrl::IsInitializeSuccess(device.index())) {
       TORCH_CHECK(
           false,
           "npu device ",
diff --git a/npu/core/NPUException.cpp b/npu/core/NPUException.cpp
index 2474c7c4..13fdcc07 100644
--- a/npu/core/NPUException.cpp
+++ b/npu/core/NPUException.cpp
@@ -38,7 +38,7 @@ void warn_(const ::c10::Warning& warning) {
 
 std::string formatErrorCode(SubModule submodule, ErrCode errorCode) {
   std::ostringstream oss;
-  int deviceIndex = -1;
+  c10::DeviceIndex deviceIndex = -1;
   c10_npu::GetDevice(&deviceIndex);
   auto rank_id = c10_npu::option::OptionsManager::GetRankId();
   oss << "\n[ERROR] " << getCurrentTimestamp() << " (PID:" << getpid()
diff --git a/npu/core/NPUGuard.h b/npu/core/NPUGuard.h
index 816eb5fc..83fbc892 100644
--- a/npu/core/NPUGuard.h
+++ b/npu/core/NPUGuard.h
@@ -4,6 +4,7 @@
 #include <c10/core/impl/InlineDeviceGuard.h>
 #include <c10/core/impl/InlineStreamGuard.h>
 #include "npu/core/NPUMacros.h"
+#include "csrc/core/PrivateUse1Guard.h"
 #include "csrc/npu/NPUGuardImpl.h"
 
 #include <cstddef>
@@ -18,16 +19,9 @@ namespace c10_npu {
 /// more efficient than DeviceGuard (it compiles to straight line
 /// NPUSetDevice/NPUGetDevice calls); however, it can only be used
 /// from code that links against NPU directly.
-struct NPUGuard {
-  /// No default constructor; see Note [Omitted default constructor from RAII]
-  explicit NPUGuard() = delete;
-
-  /// Set the current NPU device to the passed device index.
-  explicit NPUGuard(c10::DeviceIndex device_index) : guard_(device_index) {}
-
-  /// Sets the current NPU device to the passed device.  Errors if the passed
-  /// device is not a NPU device.
-  explicit NPUGuard(c10::Device device) : guard_(device) {}
+struct NPUGuard : public c10_backend::PrivateUse1Guard<impl::NPUGuardImpl> {
+  using PrivateUse1Guard = c10_backend::PrivateUse1Guard<impl::NPUGuardImpl>;
+  using PrivateUse1Guard::PrivateUse1Guard;
 
   // Copy is not allowed
   NPUGuard(const NPUGuard&) = delete;
@@ -36,55 +30,15 @@ struct NPUGuard {
   // Move is not allowed (there is no uninitialized state)
   NPUGuard(NPUGuard&& other) = delete;
   NPUGuard& operator=(NPUGuard&& other) = delete;
-
-  /// Sets the NPU device to the given device.  Errors if the given device
-  /// is not a NPU device.
-  void set_device(c10::Device device) {
-    guard_.set_device(device);
-  }
-
-  /// Sets the NPU device to the given device.  Errors if the given device
-  /// is not a NPU device.  (This method is provided for uniformity with
-  /// DeviceGuard).
-  void reset_device(c10::Device device) {
-    guard_.reset_device(device);
-  }
-
-  /// Sets the NPU device to the given device index.
-  void set_index(c10::DeviceIndex device_index) {
-    guard_.set_index(device_index);
-  }
-
-  /// Returns the device that was set upon construction of the guard
-  c10::Device original_device() const {
-    return guard_.original_device();
-  }
-
-  /// Returns the last device that was set via `set_device`, if any, otherwise
-  /// the device passed during construction.
-  c10::Device current_device() const {
-    return guard_.current_device();
-  }
-
- private:
-  /// The guard for the current device.
-  c10::impl::InlineDeviceGuard<c10_npu::impl::NPUGuardImpl> guard_;
 };
 
 /// A variant of OptionalDeviceGuard that is specialized for NPU.  See
 /// NPUGuard for when you can use this.
-struct OptionalNPUGuard {
-  /// Create an uninitialized OptionalNPUGuard.
-  explicit OptionalNPUGuard() : guard_() {}
-
-  /// Set the current NPU device to the passed Device, if it is not nullopt.
-  explicit OptionalNPUGuard(c10::optional<c10::Device> device_opt)
-      : guard_(device_opt) {}
-
-  /// Set the current NPU device to the passed device index, if it is not
-  /// nullopt
-  explicit OptionalNPUGuard(c10::optional<c10::DeviceIndex> device_index_opt)
-      : guard_(device_index_opt) {}
+struct OptionalNPUGuard
+    : public c10_backend::OptionalPrivateUse1Guard<impl::NPUGuardImpl> {
+  using OptionalPrivateUse1Guard =
+      c10_backend::OptionalPrivateUse1Guard<impl::NPUGuardImpl>;
+  using OptionalPrivateUse1Guard::OptionalPrivateUse1Guard;
 
   // Copy is not allowed
   OptionalNPUGuard(const OptionalNPUGuard&) = delete;
@@ -92,51 +46,8 @@ struct OptionalNPUGuard {
 
   // See Note [Move construction for RAII guards is tricky]
   OptionalNPUGuard(OptionalNPUGuard&& other) = delete;
-
   // See Note [Move assignment for RAII guards is tricky]
   OptionalNPUGuard& operator=(OptionalNPUGuard&& other) = delete;
-
-  /// Sets the NPU device to the given device, initializing the guard if it
-  /// is not already initialized.  Errors if the given device is not a NPU
-  /// device.
-  void set_device(c10::Device device) {
-    guard_.set_device(device);
-  }
-
-  /// Sets the NPU device to the given device, initializing the guard if it is
-  /// not already initialized.  Errors if the given device is not a NPU device.
-  /// (This method is provided for uniformity with OptionalDeviceGuard).
-  void reset_device(c10::Device device) {
-    guard_.reset_device(device);
-  }
-
-  /// Sets the NPU device to the given device index, initializing the guard if
-  /// it is not already initialized.
-  void set_index(c10::DeviceIndex device_index) {
-    guard_.set_index(device_index);
-  }
-
-  /// Returns the device that was set immediately prior to initialization of the
-  /// guard, or nullopt if the guard is uninitialized.
-  c10::optional<c10::Device> original_device() const {
-    return guard_.original_device();
-  }
-
-  /// Returns the most recent device that was set using this device guard,
-  /// either from construction, or via set_device, if the guard is initialized,
-  /// or nullopt if the guard is uninitialized.
-  c10::optional<c10::Device> current_device() const {
-    return guard_.current_device();
-  }
-
-  /// Restore the original NPU device, resetting this guard to uninitialized
-  /// state.
-  void reset() {
-    guard_.reset();
-  }
-
- private:
-  c10::impl::InlineOptionalDeviceGuard<impl::NPUGuardImpl> guard_;
 };
 
 /// A variant of StreamGuard that is specialized for NPU.  See NPUGuard
diff --git a/npu/core/sys_ctrl/npu_sys_ctrl.cpp b/npu/core/sys_ctrl/npu_sys_ctrl.cpp
index a26cc060..2a094ad4 100644
--- a/npu/core/sys_ctrl/npu_sys_ctrl.cpp
+++ b/npu/core/sys_ctrl/npu_sys_ctrl.cpp
@@ -150,6 +150,16 @@ void initCachingAllocator() {
   ASCEND_LOGD("Npu caching allocator initialize successfully");
 }
 
+// Not a passive query: runs Initialize(device_id) and reports INIT_SUCC.
+bool NpuSysCtrl::IsInitializeSuccess(int device_id) {
+  return GetInstance().Initialize(device_id) == SysStatus::INIT_SUCC;
+}
+
+// Not a passive query: runs Finalize() and reports FINALIZE_SUCC.
+bool NpuSysCtrl::IsFinalizeSuccess() {
+  return GetInstance().Finalize() == SysStatus::FINALIZE_SUCC;
+}
+
 // Environment Initialize, return Status: SUCCESS, FAILED
 NpuSysCtrl::SysStatus NpuSysCtrl::Initialize(int device_id) {
   if (init_flag_) {
diff --git a/npu/core/sys_ctrl/npu_sys_ctrl.h b/npu/core/sys_ctrl/npu_sys_ctrl.h
index 08446231..27d595dd 100644
--- a/npu/core/sys_ctrl/npu_sys_ctrl.h
+++ b/npu/core/sys_ctrl/npu_sys_ctrl.h
@@ -37,6 +37,10 @@ class NpuSysCtrl {
   // Get NpuSysCtrl singleton instance
   C10_NPU_API static NpuSysCtrl& GetInstance();
 
+  C10_NPU_API static bool IsInitializeSuccess(int device_id = -1);
+
+  C10_NPU_API static bool IsFinalizeSuccess();
+
   // Environment Initialize, return SysStatus
   SysStatus Initialize(int device_id = -1);
 
diff --git a/npu/framework/OpCommand.cpp b/npu/framework/OpCommand.cpp
index d184f2f9..f4e63b7e 100644
--- a/npu/framework/OpCommand.cpp
+++ b/npu/framework/OpCommand.cpp
@@ -270,7 +270,7 @@ at::Tensor OpCommand::CopyHostToDevice(
 
 at::Tensor OpCommand::CopyHostToDevice(const at::Tensor& cpuTensor) {
   at::Tensor cpuPinMemTensor = cpuTensor.pin_memory();
-  int deviceIndex = 0;
+  c10::DeviceIndex deviceIndex = 0;
   NPU_CHECK_ERROR(c10_npu::GetDevice(&deviceIndex));
   auto tensor = cpuPinMemTensor.to(
       c10::Device(c10::DeviceType::PrivateUse1, deviceIndex),
diff --git a/npu/framework/utils/CalcuOpUtil.cpp b/npu/framework/utils/CalcuOpUtil.cpp
index 9fc8da98..bb5a0084 100644
--- a/npu/framework/utils/CalcuOpUtil.cpp
+++ b/npu/framework/utils/CalcuOpUtil.cpp
@@ -186,7 +186,7 @@ at::Tensor CalcuOpUtil::CopyScalarToDevice(
 
 at::Tensor CalcuOpUtil::CopyTensorHostToDevice(const at::Tensor& cpu_tensor) {
   at::Tensor cpuPinMemTensor = cpu_tensor.pin_memory();
-  int deviceIndex = 0;
+  c10::DeviceIndex deviceIndex = 0;
   NPU_CHECK_ERROR(c10_npu::GetDevice(&deviceIndex));
   return cpuPinMemTensor.to(
       c10::Device(c10::DeviceType::PrivateUse1, deviceIndex),
diff --git a/npu/framework/utils/NpuUtils.cpp b/npu/framework/utils/NpuUtils.cpp
index 2878493b..fc9fb52b 100644
--- a/npu/framework/utils/NpuUtils.cpp
+++ b/npu/framework/utils/NpuUtils.cpp
@@ -256,7 +256,7 @@ at::Tensor NpuUtils::format_contiguous_add_copy_optimize(
 
 bool NpuUtils::IsOomError(aclError ret, int index) {
   if (ret == ACL_ERROR_GE_DEVICE_MEMORY_ALLOCATION_FAILED) {
-    int deviceId = 0;
+    c10::DeviceIndex deviceId = 0;
     // free devcie cached memory when return value of the first op execution is
     // oom
     if (index == 1) {
diff --git a/test/cpp/core/CMakeLists.txt b/test/cpp/core/CMakeLists.txt
index dade077c..f86b855e 100644
--- a/test/cpp/core/CMakeLists.txt
+++ b/test/cpp/core/CMakeLists.txt
@@ -1,9 +1,11 @@
 if(BUILD_TEST)
   set(TORCH_BACKEND_CORE_TEST_SOURCES
-    ${PROJECT_SOURCE_DIR}/test/cpp/common/main.cpp)
+    ${PROJECT_SOURCE_DIR}/test/cpp/common/main.cpp
+    ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp)
 
   add_executable(test_core ${TORCH_BACKEND_CORE_TEST_SOURCES})
   target_link_libraries(test_core PRIVATE torch_npu gtest_main gtest)
+  add_test(NAME test_core COMMAND $<TARGET_FILE:test_core>)
 endif()
 
 if(INSTALL_TEST)
diff --git a/test/cpp/core/device.cpp b/test/cpp/core/device.cpp
new file mode 100644
index 00000000..5ec9c54d
--- /dev/null
+++ b/test/cpp/core/device.cpp
@@ -0,0 +1,9 @@
+#include <gtest/gtest.h>
+
+#include "csrc/core/impl/PrivateUse1GuardImpl.h"
+
+TEST(DeviceGuardTest, StaticType) {
+  using c10_backend::impl::PrivateUse1GuardImpl;
+  // The generic backend guard must identify itself as PrivateUse1.
+  EXPECT_EQ(PrivateUse1GuardImpl::static_type, c10::DeviceType::PrivateUse1);
+}
diff --git a/torch_npu/csrc/Module.cpp b/torch_npu/csrc/Module.cpp
index 0e4f2e48..9facb8d6 100644
--- a/torch_npu/csrc/Module.cpp
+++ b/torch_npu/csrc/Module.cpp
@@ -63,9 +63,7 @@ PyObject* THPModule_npu_shutdown(PyObject* /* unused */) {
   }
 
   ASCEND_LOGI("NPU shutdown NpuSysCtrl Finalize.");
-  c10_npu::NpuSysCtrl::SysStatus status =
-      c10_npu::NpuSysCtrl::GetInstance().Finalize();
-  if (status != c10_npu::NpuSysCtrl::SysStatus::FINALIZE_SUCC) {
+  if (!c10_npu::NpuSysCtrl::IsFinalizeSuccess()) {
     ASCEND_LOGE("NPU shutdown failed.");
   } else {
     ASCEND_LOGI("NPU shutdown success.");
diff --git a/torch_npu/csrc/npu/Module.cpp b/torch_npu/csrc/npu/Module.cpp
index 4e192110..35fd94dc 100644
--- a/torch_npu/csrc/npu/Module.cpp
+++ b/torch_npu/csrc/npu/Module.cpp
@@ -3,15 +3,11 @@
 #include <thread>
 #include <unordered_map>
 
-#include <ATen/ATen.h>
 #include <torch/csrc/Exceptions.h>
 #include <torch/csrc/Generator.h>
 #include <torch/csrc/THP.h>
 #include <torch/csrc/autograd/generated/VariableType.h>
-#include <torch/csrc/autograd/generated/variable_factories.h>
-#include <torch/csrc/autograd/utils/wrap_outputs.h>
 #include <torch/csrc/profiler/python/combined_traceback.h>
-#include <torch/csrc/python_headers.h>
 #include <torch/csrc/utils/device_lazy_init.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/python_arg_parser.h>
@@ -30,11 +26,9 @@
 #include "npu/core/NpuVariables.h"
 #include "npu/core/register/OptionRegister.h"
 #include "npu/core/sys_ctrl/npu_sys_ctrl.h"
-#include "npu/framework/StorageDescHelper.h"
 #include "npu/acl/include/acl/acl.h"
 #include "torch_npu/csrc/npu/Module.h"
 #include "torch_npu/csrc/npu/NPUPluggableAllocator.h"
-#include "torch_npu/csrc/npu/Stream.h"
 #include "torch_npu/csrc/npu/memory_snapshot.h"
 
 struct NPUDeviceProp {
@@ -107,7 +101,7 @@ void RegisterNPUDeviceMemories(PyObject* module) {
       .def_readonly("free_memory", &NPUDeviceMem::freeMem);
 }
 
-NPUDeviceMem* GetDeviceMemories(int64_t deviceid) {
+NPUDeviceMem* GetDeviceMemories(c10::DeviceIndex deviceid) {
   c10_npu::NPUGuard guard(deviceid);
   size_t device_free;
   size_t device_total;
@@ -121,7 +115,7 @@ void BindGetDeviceMemories(PyObject* module) {
   auto m = py::handle(module).cast<py::module>();
   m.def(
       "_npu_getDeviceMemories",
-      [](int deviceid) -> NPUDeviceMem* { return GetDeviceMemories(deviceid); },
+      [](c10::DeviceIndex deviceid) -> NPUDeviceMem* { return GetDeviceMemories(deviceid); },
       py::return_value_policy::reference);
 }
 
@@ -230,9 +224,9 @@ static PyObject* THNPModule_initExtension(PyObject* self, PyObject* noargs) {
       throw python_error();
     }
   };
-  auto num_npus = c10_npu::device_count();
+  c10::DeviceIndex num_npus = c10_npu::device_count();
   auto default_npu_generators = PyTuple_New(static_cast<Py_ssize_t>(num_npus));
-  for (int i = 0; i < num_npus; i++) {
+  for (c10::DeviceIndex i = 0; i < num_npus; i++) {
     auto gen = at_npu::detail::getDefaultNPUGenerator(i);
     auto cast_gen = (THPGenerator*)THPGenerator_initDefaultGenerator(gen);
     // This reference is meant to be given away, so no need to incref here.
@@ -259,13 +253,13 @@ void THNPModule_setDevice(int device) {
 
 PyObject* THNPModule_setDevice_wrap(PyObject* self, PyObject* arg) {
   HANDLE_TH_ERRORS
-  int device = THPUtils_unpackLong(arg);
+  c10::DeviceIndex device = THPUtils_unpackDeviceIndex(arg);
   {
     pybind11::gil_scoped_release no_gil;
     at::globalContext().lazyInitPrivateUse1();
   }
 
-  int pre_device = 0;
+  c10::DeviceIndex pre_device = 0;
   auto ret = c10_npu::GetDevice(&pre_device);
   if (ret != ACL_ERROR_NONE) {
     NPU_CHECK_ERROR(c10_npu::SetDevice(device));
@@ -279,22 +273,16 @@ PyObject* THNPModule_setDevice_wrap(PyObject* self, PyObject* arg) {
 
 PyObject* THNPModule_getDevice_wrap(PyObject* self, PyObject* noargs) {
   HANDLE_TH_ERRORS
-  int device;
+  c10::DeviceIndex device = -1; // sentinel; overwritten by GetDevice below
   torch::utils::device_lazy_init(at::kPrivateUse1);
   NPU_CHECK_ERROR(c10_npu::GetDevice(&device));
-  return PyLong_FromLong(device);
+  return THPUtils_packInt32(device);
   END_HANDLE_TH_ERRORS
 }
 
 PyObject* THNPModule_getDeviceCount_wrap(PyObject* self, PyObject* noargs) {
   HANDLE_TH_ERRORS
-  return PyLong_FromLong(c10_npu::device_count());
-  END_HANDLE_TH_ERRORS
-}
-
-PyObject* THNPModule_getLocalDevice_wrap(PyObject* self, PyObject* noargs) {
-  HANDLE_TH_ERRORS
-  return PyLong_FromLong(c10_npu::GetLocalDevice());
+  return THPUtils_packInt32(c10_npu::device_count());
   END_HANDLE_TH_ERRORS
 }
 
@@ -308,53 +296,14 @@ PyObject* THNPModule_npuCanDeviceAccessPeer_wrap(
     throw torch::TypeError(
         "Pybind failed to parse parameters." + PTA_ERROR(ErrCode::TYPE));
   }
-  int32_t device_id = THPUtils_unpackInt(value_1);
-  int32_t peer_device_id = THPUtils_unpackInt(value_2);
+  c10::DeviceIndex device_id = THPUtils_unpackDeviceIndex(value_1);
+  c10::DeviceIndex peer_device_id = THPUtils_unpackDeviceIndex(value_2);
   auto can_access_peer =
       c10_npu::acl::can_device_access_peer(device_id, peer_device_id);
   return PyBool_FromLong(can_access_peer);
   END_HANDLE_TH_ERRORS
 }
 
-PyObject* THNPModule_getDeviceUtilizationRate_wrap(
-    PyObject* self,
-    PyObject* device_index) {
-  HANDLE_TH_ERRORS
-  TORCH_CHECK(
-      THPUtils_checkLong(device_index),
-      "invalid argument to getDeviceUtilizationRate",
-      PTA_ERROR(ErrCode::VALUE));
-  int32_t device = static_cast<int32_t>(THPUtils_unpackUInt32(device_index));
-  aclrtUtilizationInfo util_info;
-  util_info.cubeUtilization = 0;
-  util_info.vectorUtilization = 0;
-  util_info.utilizationExtend = nullptr;
-  NPU_CHECK_ERROR(
-      c10_npu::acl::AclrtGetDeviceUtilizationRate(device, &util_info));
-  int32_t cube = util_info.cubeUtilization;
-  int32_t vector = util_info.vectorUtilization;
-  int32_t util_rate = 0;
-  // 如果vector和cube谁支持,就返回谁的使用率，如果都支持计算(vector*1+cube*1)/2
-  if (cube == DEVICE_UTILIZATION_NOT_SUPPORT &&
-      vector != DEVICE_UTILIZATION_NOT_SUPPORT) {
-    util_rate = vector;
-  } else if (
-      cube != DEVICE_UTILIZATION_NOT_SUPPORT &&
-      vector == DEVICE_UTILIZATION_NOT_SUPPORT) {
-    util_rate = cube;
-  } else if (
-      cube != DEVICE_UTILIZATION_NOT_SUPPORT &&
-      vector != DEVICE_UTILIZATION_NOT_SUPPORT) {
-    util_rate = (cube + vector) / 2;
-  }
-  TORCH_CHECK(
-      util_rate <= 100 && util_rate >= 0,
-      "invalid result to util_rate",
-      PTA_ERROR(ErrCode::VALUE));
-  return PyLong_FromLong(util_rate);
-  END_HANDLE_TH_ERRORS
-}
-
 PyObject* THNPModule_getCurrentStream_wrap(
     PyObject* /* unused */,
     PyObject* device_index) {
@@ -363,7 +312,7 @@ PyObject* THNPModule_getCurrentStream_wrap(
       THPUtils_checkLong(device_index),
       "invalid argument to getCurrentStream",
       PTA_ERROR(ErrCode::PARAM));
-  int64_t device = THPUtils_unpackLong(device_index);
+  c10::DeviceIndex device = THPUtils_unpackDeviceIndex(device_index);
   auto stream = c10_npu::getCurrentNPUStream(device);
   PyObject* output_tuple = PyTuple_New(3);
   PyTuple_SetItem(
@@ -388,7 +337,7 @@ PyObject* THNPModule_getDefaultStream_wrap(
       THPUtils_checkLong(device_index),
       "invalid argument to getDefaultStream",
       PTA_ERROR(ErrCode::PARAM));
-  int64_t device = THPUtils_unpackLong(device_index);
+  c10::DeviceIndex device = THPUtils_unpackDeviceIndex(device_index);
   auto stream = c10_npu::getDefaultNPUStream(device);
   PyObject* output_tuple = PyTuple_New(3);
   PyTuple_SetItem(
@@ -428,9 +377,11 @@ PyObject* THNPModule_setStream_wrap(
   }
 
   auto stream = c10_npu::NPUStream::unpack3(
-      stream_id, device_index, static_cast<c10::DeviceType>(device_type));
+      stream_id,
+      static_cast<c10::DeviceIndex>(device_index),
+      static_cast<c10::DeviceType>(device_type));
 
-  int device;
+  c10::DeviceIndex device;
   NPU_CHECK_ERROR(c10_npu::GetDevice(&device));
   if (device != stream.device_index()) {
     THNPModule_setDevice(stream.device_index());
@@ -1064,10 +1015,6 @@ static struct PyMethodDef THNPModule_methods[] = {
      (PyCFunction)THNPModule_getDevice_wrap,
      METH_NOARGS,
      nullptr},
-    {"_npu_getLocalDevice",
-     (PyCFunction)THNPModule_getLocalDevice_wrap,
-     METH_NOARGS,
-     nullptr},
     {"_npu_getDeviceCount",
      (PyCFunction)THNPModule_getDeviceCount_wrap,
      METH_NOARGS,
@@ -1076,10 +1023,6 @@ static struct PyMethodDef THNPModule_methods[] = {
      (PyCFunction)THNPModule_npuCanDeviceAccessPeer_wrap,
      METH_VARARGS,
      nullptr},
-    {"_npu_getDeviceUtilizationRate",
-     (PyCFunction)THNPModule_getDeviceUtilizationRate_wrap,
-     METH_O,
-     nullptr},
     {"_npu_getCurrentStream",
      (PyCFunction)THNPModule_getCurrentStream_wrap,
      METH_O,
diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
index 7d5764b1..4f1122f1 100644
--- a/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
+++ b/torch_npu/csrc/npu/NPUPluggableAllocator.cpp
@@ -8,7 +8,7 @@
 
 namespace torch::npu::NPUPluggableAllocator {
 
-int device_count = 0;
+c10::DeviceIndex device_count = 0;
 
 void custom_raw_deleter(void* ptr);
 
@@ -17,7 +17,7 @@ _AllocationMetadata::_AllocationMetadata()
 
 _AllocationMetadata::_AllocationMetadata(
     size_t size,
-    int device_idx,
+    c10::DeviceIndex device_idx,
     aclrtStream stream)
     : size(size), device_idx(device_idx), stream(stream) {}
 
@@ -70,7 +70,7 @@ void NPUPluggableAllocator::set_erase_stream_fn(
 
 void* NPUPluggableAllocator::malloc(
     size_t size,
-    int device,
+    c10::DeviceIndex device,
     aclrtStream stream) {
   void* r = alloc_fn_(size, device, stream);
   {
@@ -81,7 +81,7 @@ void* NPUPluggableAllocator::malloc(
 }
 
 c10::DataPtr NPUPluggableAllocator::allocate(size_t size) {
-  int device = -1;
+  c10::DeviceIndex device = -1;
   NPU_CHECK_ERROR(c10_npu::GetDevice(&device));
   aclrtStream stream = c10_npu::getCurrentNPUStreamNoWait(device);
   void* r = this->malloc(size, device, stream);
@@ -89,8 +89,7 @@ c10::DataPtr NPUPluggableAllocator::allocate(size_t size) {
       r,
       r,
       raw_deleter(),
-      c10::Device(
-          c10::DeviceType::PrivateUse1, static_cast<c10::DeviceIndex>(device))};
+      c10::Device(c10::DeviceType::PrivateUse1, device)};
   return data_ptr;
 }
 
@@ -99,7 +98,7 @@ c10::DeleterFnPtr NPUPluggableAllocator::raw_deleter() const {
 }
 
 void* NPUPluggableAllocator::raw_alloc(size_t nbytes) {
-  int device = -1;
+  c10::DeviceIndex device = -1;
   NPU_CHECK_ERROR(c10_npu::GetDevice(&device));
   aclrtStream stream = c10_npu::getCurrentNPUStreamNoWait(device);
   return malloc(nbytes, device, stream);
@@ -108,14 +107,14 @@ void* NPUPluggableAllocator::raw_alloc(size_t nbytes) {
 void* NPUPluggableAllocator::raw_alloc_with_stream(
     size_t nbytes,
     aclrtStream stream) {
-  int device = -1;
+  c10::DeviceIndex device = -1;
   NPU_CHECK_ERROR(c10_npu::GetDevice(&device));
   return malloc(nbytes, device, stream);
 }
 
 void NPUPluggableAllocator::raw_delete(void* ptr) {
   aclrtStream stream{};
-  int device_idx = -1;
+  c10::DeviceIndex device_idx = -1;
   size_t size = 0;
   {
     const std::lock_guard<std::mutex> lock(allocator_mutex_);
diff --git a/torch_npu/csrc/npu/NPUPluggableAllocator.h b/torch_npu/csrc/npu/NPUPluggableAllocator.h
index 49ec440d..44e1f727 100644
--- a/torch_npu/csrc/npu/NPUPluggableAllocator.h
+++ b/torch_npu/csrc/npu/NPUPluggableAllocator.h
@@ -23,9 +23,9 @@ void changeCurrentAllocator(
 
 struct _AllocationMetadata {
   _AllocationMetadata();
-  _AllocationMetadata(size_t size, int device_idx, aclrtStream stream);
+  _AllocationMetadata(size_t size, c10::DeviceIndex device_idx, aclrtStream stream);
   size_t size;
-  int device_idx;
+  c10::DeviceIndex device_idx;
   aclrtStream stream;
 };
 
@@ -46,7 +46,7 @@ struct NPUPluggableAllocator
       std::function<void(void* ptr, aclrtStream stream)> record_stream_fn);
   void set_erase_stream_fn(
       std::function<void(void* ptr, aclrtStream stream)> erase_stream_fn);
-  void* malloc(size_t size, int device, aclrtStream stream);
+  void* malloc(size_t size, c10::DeviceIndex device, aclrtStream stream);
 
   c10::DataPtr allocate(size_t size) override;
   c10::DeleterFnPtr raw_deleter() const override;
@@ -79,8 +79,8 @@ struct NPUPluggableAllocator
       c10_npu::NPUCachingAllocator::OutOfMemoryObserver observer) override;
 
  protected:
-  std::function<void*(size_t, int, aclrtStream)> alloc_fn_;
-  std::function<void(void*, size_t, int, aclrtStream)> free_fn_;
+  std::function<void*(size_t, c10::DeviceIndex, aclrtStream)> alloc_fn_;
+  std::function<void(void*, size_t, c10::DeviceIndex, aclrtStream)> free_fn_;
   std::function<void(int)> init_fn_;
   std::function<void(bool)> reset_fn_;
   std::function<void(double, int)> memory_fraction_fn_;
diff --git a/torch_npu/csrc/npu/Stream.cpp b/torch_npu/csrc/npu/Stream.cpp
index eeccec88..aa082b39 100644
--- a/torch_npu/csrc/npu/Stream.cpp
+++ b/torch_npu/csrc/npu/Stream.cpp
@@ -19,7 +19,7 @@ static PyObject* THNPStream_pynew(
     PyObject* kwargs) {
   HANDLE_TH_ERRORS
 
-  int current_device;
+  c10::DeviceIndex current_device;
   NPU_CHECK_ERROR(c10_npu::GetDevice(&current_device));
 
   int priority = 0;
diff --git a/torch_npu/npu/__init__.py b/torch_npu/npu/__init__.py
index f76cdbe2..741b6911 100644
--- a/torch_npu/npu/__init__.py
+++ b/torch_npu/npu/__init__.py
@@ -21,7 +21,6 @@
     "set_sync_debug_mode",
     "get_sync_debug_mode",
     "init_dump",
-    "utilization",
     "finalize_dump",
     "manual_seed",
     "manual_seed_all",
@@ -111,7 +110,6 @@
     get_sync_debug_mode,
     init_dump,
     is_bf16_supported,
-    utilization,
     finalize_dump,
     set_dump,
     get_npu_overflow_flag,
diff --git a/torch_npu/npu/device.py b/torch_npu/npu/device.py
index de71a8d0..841a649a 100644
--- a/torch_npu/npu/device.py
+++ b/torch_npu/npu/device.py
@@ -10,7 +10,7 @@
 from torch_npu.utils.error_code import ErrCode, pta_error
 
 __all__ = ["synchronize", "device_count", "can_device_access_peer", "set_device", "current_device", "get_device_name",
-           "get_device_properties", "mem_get_info", "get_device_capability", "utilization", "device", "device_of",
+           "get_device_properties", "mem_get_info", "get_device_capability", "device", "device_of",
            "_get_device_index"]
 
 
@@ -87,16 +87,6 @@ def get_device_capability(device=None):
     return None
 
 
-def utilization(device=None):
-    r"""Query the comprehensive utilization rate of device
-    """
-    device_id = _get_device_index(device, optional=True)
-    if device_id < 0 or device_id >= device_count():
-        raise AssertionError("Invalid device id" + pta_error(ErrCode.VALUE))
-    torch_npu.npu._lazy_init()
-    return torch_npu._C._npu_getDeviceUtilizationRate(device_id)
-
-
 class device(object):
     r"""Context-manager that changes the selected device.
 
diff --git a/torch_npu/npu/utils.py b/torch_npu/npu/utils.py
index 8399c096..a2c5b03c 100644
--- a/torch_npu/npu/utils.py
+++ b/torch_npu/npu/utils.py
@@ -17,14 +17,13 @@
     get_device_properties,
     get_device_capability,
     _get_device_index,
-    utilization,
     device,
     device_of,
     mem_get_info,
 )
 
 __all__ = ["synchronize", "device_count", "can_device_access_peer", "set_device", "current_device", "get_device_name",
-           "get_device_properties", "mem_get_info", "get_device_capability", "utilization", "device", "device_of",
+           "get_device_properties", "mem_get_info", "get_device_capability", "device", "device_of",
            "stream", "set_stream", "current_stream", "default_stream", "set_sync_debug_mode", "get_sync_debug_mode",
            "init_dump", "set_dump", "finalize_dump", "get_soc_version", "is_support_inf_nan", "is_bf16_supported",
            "get_npu_overflow_flag", "npu_check_overflow", "clear_npu_overflow_flag", "current_blas_handle"]
diff --git a/torch_npu/utils/error_code.py b/torch_npu/utils/error_code.py
index 23a8a4be..61dde1e7 100644
--- a/torch_npu/utils/error_code.py
+++ b/torch_npu/utils/error_code.py
@@ -45,13 +45,6 @@ def msg(self):
 
 
 def _format_error_msg(submodule, error_code):
-    def get_device_id():
-        try:
-            import torch_npu._C
-            return torch_npu._C._npu_getLocalDevice()
-        except Exception:
-            return -1
-
     def get_rank_id():
         try:
             import torch.distributed as dist
@@ -60,12 +53,11 @@ def get_rank_id():
         except Exception:
             return -1
 
-    error_msg = "\n[ERROR] {time} (PID:{pid}, Device:{device}, RankID:{rank}) {error_code} {submodule_name} {error_code_msg}"
+    error_msg = "\n[ERROR] {time} (PID:{pid}, RankID:{rank}) {error_code} {submodule_name} {error_code_msg}"
 
     return error_msg.format(
             time=time.strftime("%Y-%m-%d-%H:%M:%S", time.localtime()),
             pid=os.getpid(),
-            device=get_device_id(),
             rank=get_rank_id(),
             error_code="ERR{:0>2d}{:0>3d}".format(submodule.value, error_code.code,),
             submodule_name=submodule.name,