Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Performance] Frequent usage of mutex significantly slows down performance. #816

Open
skpupil opened this issue May 8, 2024 · 3 comments

Comments

@skpupil
Copy link
Contributor

skpupil commented May 8, 2024

We use a simple bandwidth testing program that utilizes send and recv, and we found that asterinas's bandwidth still lags behind Linux by nearly 8 times. We attempted to identify the performance bottleneck and used performance measurement tools to discover that a significant amount of time is consumed by locking and unlocking functions. The function consuming the most time is aster_frame::sync::wait::WaitQueue::wake_one(), rather than functions related to the network protocol stack, which is highly abnormal. Below are the statistics from our performance testing tool. The higher the count for a function, the more time is spent on that function.

time consumption statistics for each function
    110 aster_frame::sync::wait::WaitQueue::wake_one,??,??
     88 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??,aster_nix::vm::vmar::<impl,??,??
     71 syscall_return,??,??
     63 aster_frame::sync::wait::WaitQueue::wake_one,??
     59 _ZN115_$LT$aster_frame..vm..heap_allocator..LockedHeapWithRescue$LT$_$GT$$u20$as$u20$core..alloc..global..GlobalAlloc$GT$5alloc17hfe6236b280b39b8eE.llvm.6362246928768243656,??
     54 aster_frame::arch::x86::interrupts_ack,aster_frame::trap::handler::call_irq_callback_functions,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??
     45 aster_frame::sync::wait::WaitQueue::wake_one,_ZN83_$LT$aster_frame..task..processor..PROCESSOR$u20$as$u20$core..ops..deref..Deref$GT$5deref11__stability4LAZY17hd77884a92564e84cE.llvm.250360732296358197,??,??,aster_frame::task::task::Task::current,??,aster_nix::thread::Thread::current,??,??,??
     36 _ZN115_$LT$aster_frame..vm..heap_allocator..LockedHeapWithRescue$LT$_$GT$$u20$as$u20$core..alloc..global..GlobalAlloc$GT$5alloc17hfe6236b280b39b8eE.llvm.6362246928768243656,??,??,??,??,??,_ZN5alloc7raw_vec11finish_grow17h63093dac982896c1E.llvm.5482989211892863760,??,??,??
     35 smoltcp::iface::interface::Interface::poll,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??
     34 trap_syscall_entry,??,??
     33 aster_frame::arch::x86::interrupts_ack,aster_frame::trap::handler::call_irq_callback_functions,??,??,??,??,??,__from_kernel,??,??
     30 compiler_builtins::mem::memcpy,smoltcp::storage::ring_buffer::RingBuffer<T>::dequeue_slice,??,??,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,??,??
     29 aster_nix::net::iface::any_socket::AnyBoundSocket::on_iface_events,??,??,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,??,??
     29 aster_frame::arch::x86::interrupts_ack,aster_frame::trap::handler::call_irq_callback_functions,??,??,??,??,??,__from_kernel,??
     27 aster_frame::sync::wait::WaitQueue::wake_one,_ZN83_$LT$aster_frame..task..processor..PROCESSOR$u20$as$u20$core..ops..deref..Deref$GT$5deref11__stability4LAZY17hd77884a92564e84cE.llvm.250360732296358197,??,??,aster_frame::task::task::Task::current,??,aster_nix::thread::Thread::current,??
     26 aster_nix::net::iface::common::IfaceCommon::poll,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??
     26 aster_frame::user::UserMode::execute,??
     26 aster_frame::task::processor::preempt,??,??,??
     26 aster_frame::sync::wait::WaitQueue::wait_until,??,??
     25 compiler_builtins::mem::memcpy,smoltcp::storage::ring_buffer::RingBuffer<T>::write_unallocated,??,??,??,??,??,??,??,??
     25 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??,aster_nix::vm::vmar::Vmar_::read,??,??,??,??,??
     25 ??,??,??,??,??,??,??,??
     24 <aster_virtio::device::console::device::ConsoleDevice,??,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,??,??,??,??
     22 aster_nix::thread::task::create_new_user_task::user_task_entry,??,??,??
     22 aster_frame::sync::wait::WaitQueue::wait_until,??,??,??,??,aster_nix::vm::vmar::<impl,??,??
     22 _ZN115_$LT$aster_frame..vm..heap_allocator..LockedHeapWithRescue$LT$_$GT$$u20$as$u20$core..alloc..global..GlobalAlloc$GT$7dealloc17hbea2dee3c25abf03E.llvm.6362246928768243656,??,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,smoltcp::iface::interface::Interface::poll,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??
     22 ??,??
     21 compiler_builtins::mem::memcpy,<aster_frame::vm::dma::dma_coherent::DmaCoherent,??,<aster_virtio::device::network::device::NetworkDevice,??,??,??,aster_network::driver::<impl,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??
     20 compiler_builtins::mem::memcpy,smoltcp::wire::tcp::Repr::emit,??
     20 compiler_builtins::mem::memcpy,<aster_frame::vm::frame::VmFrameVec,??,??,??,??,??,??,aster_nix::vm::vmo::Vmo_::write_bytes,??
     20 ??,??,??
     19 compiler_builtins::mem::memmove,<aster_frame::vm::frame::VmFrameVec,??,??,??,??,??,??,aster_nix::vm::vmo::Vmo_::read_bytes,??
     19 compiler_builtins::mem::memcpy,smoltcp::wire::tcp::Repr::emit,??,??
     19 aster_frame::sync::wait::WaitQueue::wake_one,_ZN83_$LT$aster_frame..task..processor..PROCESSOR$u20$as$u20$core..ops..deref..Deref$GT$5deref11__stability4LAZY17hd77884a92564e84cE.llvm.250360732296358197,??,??
     19 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??,aster_nix::events::subject::Subject<E,F>::notify_observers,??,??,??,??
     18 trap_syscall_entry,<aster_frame::arch::x86::cpu::UserContext,??,??,??
     18 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??,aster_frame::user::UserMode::execute,??
     18 _ZN115_$LT$aster_frame..vm..heap_allocator..LockedHeapWithRescue$LT$_$GT$$u20$as$u20$core..alloc..global..GlobalAlloc$GT$5alloc17hfe6236b280b39b8eE.llvm.6362246928768243656,_ZN83_$LT$aster_frame..task..processor..PROCESSOR$u20$as$u20$core..ops..deref..Deref$GT$5deref11__stability4LAZY17hd77884a92564e84cE.llvm.250360732296358197,??,??,??,??,<aster_nix::net::socket::ip::stream::StreamSocket,??,??,??
     18 <aster_nix::net::socket::ip::stream::StreamSocket,??,??,??,??,??,??,??,??,aster_nix::net::iface::any_socket::AnyBoundSocket::on_iface_events
     17 buddy_system_allocator::Heap<_>::alloc,??
     17 aster_frame::sync::wait::WaitQueue::wake_one,anon.df06effb724f81bbdcdff7d3918a86da.115.llvm,??,??,<alloc::collections::btree::map::ExtractIf<K,V,F,A>,??,??,??,anon.df06effb724f81bbdcdff7d3918a86da.115.llvm,??
     16 aster_frame::sync::wait::WaitQueue::wake_one,_ZN83_$LT$aster_frame..task..processor..PROCESSOR$u20$as$u20$core..ops..deref..Deref$GT$5deref11__stability4LAZY17hd77884a92564e84cE.llvm.250360732296358197,??,??,aster_frame::task::processor::preempt,??,??,??
     16 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??,aster_nix::process::signal::handle_pending_signal,??,??,??,anon.df06effb724f81bbdcdff7d3918a86da.115.llvm,??
     16 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??
     16 ??,??,??,??,??,??,??
     15 aster_nix::net::iface::any_socket::AnyBoundSocket::raw_with,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,??,??,??,??,??,??
     15 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??,aster_nix::vm::vmar::Vmar_::write,??,??,??,??,??
     15 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??,??,aster_nix::vm::vmar::<impl,??,??
     14 smoltcp::socket::tcp::Socket::process,??,??,??,??,??,??,??,??,??
     14 smoltcp::iface::interface::Interface::poll,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,??,??,??
     14 compiler_builtins::mem::memcpy,smoltcp::storage::ring_buffer::RingBuffer<T>::enqueue_slice,??,??,??,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,smoltcp::socket::tcp::Socket::send_slice,??
     14 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??,aster_nix::vm::vmar::vm_mapping::VmMapping::read_bytes,??,??,??,??,??
     14 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??,aster_nix::vm::vmar::Vmar_::write,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,??,??
     14 aster_frame::arch::x86::interrupts_ack,aster_frame::trap::handler::call_irq_callback_functions,??,??,??,??,??,<aster_frame::arch::x86::cpu::UserContext,??,??
     13 aster_nix::syscall::recvfrom::sys_recvfrom,??,??,??,??
     13 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??,aster_nix::vm::vmar::vm_mapping::VmMapping::write_bytes,??,??,??,??,??
     13 _ZN115_$LT$aster_frame..vm..heap_allocator..LockedHeapWithRescue$LT$_$GT$$u20$as$u20$core..alloc..global..GlobalAlloc$GT$5alloc17hfe6236b280b39b8eE.llvm.6362246928768243656,??,??,??,??,??,aster_frame::vm::frame::VmFrameVec::new_with_capacity,??,??,??
     13 _ZN115_$LT$aster_frame..vm..heap_allocator..LockedHeapWithRescue$LT$_$GT$$u20$as$u20$core..alloc..global..GlobalAlloc$GT$5alloc17hfe6236b280b39b8eE.llvm.6362246928768243656,??,??,??,??,??,__rust_alloc_zeroed,??
     13 <aster_virtio::device::console::device::ConsoleDevice,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,??
     12 syscall_return,<aster_frame::arch::x86::cpu::UserContext,??,??,??
     12 smoltcp::iface::interface::InterfaceInner::process_tcp,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??
     12 aster_frame::sync::wait::WaitQueue::wake_one,_ZN83_$LT$aster_frame..task..processor..PROCESSOR$u20$as$u20$core..ops..deref..Deref$GT$5deref11__stability4LAZY17hd77884a92564e84cE.llvm.250360732296358197,??,??,aster_frame::task::task::Task::current,??,aster_nix::thread::task::create_new_user_task::user_task_entry,??,??,??
     11 smoltcp::socket::tcp::Socket::seq_to_transmit,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,??,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,smoltcp::iface::interface::Interface::poll,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656
     11 aster_frame::sync::wait::WaitQueue::wake_one,??,??,??,<aster_nix::net::socket::ip::stream::StreamSocket,??,??
     11 aster_frame::arch::x86::interrupts_ack,aster_frame::trap::handler::call_irq_callback_functions,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,__from_kernel,??,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656
     11 <aster_frame::task::processor::DisablePreemptGuard,aster_nix::net::iface::common::IfaceCommon::poll,_ZN11aster_frame2vm14heap_allocator10HEAP_SPACE17h6195c48f8e90d7d1E.llvm.6362246928768243656,??
     11 <aster_frame::task::processor::DisablePreemptGuard,<aster_nix::net::socket::ip::stream::StreamSocket,??,??,??,??,??,??,??,??

Subsequently, through breakpoint statistics, we analyzed which functions extensively use locks. Below are the specific statistics.

<style> </style>
计数 占比 加锁函数 加锁位置
749 34.01% aster_nix::vm::vmar::vm_mapping::{impl#1}::range VmMappingInner加锁
532 24.16% aster_nix::vm::vmar::vm_mapping::VmMapping::map_to_addr 和range有重叠,VmMappingInner加锁
79 3.59% aster_nix::process::signal::poll::Pollee::add_events BTreeMap<Observer<aster_nix::events::io_events::IoEvents>>加锁
83 3.77% aster_nix::events::subject::Subject 与add_events有重叠 BTreeMap<Observer<aster_nix::events::io_events::IoEvents>>加锁
86 3.91% aster_nix::net::iface::loopback::{impl#2}::poll 与add_event有重叠 Loopback加锁
98 4.45% aster_nix::util::net::get_socket_from_fd FileTable加锁
64 2.91% aster_frame::vm::space::VmSpace::is_mapped MemorySet加锁
32 1.45% aster_nix::vm::vmo::Vmo_::is_cow_vmo xarray加锁
50 2.27% aster_frame::vm::space::VmSpace::activate MemorySet加锁
463 21.03% aster_frame::task::processor::current_task Processor加锁
16 0.73% aster_frame::task::processor::switch_to_task 与current_task有重叠 Processor加锁
200 9.08% aster_nix::process::signal::handle_pending_signal 与current有重叠 SigMask加锁
49 2.23% aster_nix::vm::vmo::Vmo_::commit xarray加锁
98 4.45% aster_nix::vm::vmar::vm_mapping::VmMapping::check_perm VmMappingInner加锁
81 3.68% aster_nix::vm::vmar::vm_mapping::VmMapping::vmo_offset VmMappingInner加锁
32 1.45% aster_frame::vm::space::VmSpace::is_writable MemorySet加锁
2 0.09% aster_nix::process::posix_thread::PosixThread::has_pending_signal SigQueues加锁
150 6.81% aster_nix::thread::task::create_new_user_task::user_task_entry MutexGuard<aster_nix::thread::status::ThreadStatus>>加锁
51 2.32% aster_nix::process::signal::poll::{impl#4}::drop MutexGuard<alloc::collections::btree::map::BTreeMap<keyable_arc::KeyableWeak<aster_nix::process::signal::poll::PolleeInner>PolleeInner加锁

From our current perspective, the bottleneck in network bandwidth performance lies within the memory systems and process management system. The excessive use of locks, particularly mutexes instead of lighter-weight locks, has prevented smoltcp from achieving its nominal bandwidth performance.

Perhaps next, we can focus on improving performance by:
1、Optimizing aster_frame::sync::wait::WaitQueue::wake_one().
2、Reducing the usage of mutexes, or replacing them with lighter-weight locks.

@skpupil skpupil changed the title [Performance] Locking Statistics (using the bandwidth testing program as an example). [Performance] Lock Statistics (using the bandwidth testing program as an example). May 8, 2024
@skpupil skpupil changed the title [Performance] Lock Statistics (using the bandwidth testing program as an example). [Performance] Frequent usage of mutex significantly slows down performance. May 8, 2024
@liqinggd
Copy link
Contributor

liqinggd commented May 9, 2024

Thanks for your discovery. We also found that the current implementation of locks is quite time-consuming when testing RamFS and the latency of a single syscall.

@lrh2000
Copy link
Contributor

lrh2000 commented May 10, 2024

aster_nix::vm::vmar::vm_mapping::{impl#1}::range

Each range() call locks mutexes for 3 times:

impl Interval<usize> for Arc<VmMapping> {
fn range(&self) -> Range<usize> {
self.map_to_addr()..self.map_to_addr() + self.map_size()
}
}

Each call to map_to_addr()/map_size() takes one mutex, so this simple code snippet takes three mutexes in total.

range() is called 5 times per system call (send()/recv()), so the single line self.map_to_addr()..self.map_to_addr() + self.map_size() locks mutexes 15 times per system call, or 34.01% of the total 44 mutex locks per system call.

I think 5 times come from here:

fn find(&'a self, range: &Range<Vaddr>) -> impl IntoIterator<Item = &'a Self::Item> + 'a {
let mut res = Vec::new();
let mut start_cursor = self.lower_bound(core::ops::Bound::Excluded(&range.start));
let start_key = {
start_cursor.move_prev();
if start_cursor.key().is_none()
|| start_cursor.value().unwrap().range().end <= range.start
{
// the start_cursor is pointing to the "ghost" non-element before the first element
// or not intersected
start_cursor.move_next();
}
if start_cursor.key().is_none()
|| start_cursor.value().unwrap().range().start >= range.end
{
// return None if the start_cursor is pointing to the "ghost" non-element after the last element
// or not intersected
return res;
}
start_cursor.key().unwrap()
};
let mut end_cursor = start_cursor.clone();
loop {
if end_cursor.key().is_none() || end_cursor.value().unwrap().range().start >= range.end
{
// the end_cursor is pointing to the "ghost" non-element after the last element
// or not intersected
break;
}
res.push(end_cursor.value().unwrap());
end_cursor.move_next();
}
res
}

for child_vmar_ in inner.child_vmar_s.find(&read_range) {
let child_vmar_range = child_vmar_.range();
if child_vmar_range.start <= read_start && read_end <= child_vmar_range.end {
let child_offset = read_start - child_vmar_range.start;
return child_vmar_.read(child_offset, buf);
}
}

@lrh2000
Copy link
Contributor

lrh2000 commented May 11, 2024

1、Optimizing aster_frame::sync::wait::WaitQueue::wake_one().

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

3 participants