From 6b294ab114fd6e7b17d415ac4dc0883cbded30fe Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Sun, 14 Jun 2026 20:48:02 +0200 Subject: [PATCH 01/14] [#186]: added trace_cpu_frequency and trace_cpu_idle tracepoints in metrics_tracer.rs. Added CpuFrequency structure --- core/src/components/metrics_tracer/src/cpu.rs | 36 +++++ .../metrics_tracer/src/data_structures.rs | 36 +++-- .../src/components/metrics_tracer/src/main.rs | 131 ++++++++++++------ core/src/components/metrics_tracer/src/mod.rs | 3 +- 4 files changed, 151 insertions(+), 55 deletions(-) create mode 100644 core/src/components/metrics_tracer/src/cpu.rs diff --git a/core/src/components/metrics_tracer/src/cpu.rs b/core/src/components/metrics_tracer/src/cpu.rs new file mode 100644 index 00000000..c5b76f58 --- /dev/null +++ b/core/src/components/metrics_tracer/src/cpu.rs @@ -0,0 +1,36 @@ +//tracepoint:power:cpu_frequency +//tracepoint:power:cpu_frequency_limits +//tracepoint:power:cpu_idle +//tracepoint:power:cpu_idle_miss +use aya_ebpf::programs::TracePointContext; +use aya_log_ebpf::info; + +use crate::data_structures::{CPU_FREQUENCY, CpuFrequency}; + +//sys/kernel/tracing/events/power/cpu_frequency + +pub fn cpu_frequency(ctx: TracePointContext) -> Result<(), i64> { + let state_offset = 8; + let cpu_id_offset = 12; + let state: u32 = unsafe { ctx.read_at(state_offset) }?; + let cpu_id: u32 = unsafe { ctx.read_at(cpu_id_offset) }?; + + let cpu_freq_data = CpuFrequency { + cpu_id, + cpu_freq: state, + }; + + CPU_FREQUENCY.output(&ctx, &cpu_freq_data, 0); + Ok(()) +} +//sys/kernel/tracing/events/power/cpu_idle + +pub fn cpu_idle(ctx: TracePointContext) -> Result<(), i64> { + let state_offset = 8; + let cpu_id_offset = 12; + let state: u32 = unsafe { ctx.read_at(state_offset) }?; + let cpu_id: u32 = unsafe { ctx.read_at(cpu_id_offset) }?; + + info!(&ctx, "CPU idle: State: {} cpu_id: {}", state, cpu_id); + Ok(()) +} diff --git a/core/src/components/metrics_tracer/src/data_structures.rs b/core/src/components/metrics_tracer/src/data_structures.rs index e9866a83..ca2d4373 100644 --- a/core/src/components/metrics_tracer/src/data_structures.rs +++ b/core/src/components/metrics_tracer/src/data_structures.rs @@ -1,22 +1,25 @@ -use aya_ebpf::{macros::map, maps::{LruPerCpuHashMap, HashMap, PerfEventArray}}; +use aya_ebpf::{ + macros::map, + maps::{HashMap, LruPerCpuHashMap, PerfEventArray}, +}; pub const TASK_COMM_LEN: usize = 16; -#[repr(C,packed)] +#[repr(C, packed)] pub struct NetworkMetrics { pub tgid: u32, pub comm: [u8; TASK_COMM_LEN], pub ts_us: u64, - pub sk_err: i32, // Offset 284 - pub sk_err_soft: i32, // Offset 600 - pub sk_backlog_len: i32, // Offset 196 - pub sk_write_memory_queued: i32,// Offset 376 - pub sk_receive_buffer_size: i32,// Offset 244 - pub sk_ack_backlog: u32, // Offset 604 - pub sk_drops: i32, // Offset 136 + pub sk_err: i32, // Offset 284 + pub sk_err_soft: i32, // Offset 600 + pub sk_backlog_len: i32, // Offset 196 + pub sk_write_memory_queued: i32, // Offset 376 + pub sk_receive_buffer_size: i32, // Offset 244 + pub sk_ack_backlog: u32, // Offset 604 + pub sk_drops: i32, // Offset 136 } -#[repr(C,packed)] +#[repr(C, packed)] #[derive(Copy, Clone)] pub struct TimeStampStartInfo { pub comm: [u8; TASK_COMM_LEN], @@ -25,7 +28,7 @@ pub struct TimeStampStartInfo { } // Event we send to userspace when latency is computed -#[repr(C,packed)] +#[repr(C, packed)] #[derive(Copy, Clone)] pub struct TimeStampEvent { pub delta_us: u64, @@ -41,6 +44,11 @@ pub struct TimeStampEvent { pub daddr_v6: [u32; 4], } +pub struct CpuFrequency { + pub(crate) cpu_id: u32, + pub(crate) cpu_freq: u32, +} + // Map: connect-start timestamp by socket pointer #[map(name = "time_stamp_start")] pub static mut TIME_STAMP_START: HashMap<*mut core::ffi::c_void, TimeStampStartInfo> = @@ -48,7 +56,11 @@ pub static mut TIME_STAMP_START: HashMap<*mut core::ffi::c_void, TimeStampStartI // Perf event channel for emitting Event to userspace #[map(name = "time_stamp_events")] -pub static mut TIME_STAMP_EVENTS: PerfEventArray = PerfEventArray::::new(0); +pub static mut TIME_STAMP_EVENTS: PerfEventArray = + PerfEventArray::::new(0); #[map(name = "net_metrics")] pub static NET_METRICS: PerfEventArray = PerfEventArray::new(0); + +#[map(name = "cpu_frequency")] +pub static CPU_FREQUENCY: PerfEventArray = PerfEventArray::new(0); diff --git a/core/src/components/metrics_tracer/src/main.rs b/core/src/components/metrics_tracer/src/main.rs index 216a6aca..c4dd7ceb 100644 --- a/core/src/components/metrics_tracer/src/main.rs +++ b/core/src/components/metrics_tracer/src/main.rs @@ -3,22 +3,28 @@ #![allow(warnings)] mod bindings; +mod cpu; mod data_structures; - use core::{mem, ptr}; use crate::bindings::net_device; -use aya_ebpf::helpers::generated::{bpf_ktime_get_ns, bpf_perf_event_output}; +use crate::cpu::{cpu_frequency, cpu_idle}; +use crate::data_structures::NET_METRICS; +use crate::data_structures::{ + NetworkMetrics, TASK_COMM_LEN, TIME_STAMP_EVENTS, TIME_STAMP_START, TimeStampEvent, + TimeStampStartInfo, +}; use aya_ebpf::EbpfContext; -use aya_ebpf::helpers::{bpf_get_current_comm, bpf_probe_read_kernel, bpf_probe_read_kernel_str_bytes}; -use aya_ebpf::macros::{kprobe, map}; -use aya_ebpf::maps::{HashMap, PerfEventArray}; -use aya_ebpf::programs::ProbeContext; use aya_ebpf::helpers::bpf_get_current_pid_tgid; -use crate::data_structures::{NetworkMetrics, TimeStampEvent, TimeStampStartInfo, TASK_COMM_LEN, TIME_STAMP_EVENTS, TIME_STAMP_START}; -use crate::data_structures::NET_METRICS; +use aya_ebpf::helpers::generated::{bpf_ktime_get_ns, bpf_perf_event_output}; +use aya_ebpf::helpers::{ + bpf_get_current_comm, bpf_probe_read_kernel, bpf_probe_read_kernel_str_bytes, +}; +use aya_ebpf::macros::{kprobe, map, tracepoint}; +use aya_ebpf::maps::{HashMap, PerfEventArray}; +use aya_ebpf::programs::{ProbeContext, TracePointContext}; -const AF_INET: u16 = 2; +const AF_INET: u16 = 2; const AF_INET6: u16 = 10; const TCP_SYN_SENT: u8 = 2; @@ -48,13 +54,33 @@ fn try_metrics_tracer(ctx: ProbeContext) -> Result { let sk_ack_backlog_offset = 604; let sk_drops_offset = 136; - let sk_err = unsafe { bpf_probe_read_kernel::(sk_pointer.add(sk_err_offset) as *const i32).map_err(|_| 1)? }; - let sk_err_soft = unsafe { bpf_probe_read_kernel::(sk_pointer.add(sk_err_soft_offset) as *const i32).map_err(|_| 1)? }; - let sk_backlog_len = unsafe { bpf_probe_read_kernel::(sk_pointer.add(sk_backlog_len_offset) as *const i32).map_err(|_| 1)? }; - let sk_write_memory_queued = unsafe { bpf_probe_read_kernel::(sk_pointer.add(sk_write_memory_queued_offset) as *const i32).map_err(|_| 1)? }; - let sk_receive_buffer_size = unsafe { bpf_probe_read_kernel::(sk_pointer.add(sk_receive_buffer_size_offset) as *const i32).map_err(|_| 1)? }; - let sk_ack_backlog = unsafe { bpf_probe_read_kernel::(sk_pointer.add(sk_ack_backlog_offset) as *const u32).map_err(|_| 1)? }; - let sk_drops = unsafe { bpf_probe_read_kernel::(sk_pointer.add(sk_drops_offset) as *const i32).map_err(|_| 1)? }; + let sk_err = unsafe { + bpf_probe_read_kernel::(sk_pointer.add(sk_err_offset) as *const i32).map_err(|_| 1)? + }; + let sk_err_soft = unsafe { + bpf_probe_read_kernel::(sk_pointer.add(sk_err_soft_offset) as *const i32) + .map_err(|_| 1)? + }; + let sk_backlog_len = unsafe { + bpf_probe_read_kernel::(sk_pointer.add(sk_backlog_len_offset) as *const i32) + .map_err(|_| 1)? + }; + let sk_write_memory_queued = unsafe { + bpf_probe_read_kernel::(sk_pointer.add(sk_write_memory_queued_offset) as *const i32) + .map_err(|_| 1)? + }; + let sk_receive_buffer_size = unsafe { + bpf_probe_read_kernel::(sk_pointer.add(sk_receive_buffer_size_offset) as *const i32) + .map_err(|_| 1)? + }; + let sk_ack_backlog = unsafe { + bpf_probe_read_kernel::(sk_pointer.add(sk_ack_backlog_offset) as *const u32) + .map_err(|_| 1)? + }; + let sk_drops = unsafe { + bpf_probe_read_kernel::(sk_pointer.add(sk_drops_offset) as *const i32) + .map_err(|_| 1)? + }; let net_metrics = NetworkMetrics { tgid: tgid, @@ -79,16 +105,21 @@ fn try_metrics_tracer(ctx: ProbeContext) -> Result { // Monitor on tcp_sendmsg, tcp_v4_connect #[kprobe] fn tcp_v6_connect(ctx: ProbeContext) -> u32 { - match on_connect(ctx) { Ok(_) => 0, Err(e) => e as u32 } + match on_connect(ctx) { + Ok(_) => 0, + Err(e) => e as u32, + } } // Monitor on tcp_sendmsg, tcp_v4_connect #[kprobe] fn tcp_v4_connect(ctx: ProbeContext) -> u32 { - match on_connect(ctx) { Ok(_) => 0, Err(e) => e as u32 } + match on_connect(ctx) { + Ok(_) => 0, + Err(e) => e as u32, + } } - fn on_connect(ctx: ProbeContext) -> Result<(), i64> { let sk = ctx.arg::<*mut bindings::sock>(0).ok_or(1i64)?; if sk.is_null() { @@ -107,14 +138,19 @@ fn on_connect(ctx: ProbeContext) -> Result<(), i64> { start.comm.copy_from_slice(&comm); } let map_ptr = &raw mut TIME_STAMP_START; - (*map_ptr).insert(&(sk as *mut core::ffi::c_void), &start, 0).map_err(|_| 1)?; + (*map_ptr) + .insert(&(sk as *mut core::ffi::c_void), &start, 0) + .map_err(|_| 1)?; } Ok(()) } #[kprobe] fn tcp_rcv_state_process(ctx: ProbeContext) -> u32 { - match on_rcv_state_process(ctx) { Ok(_) => 0, Err(e) => e as u32 } + match on_rcv_state_process(ctx) { + Ok(_) => 0, + Err(e) => e as u32, + } } fn on_rcv_state_process(ctx: ProbeContext) -> Result<(), i64> { @@ -122,25 +158,25 @@ fn on_rcv_state_process(ctx: ProbeContext) -> Result<(), i64> { let sk = ctx.arg::<*mut bindings::sock>(0).unwrap_or(ptr::null_mut()); let sk = if sk.is_null() { ctx.arg::<*mut bindings::sock>(1).ok_or(1i64)? - } else { sk }; + } else { + sk + }; if sk.is_null() { return Err(1); } let skc_daddr_off = 0; - let skc_rcv_saddr_off = 4; + let skc_rcv_saddr_off = 4; let skc_dport_off = 12; let skc_num_off = 14; let skc_family_off = 16; let skc_state_off = 18; - let skc_v6_daddr_off = 56; - let skc_v6_rcv_saddr_off = 72; - - let state = unsafe { - bpf_probe_read_kernel::((sk as usize + skc_state_off) as *const u8) - }.map_err(|_| 1)?; + let skc_v6_daddr_off = 56; + let skc_v6_rcv_saddr_off = 72; + let state = unsafe { bpf_probe_read_kernel::((sk as usize + skc_state_off) as *const u8) } + .map_err(|_| 1)?; if state != TCP_SYN_SENT { return Ok(()); @@ -149,7 +185,8 @@ fn on_rcv_state_process(ctx: ProbeContext) -> Result<(), i64> { let start = unsafe { let map_ptr = &raw const TIME_STAMP_START; (*map_ptr).get(&((sk as usize) as *mut core::ffi::c_void)) - }.ok_or(1i64)?; + } + .ok_or(1i64)?; let now = unsafe { bpf_ktime_get_ns() }; let delta = now as i64 - start.ts_ns as i64; if delta <= 0 { @@ -176,16 +213,13 @@ fn on_rcv_state_process(ctx: ProbeContext) -> Result<(), i64> { // family, ports ev.af = unsafe { - bpf_probe_read_kernel::((sk as usize + skc_family_off) as *const u16) - .map_err(|_| 1)? + bpf_probe_read_kernel::((sk as usize + skc_family_off) as *const u16).map_err(|_| 1)? }; ev.lport = unsafe { - bpf_probe_read_kernel::((sk as usize + skc_num_off) as *const u16) - .map_err(|_| 1)? + bpf_probe_read_kernel::((sk as usize + skc_num_off) as *const u16).map_err(|_| 1)? }; ev.dport_be = unsafe { - bpf_probe_read_kernel::((sk as usize + skc_dport_off) as *const u16) - .map_err(|_| 1)? + bpf_probe_read_kernel::((sk as usize + skc_dport_off) as *const u16).map_err(|_| 1)? }; if ev.af == AF_INET { @@ -202,13 +236,13 @@ fn on_rcv_state_process(ctx: ProbeContext) -> Result<(), i64> { for i in 0..4 { ev.saddr_v6[i] = unsafe { bpf_probe_read_kernel::( - (sk as usize + skc_v6_rcv_saddr_off + i * 4) as *const u32 - ).map_err(|_| 1)? + (sk as usize + skc_v6_rcv_saddr_off + i * 4) as *const u32, + ) + .map_err(|_| 1)? }; ev.daddr_v6[i] = unsafe { - bpf_probe_read_kernel::( - (sk as usize + skc_v6_daddr_off + i * 4) as *const u32 - ).map_err(|_| 1)? + bpf_probe_read_kernel::((sk as usize + skc_v6_daddr_off + i * 4) as *const u32) + .map_err(|_| 1)? }; } } @@ -226,11 +260,24 @@ fn on_rcv_state_process(ctx: ProbeContext) -> Result<(), i64> { let _ = (*map_ptr).remove(&((sk as usize) as *mut core::ffi::c_void)); } - Ok(()) } +#[tracepoint] +fn trace_cpu_frequency(ctx: TracePointContext) -> u32 { + match cpu_frequency(ctx) { + Ok(_) => 0, + Err(e) => e as u32, + } +} +#[tracepoint] +fn trace_cpu_idle(ctx: TracePointContext) -> u32 { + match cpu_idle(ctx) { + Ok(_) => 0, + Err(e) => e as u32, + } +} // panic handler #[panic_handler] diff --git a/core/src/components/metrics_tracer/src/mod.rs b/core/src/components/metrics_tracer/src/mod.rs index 76d66830..35ef0d6d 100644 --- a/core/src/components/metrics_tracer/src/mod.rs +++ b/core/src/components/metrics_tracer/src/mod.rs @@ -1,2 +1,3 @@ mod bindings; -mod data_structures; \ No newline at end of file +mod cpu; +mod data_structures; From 936b5d86b4ea3b5eb252778dc5ad1a68530538a9 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Sun, 14 Jun 2026 20:49:14 +0200 Subject: [PATCH 02/14] [#186]: added cpu frequency tracer user space functions to read the data from the perf buffer --- core/common/src/buffer_type.rs | 63 ++++++++++++++++++++++ core/common/src/program_handlers.rs | 52 +++++++++++++++++- core/src/components/metrics/src/helpers.rs | 26 +++++++++ core/src/components/metrics/src/main.rs | 18 ++++++- 4 files changed, 156 insertions(+), 3 deletions(-) diff --git a/core/common/src/buffer_type.rs b/core/common/src/buffer_type.rs index 45d82c81..22037cc3 100644 --- a/core/common/src/buffer_type.rs +++ b/core/common/src/buffer_type.rs @@ -128,6 +128,14 @@ pub struct TimeStampMetrics { } #[cfg(feature = "monitoring-structs")] unsafe impl aya::Pod for TimeStampMetrics {} +#[cfg(feature = "monitoring-structs")] +#[repr(C, packed)] +#[derive(Clone, Copy, Zeroable)] +pub struct CpuFrequency { + pub cpu_id: u32, + pub cpu_freq: u32, +} +unsafe impl aya::Pod for CpuFrequency {} // docs: // This function perform a byte swap from little-endian to big-endian @@ -156,6 +164,8 @@ pub enum BufferType { NetworkMetrics, #[cfg(feature = "monitoring-structs")] TimeStampMetrics, + #[cfg(feature = "monitoring-structs")] + CpuFrequency, } // IDEA: this is an experimental implementation to centralize buffer reading logic @@ -476,6 +486,45 @@ impl BufferType { } } } + + #[cfg(feature = "monitoring-structs")] + pub async fn read_cpu_frequency( + buffers: &mut [BytesMut], + tot_events: i32, + offset: i32, + //exporter: &str, + //metrics: Arc, + ) { + for i in offset..tot_events { + let vec_bytes = &buffers[i as usize]; + if vec_bytes.len() < std::mem::size_of::() { + error!( + "Corrupted Cpu Frequency Metrics data. Raw data: {}. Readed {} bytes expected {} bytes", + vec_bytes + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(" "), + vec_bytes.len(), + std::mem::size_of::() + ); + continue; + } + if vec_bytes.len() >= std::mem::size_of::() { + let cpu_freq_metrics: CpuFrequency = + unsafe { std::ptr::read_unaligned(vec_bytes.as_ptr() as *const _) }; + + //match exporter { + // "otlp" => metrics.record_timestamp_metrics(&time_stamp_event), + // _ => continue, + //} + + let cpu_id = cpu_freq_metrics.cpu_id; + let cpu_freq = cpu_freq_metrics.cpu_freq; + info!("Cpu id: {} Cpu frequency: {}", cpu_id, cpu_freq); + } + } + } } // docs: read buffer function: @@ -548,6 +597,11 @@ pub async fn read_perf_buffer>( ) .await } + #[cfg(feature = "monitoring-structs")] + BufferType::CpuFrequency => { + BufferType::read_cpu_frequency(&mut buffers, tot_events, offset) + .await + } } } } @@ -572,6 +626,8 @@ pub enum BufferSize { NetworkMetricsEvents, #[cfg(feature = "monitoring-structs")] TimeMetricsEvents, + #[cfg(feature = "monitoring-structs")] + CpuFrequency, } #[cfg(feature = "buffer-reader")] impl BufferSize { @@ -587,6 +643,8 @@ impl BufferSize { BufferSize::NetworkMetricsEvents => std::mem::size_of::(), #[cfg(feature = "monitoring-structs")] BufferSize::TimeMetricsEvents => std::mem::size_of::(), + #[cfg(feature = "monitoring-structs")] + BufferSize::CpuFrequency => std::mem::size_of::(), } } pub fn set_buffer(&self) -> Vec { @@ -630,6 +688,11 @@ impl BufferSize { let capacity = self.get_size() * 1024; return vec![BytesMut::with_capacity(capacity); tot_cpu]; } + #[cfg(feature = "monitoring-structs")] + BufferSize::CpuFrequency => { + let capacity = self.get_size() * 1024; + return vec![BytesMut::with_capacity(capacity); tot_cpu]; + } } } } diff --git a/core/common/src/program_handlers.rs b/core/common/src/program_handlers.rs index 347be51f..fc3d6d51 100644 --- a/core/common/src/program_handlers.rs +++ b/core/common/src/program_handlers.rs @@ -1,4 +1,7 @@ -use aya::{Ebpf, programs::KProbe}; +use aya::{ + Ebpf, + programs::{KProbe, TracePoint}, +}; use std::convert::TryInto; use std::sync::{Arc, Mutex}; use tracing::{error, info}; @@ -48,3 +51,50 @@ pub fn load_program( Ok(()) } + +#[cfg(feature = "program-handlers")] +pub fn load_tracepoint_program( + bpf: Arc>, + program_name: &str, + tracepoint_type: &str, + tracepoint_symbol: &str, +) -> Result<(), anyhow::Error> { + let mut bpf_new = bpf + .lock() + .map_err(|e| anyhow::anyhow!("Cannot get value from lock. Reason: {}", e))?; + + // Load and attach the eBPF program + let program: &mut TracePoint = bpf_new + .program_mut(program_name) + .ok_or_else(|| anyhow::anyhow!("Program {} not found", program_name))? + .try_into() + .map_err(|e| anyhow::anyhow!("Failed to convert program: {:?}", e))?; + + // STEP 1: load program + + program + .load() + .map_err(|e| anyhow::anyhow!("Cannot load program: {}. Error: {}", &program_name, e))?; + + // STEP 2: Attach the loaded program to kernel symbol + match program.attach(tracepoint_type, tracepoint_symbol) { + Ok(_) => info!( + "{} program attached successfully to tracepoint {}", + &program_name, &tracepoint_symbol + ), + Err(e) => { + error!( + "Error attaching {} program to tracepoint {}. Reason: {:?}", + &program_name, &tracepoint_symbol, e + ); + return Err(anyhow::anyhow!( + "Failed to attach program {} to tracepoint {}. Reason {:?}", + &program_name, + &tracepoint_symbol, + e + )); + } + }; + + Ok(()) +} diff --git a/core/src/components/metrics/src/helpers.rs b/core/src/components/metrics/src/helpers.rs index 804e9306..8eb01d37 100644 --- a/core/src/components/metrics/src/helpers.rs +++ b/core/src/components/metrics/src/helpers.rs @@ -62,9 +62,14 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a .remove("net_metrics") .expect("Cannot create net_perf_buffer"); + let (_cpu_frequency_events_array, cpu_frequency_perf_buffer) = maps + .remove("cpu_frequency") + .expect("Cannot create cpu_frequency_perf_buffer"); + // Allocate byte-buffers sized for each structure type let net_metrics_buffers = BufferSize::NetworkMetricsEvents.set_buffer(); let time_stamp_events_buffers = BufferSize::TimeMetricsEvents.set_buffer(); + let cpu_frequency_events_buffers = BufferSize::CpuFrequency.set_buffer(); let metrics = Arc::new(Metrics::new(&meter)); @@ -100,6 +105,21 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a }) }; + let cpu_frequency_metrics = { + let metrics = Arc::clone(&metrics); + let mut array_buffers = cpu_frequency_perf_buffer; + let mut buffers = cpu_frequency_events_buffers; + tokio::spawn(async move { + read_perf_buffer( + array_buffers, + buffers, + BufferType::CpuFrequency, + Some(metrics), + ) + .await; + }) + }; + info!("Event listeners started, entering main loop..."); tokio::select! { @@ -115,6 +135,12 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a } } + result = cpu_frequency_metrics => { + if let Err(e) = result { + error!("Cpu frequency events task failed: {:?}", e); + } + } + _ = signal::ctrl_c() => { info!("Ctrl-C received, shutting down..."); } diff --git a/core/src/components/metrics/src/main.rs b/core/src/components/metrics/src/main.rs index 0211be68..b8f3c862 100644 --- a/core/src/components/metrics/src/main.rs +++ b/core/src/components/metrics/src/main.rs @@ -26,7 +26,7 @@ use cortexbrain_common::{ constants, logger::otlp_logger_init, map_handlers::{init_bpf_maps, map_pinner}, - program_handlers::load_program, + program_handlers::{load_program, load_tracepoint_program}, }; #[tokio::main] @@ -46,6 +46,7 @@ async fn main() -> Result<(), anyhow::Error> { let tcp_bpf = bpf.clone(); let tcp_rev_bpf = bpf.clone(); let tcp_v6_bpf = bpf.clone(); + let cpu_frequency = bpf.clone(); info!("Running Ebpf logger"); info!("loading programs"); @@ -53,7 +54,11 @@ async fn main() -> Result<(), anyhow::Error> { let bpf_map_save_path = env::var(constants::PIN_MAP_PATH).context("PIN_MAP_PATH environment variable required")?; - let map_data = vec!["time_stamp_events".to_string(), "net_metrics".to_string()]; + let map_data = vec![ + "time_stamp_events".to_string(), + "net_metrics".to_string(), + "cpu_frequency".to_string(), + ]; match init_bpf_maps(bpf.clone(), map_data) { Ok(bpf_maps) => { @@ -85,6 +90,15 @@ async fn main() -> Result<(), anyhow::Error> { .context( "An error occurred during the execution of load_program function", )?; + load_tracepoint_program( + cpu_frequency, + "trace_cpu_frequency", + "power", + "cpu_frequency", + ) + .context( + "An error occurred during the execution of load_program function", + )?; } // Hand off to the async event consumer From f240ff9e55a3ca48346e22059f04f08fe7477741 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Sun, 14 Jun 2026 20:51:09 +0200 Subject: [PATCH 03/14] [#186]: updated metrics.yaml kubernetes manifest. Added tracefs path using volumeMounts --- core/src/testing/metrics.yaml | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/core/src/testing/metrics.yaml b/core/src/testing/metrics.yaml index 8a6c7d82..ce797950 100644 --- a/core/src/testing/metrics.yaml +++ b/core/src/testing/metrics.yaml @@ -19,7 +19,7 @@ spec: hostNetwork: true containers: - name: metrics - image: lorenzotettamanti/cortexflow-metrics:otel-test-2 + image: lorenzotettamanti/cortexflow-metrics:otel-test-5 command: ["/bin/bash", "-c"] args: - | @@ -47,6 +47,9 @@ spec: - name: kernel-dev mountPath: /lib/modules readOnly: false + - name: tracefs + mountPath: /sys/kernel/debug + readOnly: false securityContext: privileged: true allowPrivilegeEscalation: true @@ -71,6 +74,9 @@ spec: - name: kernel-dev mountPath: /lib/modules readOnly: false + - name: tracefs + mountPath: /sys/kernel/debug + readOnly: false securityContext: privileged: true allowPrivilegeEscalation: true @@ -94,3 +100,10 @@ spec: hostPath: path: /lib/modules type: Directory + + - name: tracefs + hostPath: + path: /sys/kernel/debug + type: Directory + + \ No newline at end of file From 9802b0db756019882bd73a925059ffd4d8f3e7f2 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Sat, 20 Jun 2026 14:33:34 +0200 Subject: [PATCH 04/14] (feat): added memory tracing in kernel space using sys_enter_mmap tracepoint --- core/src/components/metrics_tracer/src/cpu.rs | 35 ++++++------ .../metrics_tracer/src/data_structures.rs | 21 ++++++- .../src/components/metrics_tracer/src/main.rs | 56 ++++++++++++++++++- .../components/metrics_tracer/src/memory.rs | 16 ++++++ core/src/components/metrics_tracer/src/mod.rs | 1 + 5 files changed, 106 insertions(+), 23 deletions(-) create mode 100644 core/src/components/metrics_tracer/src/memory.rs diff --git a/core/src/components/metrics_tracer/src/cpu.rs b/core/src/components/metrics_tracer/src/cpu.rs index c5b76f58..02eae0f1 100644 --- a/core/src/components/metrics_tracer/src/cpu.rs +++ b/core/src/components/metrics_tracer/src/cpu.rs @@ -2,35 +2,34 @@ //tracepoint:power:cpu_frequency_limits //tracepoint:power:cpu_idle //tracepoint:power:cpu_idle_miss -use aya_ebpf::programs::TracePointContext; +use aya_ebpf::{EbpfContext, programs::TracePointContext}; use aya_log_ebpf::info; use crate::data_structures::{CPU_FREQUENCY, CpuFrequency}; -//sys/kernel/tracing/events/power/cpu_frequency - -pub fn cpu_frequency(ctx: TracePointContext) -> Result<(), i64> { +pub fn cpu_idle(ctx: TracePointContext) -> Result<(), i64> { let state_offset = 8; let cpu_id_offset = 12; let state: u32 = unsafe { ctx.read_at(state_offset) }?; let cpu_id: u32 = unsafe { ctx.read_at(cpu_id_offset) }?; - let cpu_freq_data = CpuFrequency { - cpu_id, - cpu_freq: state, - }; - - CPU_FREQUENCY.output(&ctx, &cpu_freq_data, 0); + info!(&ctx, "CPU idle: State: {} cpu_id: {}", state, cpu_id); Ok(()) } -//sys/kernel/tracing/events/power/cpu_idle -pub fn cpu_idle(ctx: TracePointContext) -> Result<(), i64> { - let state_offset = 8; - let cpu_id_offset = 12; - let state: u32 = unsafe { ctx.read_at(state_offset) }?; - let cpu_id: u32 = unsafe { ctx.read_at(cpu_id_offset) }?; +pub fn per_cpu_bytes_alloc(ctx: &TracePointContext) -> Result<((u32, u32, [u8; 16])), i64> { + let bytes_alloc_offset = 64; + let pid_offset = 4; + let bytes_alloc = unsafe { ctx.read_at(bytes_alloc_offset) }?; + let pid = unsafe { ctx.read_at(pid_offset) }?; + let command = ctx.command()?; - info!(&ctx, "CPU idle: State: {} cpu_id: {}", state, cpu_id); - Ok(()) + //let cpu_freq_data = CpuFrequency { + // cpu_id, + // cpu_freq: state, + //}; + + //CPU_FREQUENCY.output(&ctx, &cpu_freq_data, 0); + + Ok((bytes_alloc, pid, command)) } diff --git a/core/src/components/metrics_tracer/src/data_structures.rs b/core/src/components/metrics_tracer/src/data_structures.rs index ca2d4373..877cc991 100644 --- a/core/src/components/metrics_tracer/src/data_structures.rs +++ b/core/src/components/metrics_tracer/src/data_structures.rs @@ -44,9 +44,23 @@ pub struct TimeStampEvent { pub daddr_v6: [u32; 4], } +#[repr(C, packed)] +#[derive(Copy, Clone)] pub struct CpuFrequency { - pub(crate) cpu_id: u32, - pub(crate) cpu_freq: u32, + //pub(crate) cpu_id: u32, + //pub(crate) cpu_freq: u32, + pub(crate) bytes_alloc: u32, + pub(crate) pid: u32, + pub(crate) command: [u8; 16], +} + +#[repr(C, packed)] +#[derive(Copy, Clone)] +pub struct MemAlloc { + pub(crate) tgid: u32, + pub(crate) length: u64, + pub(crate) addr: u64, + pub(crate) command: [u8; 16], } // Map: connect-start timestamp by socket pointer @@ -64,3 +78,6 @@ pub static NET_METRICS: PerfEventArray = PerfEventArray::new(0); #[map(name = "cpu_frequency")] pub static CPU_FREQUENCY: PerfEventArray = PerfEventArray::new(0); + +#[map(name = "mem_alloc")] +pub static MEM_ALLOC: PerfEventArray = PerfEventArray::new(0); diff --git a/core/src/components/metrics_tracer/src/main.rs b/core/src/components/metrics_tracer/src/main.rs index c4dd7ceb..bdc44550 100644 --- a/core/src/components/metrics_tracer/src/main.rs +++ b/core/src/components/metrics_tracer/src/main.rs @@ -5,15 +5,20 @@ mod bindings; mod cpu; mod data_structures; -use core::{mem, ptr}; +mod memory; use crate::bindings::net_device; -use crate::cpu::{cpu_frequency, cpu_idle}; +use crate::cpu::{cpu_idle, per_cpu_bytes_alloc}; +use crate::data_structures::CPU_FREQUENCY; +use crate::data_structures::CpuFrequency; +use crate::data_structures::MEM_ALLOC; +use crate::data_structures::MemAlloc; use crate::data_structures::NET_METRICS; use crate::data_structures::{ NetworkMetrics, TASK_COMM_LEN, TIME_STAMP_EVENTS, TIME_STAMP_START, TimeStampEvent, TimeStampStartInfo, }; +use crate::memory::enter_mmap; use aya_ebpf::EbpfContext; use aya_ebpf::helpers::bpf_get_current_pid_tgid; use aya_ebpf::helpers::generated::{bpf_ktime_get_ns, bpf_perf_event_output}; @@ -23,6 +28,7 @@ use aya_ebpf::helpers::{ use aya_ebpf::macros::{kprobe, map, tracepoint}; use aya_ebpf::maps::{HashMap, PerfEventArray}; use aya_ebpf::programs::{ProbeContext, TracePointContext}; +use core::{mem, ptr}; const AF_INET: u16 = 2; const AF_INET6: u16 = 10; @@ -265,7 +271,7 @@ fn on_rcv_state_process(ctx: ProbeContext) -> Result<(), i64> { #[tracepoint] fn trace_cpu_frequency(ctx: TracePointContext) -> u32 { - match cpu_frequency(ctx) { + match trace_cpu_metrics(&ctx) { Ok(_) => 0, Err(e) => e as u32, } @@ -279,6 +285,50 @@ fn trace_cpu_idle(ctx: TracePointContext) -> u32 { } } +fn trace_cpu_metrics(ctx: &TracePointContext) -> Result<(), i64> { + let (bytes_alloc, pid, command) = per_cpu_bytes_alloc(ctx)?; + //let (cpu_id, cpu_freq) = cpu_frequency(&ctx)?; + let cpu_metrics = CpuFrequency { + // cpu_id, + // cpu_freq, + bytes_alloc, + pid, + command, + }; + + unsafe { CPU_FREQUENCY.output(ctx, &cpu_metrics, 0) }; + + Ok(()) +} + +/// Tracepoint attached to `syscalls:sys_enter_mmap`. +/// +/// Emits a `MemAlloc` event for every `mmap` syscall. No PID/command filter +/// is applied yet (see the next update), so this will generate events for every +/// process in the system. +#[tracepoint] +fn trace_enter_mmap(ctx: TracePointContext) -> u32 { + match trace_memory_allocation(&ctx) { + Ok(_) => 0, + Err(e) => e as u32, + } +} + +fn trace_memory_allocation(ctx: &TracePointContext) -> Result<(), i64> { + let (tgid, addr, length, command) = enter_mmap(ctx)?; + + let memory_alloc_metrics = MemAlloc { + tgid, + addr, + length, + command, + }; + + unsafe { MEM_ALLOC.output(ctx, &memory_alloc_metrics, 0) }; + + Ok(()) +} + // panic handler #[panic_handler] fn panic(_info: &core::panic::PanicInfo) -> ! { diff --git a/core/src/components/metrics_tracer/src/memory.rs b/core/src/components/metrics_tracer/src/memory.rs new file mode 100644 index 00000000..de14c7a8 --- /dev/null +++ b/core/src/components/metrics_tracer/src/memory.rs @@ -0,0 +1,16 @@ +use aya_ebpf::{EbpfContext, programs::TracePointContext}; + +/// Read the fields of the `syscalls:sys_enter_mmap` tracepoint. +pub fn enter_mmap(ctx: &TracePointContext) -> Result<((u32, u64, u64, [u8; 16])), i64> { + // For syscall tracepoints `common_pid` is the TGID of the calling thread. + let tgid_offset = 4; + let addr_offset = 16; + let len_offset = 24; + + let tgid: u32 = unsafe { ctx.read_at(tgid_offset) }?; + let addr: u64 = unsafe { ctx.read_at(addr_offset) }?; + let len: u64 = unsafe { ctx.read_at(len_offset) }?; + let command = ctx.command()?; + + Ok((tgid, addr, len, command)) +} diff --git a/core/src/components/metrics_tracer/src/mod.rs b/core/src/components/metrics_tracer/src/mod.rs index 35ef0d6d..56f80cc7 100644 --- a/core/src/components/metrics_tracer/src/mod.rs +++ b/core/src/components/metrics_tracer/src/mod.rs @@ -1,3 +1,4 @@ mod bindings; mod cpu; mod data_structures; +mod memory; From ce341e58fdd69d4314636717d9fcb937046d73e7 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Sat, 20 Jun 2026 14:36:52 +0200 Subject: [PATCH 05/14] (feat: #186): added cpu_bytes_alloc_events_total, cpy_bytes_alloc mem_alloc_events_total, enter_mem_alloc metrics --- core/common/src/otel_metrics.rs | 75 ++++++++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/core/common/src/otel_metrics.rs b/core/common/src/otel_metrics.rs index f123b3b4..c256cc1d 100644 --- a/core/common/src/otel_metrics.rs +++ b/core/common/src/otel_metrics.rs @@ -11,7 +11,7 @@ //! extracted from the eBPF struct, allowing downstream collectors to group //! telemetry by process. -use crate::buffer_type::{NetworkMetrics, TimeStampMetrics}; +use crate::buffer_type::{CpuFrequency, MemAlloc, NetworkMetrics, TimeStampMetrics}; use opentelemetry::KeyValue; use opentelemetry::metrics::{Counter, Gauge, Histogram, Meter}; pub struct Metrics { @@ -35,6 +35,18 @@ pub struct Metrics { /// Histogram of `ts_us` values seen in both `net_metrics` and /// `time_stamp_events`. pub ts_us: Histogram, + + /// Cpu bytes alloc total events + pub cpu_bytes_alloc_events_total: Counter, + + /// Cpu bytes allocation + pub cpu_bytes_alloc: Gauge, + + /// Total number of memory allocation (mmap) events processed. + pub mem_alloc_events_total: Counter, + + /// Observed bytes requested via mmap syscalls. + pub enter_mem_alloc: Gauge, } impl Metrics { @@ -66,7 +78,7 @@ impl Metrics { // delta microseconds let delta_us = meter - .u64_histogram("cortexbrain_delta_us") + .u64_histogram("delta_us") .with_description("Distribution of delta_us values from timestamp events") .build(); @@ -76,6 +88,30 @@ impl Metrics { .with_description("Distribution of timestamp values from eBPF events") .build(); + // cpu bytes alloc total events + let cpu_bytes_alloc_events_total = meter + .u64_counter("bytes_alloc_events_total") + .with_description("Total bytes_alloc events occuring in the CPU") + .build(); + + // cpu bytes allocation + let cpu_bytes_alloc = meter + .i64_gauge("cpu_bytes_alloc") + .with_description("Cpu bytes allocation per event") + .build(); + + // memory allocation (mmap) events total + let mem_alloc_events_total = meter + .u64_counter("mem_alloc_events_total") + .with_description("Total number of memory allocation (mmap) events processed") + .build(); + + // bytes requested via mmap syscalls + let enter_mem_alloc = meter + .i64_gauge("enter_mem_alloc") + .with_description("Bytes requested via mmap syscalls") + .build(); + Self { events_total, packets_total, @@ -83,6 +119,10 @@ impl Metrics { sk_err, delta_us, ts_us, + cpu_bytes_alloc, + cpu_bytes_alloc_events_total, + mem_alloc_events_total, + enter_mem_alloc, } } @@ -130,4 +170,35 @@ impl Metrics { self.delta_us.record(m.delta_us, attrs); self.ts_us.record(m.ts_us, attrs); } + + pub fn record_cpu_bytes_alloc(&self, m: &CpuFrequency) { + let bytes_allocated = m.bytes_alloc; + let tgid = m.pid; // percpu tracepoints expose TGID in common_pid + let comm = String::from_utf8_lossy(&m.command); + let command = comm.trim_end_matches('\0').to_string(); + let attrs = &[ + KeyValue::new("tgid", tgid as i64), + KeyValue::new("command", command), + ]; + self.cpu_bytes_alloc_events_total.add(1, attrs); + self.cpu_bytes_alloc.record(bytes_allocated as i64, attrs); + } + + /// Record a single [`MemAlloc`] event (mmap syscall). + /// + /// Increments the dedicated `mem_alloc_events_total` counter and records + /// the requested length in the `enter_mem_alloc` gauge. The shared + /// `events_total` counter is intentionally **not** incremented for these + /// events. + pub fn record_enter_mem_alloc(&self, m: &MemAlloc) { + let comm = String::from_utf8_lossy(&m.command); + let command = comm.trim_end_matches('\0').to_string(); + let attrs = &[ + KeyValue::new("tgid", m.tgid as i64), + KeyValue::new("command", command), + ]; + + self.mem_alloc_events_total.add(1, attrs); + self.enter_mem_alloc.record(m.length as i64, attrs); + } } From 12ae5b52d6ccdbe74a1de4c4f4155b26e63acf95 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Sat, 20 Jun 2026 14:43:38 +0200 Subject: [PATCH 06/14] (feat : #186): added userspace consumer for memory allocation events. Fixed typo in percpu_alloc_percpu tracepoint --- core/common/src/buffer_type.rs | 126 ++++++++++++++++++--- core/src/components/metrics/src/helpers.rs | 26 +++++ core/src/components/metrics/src/main.rs | 15 ++- 3 files changed, 150 insertions(+), 17 deletions(-) diff --git a/core/common/src/buffer_type.rs b/core/common/src/buffer_type.rs index 22037cc3..63b01c23 100644 --- a/core/common/src/buffer_type.rs +++ b/core/common/src/buffer_type.rs @@ -132,11 +132,26 @@ unsafe impl aya::Pod for TimeStampMetrics {} #[repr(C, packed)] #[derive(Clone, Copy, Zeroable)] pub struct CpuFrequency { - pub cpu_id: u32, - pub cpu_freq: u32, + //pub cpu_id: u32, + //pub cpu_freq: u32, + pub bytes_alloc: u32, + pub pid: u32, + pub command: [u8; 16], } unsafe impl aya::Pod for CpuFrequency {} +#[cfg(feature = "monitoring-structs")] +#[repr(C, packed)] +#[derive(Clone, Copy, Zeroable)] +pub struct MemAlloc { + pub tgid: u32, + pub length: u64, + pub addr: u64, + pub command: [u8; TASK_COMM_LEN], +} +#[cfg(feature = "monitoring-structs")] +unsafe impl aya::Pod for MemAlloc {} + // docs: // This function perform a byte swap from little-endian to big-endian // It's used to reconstruct the correct IPv4 address from the u32 representation @@ -166,10 +181,10 @@ pub enum BufferType { TimeStampMetrics, #[cfg(feature = "monitoring-structs")] CpuFrequency, + #[cfg(feature = "monitoring-structs")] + MemAlloc, } -// IDEA: this is an experimental implementation to centralize buffer reading logic -// TODO: add variant for cortexflow API exporter #[cfg(feature = "buffer-reader")] impl BufferType { #[cfg(feature = "network-structs")] @@ -492,8 +507,8 @@ impl BufferType { buffers: &mut [BytesMut], tot_events: i32, offset: i32, - //exporter: &str, - //metrics: Arc, + exporter: &str, + metrics: Arc, ) { for i in offset..tot_events { let vec_bytes = &buffers[i as usize]; @@ -514,14 +529,69 @@ impl BufferType { let cpu_freq_metrics: CpuFrequency = unsafe { std::ptr::read_unaligned(vec_bytes.as_ptr() as *const _) }; - //match exporter { - // "otlp" => metrics.record_timestamp_metrics(&time_stamp_event), - // _ => continue, - //} + match exporter { + "otlp" => metrics.record_cpu_bytes_alloc(&cpu_freq_metrics), + _ => continue, + } + + //let cpu_id = cpu_freq_metrics.cpu_id; + //let cpu_freq = cpu_freq_metrics.cpu_freq; + let bytes_alloc = cpu_freq_metrics.bytes_alloc; + //info!( + // "Cpu id: {} Cpu frequency: {} Bytes alloc: {}", + // cpu_id, cpu_freq, bytes_alloc + //); + let pid = cpu_freq_metrics.pid; + let command = cpu_freq_metrics.command; + info!( + "Cpu Bytes alloc: {} pid : {} command: {:?}", + bytes_alloc, pid, command + ); + } + } + } + + #[cfg(feature = "monitoring-structs")] + pub async fn read_mem_alloc( + buffers: &mut [BytesMut], + tot_events: i32, + offset: i32, + exporter: &str, + metrics: Arc, + ) { + for i in offset..tot_events { + let vec_bytes = &buffers[i as usize]; + if vec_bytes.len() < std::mem::size_of::() { + error!( + "Corrupted MemAlloc data. Raw data: {}. Readed {} bytes expected {} bytes", + vec_bytes + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(" "), + vec_bytes.len(), + std::mem::size_of::() + ); + continue; + } + if vec_bytes.len() >= std::mem::size_of::() { + let mem_alloc: MemAlloc = + unsafe { std::ptr::read_unaligned(vec_bytes.as_ptr() as *const _) }; + + match exporter { + "otlp" => metrics.record_enter_mem_alloc(&mem_alloc), + _ => continue, + } + + let tgid = mem_alloc.tgid; + let command = String::from_utf8_lossy(&mem_alloc.command); + let addr = mem_alloc.addr; + let length = mem_alloc.length; - let cpu_id = cpu_freq_metrics.cpu_id; - let cpu_freq = cpu_freq_metrics.cpu_freq; - info!("Cpu id: {} Cpu frequency: {}", cpu_id, cpu_freq); + info!( + "MemAlloc - tgid: {}, command: {}, addr: {}, length: {}", + tgid, command, addr, length + ); } } } @@ -599,8 +669,25 @@ pub async fn read_perf_buffer>( } #[cfg(feature = "monitoring-structs")] BufferType::CpuFrequency => { - BufferType::read_cpu_frequency(&mut buffers, tot_events, offset) - .await + BufferType::read_cpu_frequency( + &mut buffers, + tot_events, + offset, + "otlp", + metrics.clone().expect("Metric required for CpuFrequency"), + ) + .await + } + #[cfg(feature = "monitoring-structs")] + BufferType::MemAlloc => { + BufferType::read_mem_alloc( + &mut buffers, + tot_events, + offset, + "otlp", + metrics.clone().expect("Metric required for MemAlloc"), + ) + .await } } } @@ -628,6 +715,8 @@ pub enum BufferSize { TimeMetricsEvents, #[cfg(feature = "monitoring-structs")] CpuFrequency, + #[cfg(feature = "monitoring-structs")] + MemAlloc, } #[cfg(feature = "buffer-reader")] impl BufferSize { @@ -645,6 +734,8 @@ impl BufferSize { BufferSize::TimeMetricsEvents => std::mem::size_of::(), #[cfg(feature = "monitoring-structs")] BufferSize::CpuFrequency => std::mem::size_of::(), + #[cfg(feature = "monitoring-structs")] + BufferSize::MemAlloc => std::mem::size_of::(), } } pub fn set_buffer(&self) -> Vec { @@ -693,6 +784,11 @@ impl BufferSize { let capacity = self.get_size() * 1024; return vec![BytesMut::with_capacity(capacity); tot_cpu]; } + #[cfg(feature = "monitoring-structs")] + BufferSize::MemAlloc => { + let capacity = self.get_size() * 1024; + return vec![BytesMut::with_capacity(capacity); tot_cpu]; + } } } } diff --git a/core/src/components/metrics/src/helpers.rs b/core/src/components/metrics/src/helpers.rs index 8eb01d37..ba63c9f9 100644 --- a/core/src/components/metrics/src/helpers.rs +++ b/core/src/components/metrics/src/helpers.rs @@ -66,10 +66,15 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a .remove("cpu_frequency") .expect("Cannot create cpu_frequency_perf_buffer"); + let (_mem_alloc_array, mem_alloc_perf_buffer) = maps + .remove("mem_alloc") + .expect("Cannot create mem_alloc perf buffer"); + // Allocate byte-buffers sized for each structure type let net_metrics_buffers = BufferSize::NetworkMetricsEvents.set_buffer(); let time_stamp_events_buffers = BufferSize::TimeMetricsEvents.set_buffer(); let cpu_frequency_events_buffers = BufferSize::CpuFrequency.set_buffer(); + let mem_alloc_buffers = BufferSize::MemAlloc.set_buffer(); let metrics = Arc::new(Metrics::new(&meter)); @@ -120,6 +125,21 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a }) }; + let mem_alloc_metrics = { + let metrics = Arc::clone(&metrics); + let mut array_buffers = mem_alloc_perf_buffer; + let mut buffers = mem_alloc_buffers; + tokio::spawn(async move { + read_perf_buffer( + array_buffers, + buffers, + BufferType::MemAlloc, + Some(metrics), + ) + .await; + }) + }; + info!("Event listeners started, entering main loop..."); tokio::select! { @@ -141,6 +161,12 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a } } + result = mem_alloc_metrics => { + if let Err(e) = result { + error!("MemAlloc events task failed: {:?}", e); + } + } + _ = signal::ctrl_c() => { info!("Ctrl-C received, shutting down..."); } diff --git a/core/src/components/metrics/src/main.rs b/core/src/components/metrics/src/main.rs index b8f3c862..591b4b3c 100644 --- a/core/src/components/metrics/src/main.rs +++ b/core/src/components/metrics/src/main.rs @@ -47,6 +47,7 @@ async fn main() -> Result<(), anyhow::Error> { let tcp_rev_bpf = bpf.clone(); let tcp_v6_bpf = bpf.clone(); let cpu_frequency = bpf.clone(); + let mem_alloc_bpf = bpf.clone(); info!("Running Ebpf logger"); info!("loading programs"); @@ -58,6 +59,7 @@ async fn main() -> Result<(), anyhow::Error> { "time_stamp_events".to_string(), "net_metrics".to_string(), "cpu_frequency".to_string(), + "mem_alloc".to_string(), ]; match init_bpf_maps(bpf.clone(), map_data) { @@ -93,8 +95,17 @@ async fn main() -> Result<(), anyhow::Error> { load_tracepoint_program( cpu_frequency, "trace_cpu_frequency", - "power", - "cpu_frequency", + "percpu", + "percpu_alloc_percpu", + ) + .context( + "An error occurred during the execution of load_program function", + )?; + load_tracepoint_program( + mem_alloc_bpf, + "trace_enter_mmap", + "syscalls", + "sys_enter_mmap", ) .context( "An error occurred during the execution of load_program function", From fc158c8a223391750cdab9237bad24f9f0a574f2 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Sun, 21 Jun 2026 22:48:36 +0200 Subject: [PATCH 07/14] (feat: #186) : Added scheduler tracing metrics (sched_stat_wait,sched_stat_runtime) --- core/src/components/metrics_tracer/src/cpu.rs | 24 +++++++++ .../metrics_tracer/src/data_structures.rs | 22 ++++++++ .../src/components/metrics_tracer/src/main.rs | 52 +++++++++++++++++-- 3 files changed, 94 insertions(+), 4 deletions(-) diff --git a/core/src/components/metrics_tracer/src/cpu.rs b/core/src/components/metrics_tracer/src/cpu.rs index 02eae0f1..ba545150 100644 --- a/core/src/components/metrics_tracer/src/cpu.rs +++ b/core/src/components/metrics_tracer/src/cpu.rs @@ -33,3 +33,27 @@ pub fn per_cpu_bytes_alloc(ctx: &TracePointContext) -> Result<((u32, u32, [u8; 1 Ok((bytes_alloc, pid, command)) } + +pub fn sched_stat_wait(ctx: &TracePointContext) -> Result<((u32, u64, [u8; 16])), i64> { + let pid_offset = 4; + let delay_offset = 16; + + let pid = unsafe { ctx.read_at(pid_offset) }?; + + let delay = unsafe { ctx.read_at(delay_offset) }?; + let command = ctx.command()?; + + Ok((pid, delay, command)) +} + +pub fn sched_stat_runtime(ctx: &TracePointContext) -> Result<((u32, u64, [u8; 16])), i64> { + let pid_offset = 4; + let runtime_offset = 16; + + let pid = unsafe { ctx.read_at(pid_offset) }?; + + let runtime = unsafe { ctx.read_at(runtime_offset) }?; + let command = ctx.command()?; + + Ok((pid, runtime, command)) +} diff --git a/core/src/components/metrics_tracer/src/data_structures.rs b/core/src/components/metrics_tracer/src/data_structures.rs index 877cc991..91249a14 100644 --- a/core/src/components/metrics_tracer/src/data_structures.rs +++ b/core/src/components/metrics_tracer/src/data_structures.rs @@ -63,6 +63,22 @@ pub struct MemAlloc { pub(crate) command: [u8; 16], } +#[repr(C, packed)] +#[derive(Copy, Clone)] +pub struct SchedStatWait { + pub(crate) tgid: u32, + pub(crate) delay: u64, + pub(crate) command: [u8; 16], +} + +#[repr(C, packed)] +#[derive(Copy, Clone)] +pub struct SchedStatRuntime { + pub(crate) tgid: u32, + pub(crate) runtime: u64, + pub(crate) command: [u8; 16], +} + // Map: connect-start timestamp by socket pointer #[map(name = "time_stamp_start")] pub static mut TIME_STAMP_START: HashMap<*mut core::ffi::c_void, TimeStampStartInfo> = @@ -81,3 +97,9 @@ pub static CPU_FREQUENCY: PerfEventArray = PerfEventArray::new(0); #[map(name = "mem_alloc")] pub static MEM_ALLOC: PerfEventArray = PerfEventArray::new(0); + +#[map(name = "sched_stat_wait")] +pub static SCHED_STAT_WAIT: PerfEventArray = PerfEventArray::new(0); + +#[map(name = "sched_stat_runtime")] +pub static SCHED_STAT_RUNTIME: PerfEventArray = PerfEventArray::new(0); diff --git a/core/src/components/metrics_tracer/src/main.rs b/core/src/components/metrics_tracer/src/main.rs index bdc44550..1ff160ad 100644 --- a/core/src/components/metrics_tracer/src/main.rs +++ b/core/src/components/metrics_tracer/src/main.rs @@ -8,12 +8,12 @@ mod data_structures; mod memory; use crate::bindings::net_device; -use crate::cpu::{cpu_idle, per_cpu_bytes_alloc}; -use crate::data_structures::CPU_FREQUENCY; +use crate::cpu::{cpu_idle, per_cpu_bytes_alloc, sched_stat_runtime, sched_stat_wait}; use crate::data_structures::CpuFrequency; -use crate::data_structures::MEM_ALLOC; -use crate::data_structures::MemAlloc; use crate::data_structures::NET_METRICS; +use crate::data_structures::{CPU_FREQUENCY, SchedStatWait}; +use crate::data_structures::{MEM_ALLOC, SCHED_STAT_RUNTIME, SCHED_STAT_WAIT}; +use crate::data_structures::{MemAlloc, SchedStatRuntime}; use crate::data_structures::{ NetworkMetrics, TASK_COMM_LEN, TIME_STAMP_EVENTS, TIME_STAMP_START, TimeStampEvent, TimeStampStartInfo, @@ -329,6 +329,50 @@ fn trace_memory_allocation(ctx: &TracePointContext) -> Result<(), i64> { Ok(()) } +#[tracepoint] +fn trace_sched_stat_wait(ctx: TracePointContext) -> u32 { + match sched_stat_wait_tracer(&ctx) { + Ok(_) => 0, + Err(e) => e as u32, + } +} + +fn sched_stat_wait_tracer(ctx: &TracePointContext) -> Result<(), i64> { + let (tgid, delay, command) = sched_stat_wait(ctx)?; + + let sched_stat_wait_data = SchedStatWait { + tgid, + delay, + command, + }; + + unsafe { SCHED_STAT_WAIT.output(ctx, &sched_stat_wait_data, 0) }; + + Ok(()) +} + +#[tracepoint] +fn trace_sched_stat_runtime(ctx: TracePointContext) -> u32 { + match sched_stat_runtime_tracer(&ctx) { + Ok(_) => 0, + Err(e) => e as u32, + } +} + +fn sched_stat_runtime_tracer(ctx: &TracePointContext) -> Result<(), i64> { + let (tgid, runtime, command) = sched_stat_runtime(ctx)?; + + let sched_stat_runtime_data = SchedStatRuntime { + tgid, + runtime, + command, + }; + + unsafe { SCHED_STAT_RUNTIME.output(ctx, &sched_stat_runtime_data, 0) }; + + Ok(()) +} + // panic handler #[panic_handler] fn panic(_info: &core::panic::PanicInfo) -> ! { From 84ca27fb5e712ae0beec0ca30ac80af9b730e898 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Sun, 21 Jun 2026 22:49:32 +0200 Subject: [PATCH 08/14] (feat: #186) : added userspace consumer for scheduler metrics --- core/common/src/buffer_type.rs | 154 +++++++++++++++++++++ core/common/src/otel_metrics.rs | 54 +++++++- core/src/components/metrics/src/helpers.rs | 52 +++++++ core/src/components/metrics/src/main.rs | 22 +++ 4 files changed, 281 insertions(+), 1 deletion(-) diff --git a/core/common/src/buffer_type.rs b/core/common/src/buffer_type.rs index 63b01c23..887d855a 100644 --- a/core/common/src/buffer_type.rs +++ b/core/common/src/buffer_type.rs @@ -152,6 +152,28 @@ pub struct MemAlloc { #[cfg(feature = "monitoring-structs")] unsafe impl aya::Pod for MemAlloc {} +#[cfg(feature = "monitoring-structs")] +#[repr(C, packed)] +#[derive(Clone, Copy, Zeroable)] +pub struct SchedStatWait { + pub tgid: u32, + pub delay: u64, + pub command: [u8; TASK_COMM_LEN], +} +#[cfg(feature = "monitoring-structs")] +unsafe impl aya::Pod for SchedStatWait {} + +#[cfg(feature = "monitoring-structs")] +#[repr(C, packed)] +#[derive(Clone, Copy, Zeroable)] +pub struct SchedStatRuntime { + pub tgid: u32, + pub runtime: u64, + pub command: [u8; TASK_COMM_LEN], +} +#[cfg(feature = "monitoring-structs")] +unsafe impl aya::Pod for SchedStatRuntime {} + // docs: // This function perform a byte swap from little-endian to big-endian // It's used to reconstruct the correct IPv4 address from the u32 representation @@ -183,6 +205,10 @@ pub enum BufferType { CpuFrequency, #[cfg(feature = "monitoring-structs")] MemAlloc, + #[cfg(feature = "monitoring-structs")] + SchedStatWait, + #[cfg(feature = "monitoring-structs")] + SchedStatRuntime, } #[cfg(feature = "buffer-reader")] @@ -595,6 +621,94 @@ impl BufferType { } } } + + #[cfg(feature = "monitoring-structs")] + pub async fn read_sched_stat_wait( + buffers: &mut [BytesMut], + tot_events: i32, + offset: i32, + exporter: &str, + metrics: Arc, + ) { + for i in offset..tot_events { + let vec_bytes = &buffers[i as usize]; + if vec_bytes.len() < std::mem::size_of::() { + error!( + "Corrupted SchedStatWait data. Raw data: {}. Readed {} bytes expected {} bytes", + vec_bytes + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(" "), + vec_bytes.len(), + std::mem::size_of::() + ); + continue; + } + if vec_bytes.len() >= std::mem::size_of::() { + let sched_stat_wait: SchedStatWait = + unsafe { std::ptr::read_unaligned(vec_bytes.as_ptr() as *const _) }; + + match exporter { + "otlp" => metrics.record_sched_stat_wait(&sched_stat_wait), + _ => continue, + } + + let tgid = sched_stat_wait.tgid; + let command = String::from_utf8_lossy(&sched_stat_wait.command); + let delay = sched_stat_wait.delay; + + info!( + "SchedStatWait - tgid: {}, command: {}, delay: {}", + tgid, command, delay + ); + } + } + } + + #[cfg(feature = "monitoring-structs")] + pub async fn read_sched_stat_runtime( + buffers: &mut [BytesMut], + tot_events: i32, + offset: i32, + exporter: &str, + metrics: Arc, + ) { + for i in offset..tot_events { + let vec_bytes = &buffers[i as usize]; + if vec_bytes.len() < std::mem::size_of::() { + error!( + "Corrupted SchedStatRuntime data. Raw data: {}. Readed {} bytes expected {} bytes", + vec_bytes + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(" "), + vec_bytes.len(), + std::mem::size_of::() + ); + continue; + } + if vec_bytes.len() >= std::mem::size_of::() { + let sched_stat_runtime: SchedStatRuntime = + unsafe { std::ptr::read_unaligned(vec_bytes.as_ptr() as *const _) }; + + match exporter { + "otlp" => metrics.record_sched_stat_runtime(&sched_stat_runtime), + _ => continue, + } + + let tgid = sched_stat_runtime.tgid; + let command = String::from_utf8_lossy(&sched_stat_runtime.command); + let runtime = sched_stat_runtime.runtime; + + info!( + "SchedStatRuntime - tgid: {}, command: {}, runtime: {}", + tgid, command, runtime + ); + } + } + } } // docs: read buffer function: @@ -689,6 +803,28 @@ pub async fn read_perf_buffer>( ) .await } + #[cfg(feature = "monitoring-structs")] + BufferType::SchedStatWait => { + BufferType::read_sched_stat_wait( + &mut buffers, + tot_events, + offset, + "otlp", + metrics.clone().expect("Metric required for SchedStatWait"), + ) + .await + } + #[cfg(feature = "monitoring-structs")] + BufferType::SchedStatRuntime => { + BufferType::read_sched_stat_runtime( + &mut buffers, + tot_events, + offset, + "otlp", + metrics.clone().expect("Metric required for SchedStatRuntime"), + ) + .await + } } } } @@ -717,6 +853,10 @@ pub enum BufferSize { CpuFrequency, #[cfg(feature = "monitoring-structs")] MemAlloc, + #[cfg(feature = "monitoring-structs")] + SchedStatWait, + #[cfg(feature = "monitoring-structs")] + SchedStatRuntime, } #[cfg(feature = "buffer-reader")] impl BufferSize { @@ -736,6 +876,10 @@ impl BufferSize { BufferSize::CpuFrequency => std::mem::size_of::(), #[cfg(feature = "monitoring-structs")] BufferSize::MemAlloc => std::mem::size_of::(), + #[cfg(feature = "monitoring-structs")] + BufferSize::SchedStatWait => std::mem::size_of::(), + #[cfg(feature = "monitoring-structs")] + BufferSize::SchedStatRuntime => std::mem::size_of::(), } } pub fn set_buffer(&self) -> Vec { @@ -789,6 +933,16 @@ impl BufferSize { let capacity = self.get_size() * 1024; return vec![BytesMut::with_capacity(capacity); tot_cpu]; } + #[cfg(feature = "monitoring-structs")] + BufferSize::SchedStatWait => { + let capacity = self.get_size() * 1024; + return vec![BytesMut::with_capacity(capacity); tot_cpu]; + } + #[cfg(feature = "monitoring-structs")] + BufferSize::SchedStatRuntime => { + let capacity = self.get_size() * 1024; + return vec![BytesMut::with_capacity(capacity); tot_cpu]; + } } } } diff --git a/core/common/src/otel_metrics.rs b/core/common/src/otel_metrics.rs index c256cc1d..37611729 100644 --- a/core/common/src/otel_metrics.rs +++ b/core/common/src/otel_metrics.rs @@ -11,7 +11,9 @@ //! extracted from the eBPF struct, allowing downstream collectors to group //! telemetry by process. -use crate::buffer_type::{CpuFrequency, MemAlloc, NetworkMetrics, TimeStampMetrics}; +use crate::buffer_type::{ + CpuFrequency, MemAlloc, NetworkMetrics, SchedStatRuntime, SchedStatWait, TimeStampMetrics, +}; use opentelemetry::KeyValue; use opentelemetry::metrics::{Counter, Gauge, Histogram, Meter}; pub struct Metrics { @@ -47,6 +49,12 @@ pub struct Metrics { /// Observed bytes requested via mmap syscalls. pub enter_mem_alloc: Gauge, + + /// Observed scheduler wait time in nanoseconds (sched_stat_wait). + pub sched_stat_wait: Gauge, + + /// Observed scheduler runtime in nanoseconds (sched_stat_runtime). + pub sched_stat_runtime: Gauge, } impl Metrics { @@ -112,6 +120,18 @@ impl Metrics { .with_description("Bytes requested via mmap syscalls") .build(); + // scheduler wait time in nanoseconds + let sched_stat_wait = meter + .i64_gauge("sched_stat_wait") + .with_description("Scheduler wait time in nanoseconds from sched_stat_wait") + .build(); + + // scheduler runtime in nanoseconds + let sched_stat_runtime = meter + .i64_gauge("sched_stat_runtime") + .with_description("Scheduler runtime in nanoseconds from sched_stat_runtime") + .build(); + Self { events_total, packets_total, @@ -123,6 +143,8 @@ impl Metrics { cpu_bytes_alloc_events_total, mem_alloc_events_total, enter_mem_alloc, + sched_stat_wait, + sched_stat_runtime, } } @@ -201,4 +223,34 @@ impl Metrics { self.mem_alloc_events_total.add(1, attrs); self.enter_mem_alloc.record(m.length as i64, attrs); } + + /// Record a single [`SchedStatWait`] event. + /// + /// Records `delay` in the `sched_stat_wait` gauge. No shared or dedicated + /// counter is incremented, as requested. + pub fn record_sched_stat_wait(&self, m: &SchedStatWait) { + let comm = String::from_utf8_lossy(&m.command); + let command = comm.trim_end_matches('\0').to_string(); + let attrs = &[ + KeyValue::new("tgid", m.tgid as i64), + KeyValue::new("command", command), + ]; + + self.sched_stat_wait.record(m.delay as i64, attrs); + } + + /// Record a single [`SchedStatRuntime`] event. + /// + /// Records `runtime` in the `sched_stat_runtime` gauge. No shared or + /// dedicated counter is incremented, as requested. + pub fn record_sched_stat_runtime(&self, m: &SchedStatRuntime) { + let comm = String::from_utf8_lossy(&m.command); + let command = comm.trim_end_matches('\0').to_string(); + let attrs = &[ + KeyValue::new("tgid", m.tgid as i64), + KeyValue::new("command", command), + ]; + + self.sched_stat_runtime.record(m.runtime as i64, attrs); + } } diff --git a/core/src/components/metrics/src/helpers.rs b/core/src/components/metrics/src/helpers.rs index ba63c9f9..95232c53 100644 --- a/core/src/components/metrics/src/helpers.rs +++ b/core/src/components/metrics/src/helpers.rs @@ -70,11 +70,21 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a .remove("mem_alloc") .expect("Cannot create mem_alloc perf buffer"); + let (_sched_stat_wait_array, sched_stat_wait_perf_buffer) = maps + .remove("sched_stat_wait") + .expect("Cannot create sched_stat_wait perf buffer"); + + let (_sched_stat_runtime_array, sched_stat_runtime_perf_buffer) = maps + .remove("sched_stat_runtime") + .expect("Cannot create sched_stat_runtime perf buffer"); + // Allocate byte-buffers sized for each structure type let net_metrics_buffers = BufferSize::NetworkMetricsEvents.set_buffer(); let time_stamp_events_buffers = BufferSize::TimeMetricsEvents.set_buffer(); let cpu_frequency_events_buffers = BufferSize::CpuFrequency.set_buffer(); let mem_alloc_buffers = BufferSize::MemAlloc.set_buffer(); + let sched_stat_wait_buffers = BufferSize::SchedStatWait.set_buffer(); + let sched_stat_runtime_buffers = BufferSize::SchedStatRuntime.set_buffer(); let metrics = Arc::new(Metrics::new(&meter)); @@ -140,6 +150,36 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a }) }; + let sched_stat_wait_metrics = { + let metrics = Arc::clone(&metrics); + let mut array_buffers = sched_stat_wait_perf_buffer; + let mut buffers = sched_stat_wait_buffers; + tokio::spawn(async move { + read_perf_buffer( + array_buffers, + buffers, + BufferType::SchedStatWait, + Some(metrics), + ) + .await; + }) + }; + + let sched_stat_runtime_metrics = { + let metrics = Arc::clone(&metrics); + let mut array_buffers = sched_stat_runtime_perf_buffer; + let mut buffers = sched_stat_runtime_buffers; + tokio::spawn(async move { + read_perf_buffer( + array_buffers, + buffers, + BufferType::SchedStatRuntime, + Some(metrics), + ) + .await; + }) + }; + info!("Event listeners started, entering main loop..."); tokio::select! { @@ -167,6 +207,18 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a } } + result = sched_stat_wait_metrics => { + if let Err(e) = result { + error!("SchedStatWait events task failed: {:?}", e); + } + } + + result = sched_stat_runtime_metrics => { + if let Err(e) = result { + error!("SchedStatRuntime events task failed: {:?}", e); + } + } + _ = signal::ctrl_c() => { info!("Ctrl-C received, shutting down..."); } diff --git a/core/src/components/metrics/src/main.rs b/core/src/components/metrics/src/main.rs index 591b4b3c..a3657b26 100644 --- a/core/src/components/metrics/src/main.rs +++ b/core/src/components/metrics/src/main.rs @@ -48,6 +48,8 @@ async fn main() -> Result<(), anyhow::Error> { let tcp_v6_bpf = bpf.clone(); let cpu_frequency = bpf.clone(); let mem_alloc_bpf = bpf.clone(); + let sched_stat_wait_bpf = bpf.clone(); + let sched_stat_runtime_bpf = bpf.clone(); info!("Running Ebpf logger"); info!("loading programs"); @@ -60,6 +62,8 @@ async fn main() -> Result<(), anyhow::Error> { "net_metrics".to_string(), "cpu_frequency".to_string(), "mem_alloc".to_string(), + "sched_stat_wait".to_string(), + "sched_stat_runtime".to_string(), ]; match init_bpf_maps(bpf.clone(), map_data) { @@ -110,6 +114,24 @@ async fn main() -> Result<(), anyhow::Error> { .context( "An error occurred during the execution of load_program function", )?; + load_tracepoint_program( + sched_stat_wait_bpf, + "trace_sched_stat_wait", + "sched", + "sched_stat_wait", + ) + .context( + "An error occurred during the execution of load_program function", + )?; + load_tracepoint_program( + sched_stat_runtime_bpf, + "trace_sched_stat_runtime", + "sched", + "sched_stat_runtime", + ) + .context( + "An error occurred during the execution of load_program function", + )?; } // Hand off to the async event consumer From c8141f474af825df6c7f2b6fc6c6165bd82eca6d Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Mon, 22 Jun 2026 14:29:11 +0200 Subject: [PATCH 09/14] (example): added grafana dashboard example --- Examples/dashboard-template.yaml | 656 +++++++++++++++++++++++++++++++ 1 file changed, 656 insertions(+) create mode 100644 Examples/dashboard-template.yaml diff --git a/Examples/dashboard-template.yaml b/Examples/dashboard-template.yaml new file mode 100644 index 00000000..84b6f930 --- /dev/null +++ b/Examples/dashboard-template.yaml @@ -0,0 +1,656 @@ +apiVersion: dashboard.grafana.app/v2 +kind: Dashboard +metadata: + name: adzxtsp + namespace: default + uid: d2def6af-04c7-43d3-9f90-969b4bc414ee + resourceVersion: '1782049084952992' + generation: 32 + creationTimestamp: '2026-06-04T18:44:45Z' + labels: + grafana.app/deprecatedInternalID: '4461471636946944' + annotations: + grafana.app/createdBy: user:afo4ysywj3xmof + grafana.app/folder: '' + grafana.app/saved-from-ui: Grafana v13.0.2 (3fcdbc5a) + grafana.app/updatedBy: user:afo4ysywj3xmof + grafana.app/updatedTimestamp: '2026-06-21T13:38:04Z' +spec: + annotations: + - kind: AnnotationQuery + spec: + query: + kind: DataQuery + group: grafana + version: v0 + datasource: + name: '-- Grafana --' + spec: {} + enable: true + hide: true + iconColor: rgba(0, 211, 255, 1) + name: Annotations & Alerts + builtIn: true + cursorSync: 'Off' + editable: true + elements: + panel-1: + kind: Panel + spec: + id: 1 + title: Total packets + description: '' + links: [] + data: + kind: QueryGroup + spec: + queries: + - kind: PanelQuery + spec: + query: + kind: DataQuery + group: prometheus + version: v0 + datasource: + name: ffo50cmg1o1s0a + spec: + editorMode: builder + expr: cortexbrain_packets_total + legendFormat: __auto + range: true + refId: A + hidden: false + transformations: [] + queryOptions: {} + vizConfig: + kind: VizConfig + group: timeseries + version: 13.0.2 + spec: + options: + annotations: + clustering: -1 + multiLane: false + legend: + calcs: [] + displayMode: list + placement: bottom + showLegend: true + tooltip: + hideZeros: false + mode: single + sort: none + fieldConfig: + defaults: + thresholds: + mode: absolute + steps: + - value: 0 + color: green + - value: 80 + color: red + color: + mode: palette-classic + custom: + axisBorderShow: false + axisCenteredZero: false + axisColorMode: text + axisLabel: '' + axisPlacement: auto + barAlignment: 0 + barWidthFactor: 0.6 + drawStyle: line + fillOpacity: 0 + gradientMode: none + hideFrom: + legend: false + tooltip: false + viz: false + insertNulls: false + lineInterpolation: linear + lineWidth: 1 + pointSize: 5 + scaleDistribution: + type: linear + showPoints: auto + showValues: false + spanNulls: false + stacking: + group: A + mode: none + thresholdsStyle: + mode: 'off' + overrides: [] + panel-2: + kind: Panel + spec: + id: 2 + title: Cpu bytes allocation + description: '' + links: [] + data: + kind: QueryGroup + spec: + queries: + - kind: PanelQuery + spec: + query: + kind: DataQuery + group: prometheus + version: v0 + datasource: + name: ffo50cmg1o1s0a + spec: + editorMode: builder + expr: cortexbrain_cpu_bytes_alloc + legendFormat: __auto + range: true + refId: A + hidden: false + transformations: [] + queryOptions: {} + vizConfig: + kind: VizConfig + group: timeseries + version: 13.0.2 + spec: + options: + annotations: + clustering: -1 + multiLane: false + legend: + calcs: [] + displayMode: list + placement: bottom + showLegend: true + tooltip: + hideZeros: false + mode: single + sort: none + fieldConfig: + defaults: + thresholds: + mode: absolute + steps: + - value: 0 + color: green + - value: 80 + color: red + color: + mode: palette-classic + custom: + axisBorderShow: false + axisCenteredZero: false + axisColorMode: text + axisLabel: '' + axisPlacement: auto + barAlignment: 0 + barWidthFactor: 0.6 + drawStyle: line + fillOpacity: 25 + gradientMode: none + hideFrom: + legend: false + tooltip: false + viz: false + insertNulls: false + lineInterpolation: linear + lineWidth: 1 + pointSize: 5 + scaleDistribution: + type: linear + showPoints: auto + showValues: false + spanNulls: false + stacking: + group: A + mode: normal + thresholdsStyle: + mode: 'off' + overrides: [] + panel-3: + kind: Panel + spec: + id: 3 + title: Timestamp microseconds histograms + description: '' + links: [] + data: + kind: QueryGroup + spec: + queries: + - kind: PanelQuery + spec: + query: + kind: DataQuery + group: prometheus + version: v0 + datasource: + name: ffo50cmg1o1s0a + spec: + editorMode: builder + expr: cortexbrain_cpu_bytes_alloc + legendFormat: __auto + range: true + refId: A + hidden: false + transformations: [] + queryOptions: {} + vizConfig: + kind: VizConfig + group: bargauge + version: 13.0.2 + spec: + options: + displayMode: gradient + legend: + calcs: [] + displayMode: list + placement: bottom + showLegend: false + maxVizHeight: 300 + minVizHeight: 16 + minVizWidth: 8 + namePlacement: auto + orientation: horizontal + reduceOptions: + calcs: + - lastNotNull + fields: '' + values: false + showUnfilled: true + sizing: auto + valueMode: color + fieldConfig: + defaults: + thresholds: + mode: percentage + steps: + - value: 0 + color: green + - value: 80 + color: red + color: + mode: thresholds + fieldMinMax: false + overrides: [] + transparent: true + panel-4: + kind: Panel + spec: + id: 4 + title: Mem Alloc size + description: '' + links: [] + data: + kind: QueryGroup + spec: + queries: + - kind: PanelQuery + spec: + query: + kind: DataQuery + group: prometheus + version: v0 + datasource: + name: ffo50cmg1o1s0a + spec: + editorMode: builder + expr: cortexbrain_enter_mem_alloc + legendFormat: __auto + range: true + refId: A + hidden: false + transformations: [] + queryOptions: {} + vizConfig: + kind: VizConfig + group: timeseries + version: 13.0.2 + spec: + options: + annotations: + clustering: -1 + multiLane: false + legend: + calcs: [] + displayMode: list + placement: bottom + showLegend: true + tooltip: + hideZeros: false + mode: single + sort: none + fieldConfig: + defaults: + thresholds: + mode: absolute + steps: + - value: 0 + color: green + - value: 80 + color: red + color: + mode: palette-classic + custom: + axisBorderShow: false + axisCenteredZero: false + axisColorMode: text + axisLabel: '' + axisPlacement: auto + barAlignment: 0 + barWidthFactor: 0.6 + drawStyle: line + fillOpacity: 25 + gradientMode: none + hideFrom: + legend: false + tooltip: false + viz: false + insertNulls: false + lineInterpolation: linear + lineWidth: 1 + pointSize: 5 + scaleDistribution: + type: linear + showPoints: auto + showValues: false + spanNulls: false + stacking: + group: A + mode: normal + thresholdsStyle: + mode: 'off' + overrides: [] + panel-5: + kind: Panel + spec: + id: 5 + title: Mem alloc tot events + description: '' + links: [] + data: + kind: QueryGroup + spec: + queries: + - kind: PanelQuery + spec: + query: + kind: DataQuery + group: prometheus + version: v0 + datasource: + name: ffo50cmg1o1s0a + spec: + editorMode: builder + expr: cortexbrain_mem_alloc_events_total + legendFormat: __auto + range: true + refId: A + hidden: false + transformations: [] + queryOptions: {} + vizConfig: + kind: VizConfig + group: timeseries + version: 13.0.2 + spec: + options: + annotations: + clustering: -1 + multiLane: false + legend: + calcs: [] + displayMode: list + placement: bottom + showLegend: true + tooltip: + hideZeros: false + mode: single + sort: none + fieldConfig: + defaults: + thresholds: + mode: absolute + steps: + - value: 0 + color: green + - value: 80 + color: red + color: + mode: palette-classic + custom: + axisBorderShow: false + axisCenteredZero: false + axisColorMode: text + axisLabel: '' + axisPlacement: auto + barAlignment: 0 + barWidthFactor: 0.6 + drawStyle: line + fillOpacity: 0 + gradientMode: none + hideFrom: + legend: false + tooltip: false + viz: false + insertNulls: false + lineInterpolation: linear + lineWidth: 1 + pointSize: 5 + scaleDistribution: + type: linear + showPoints: auto + showValues: false + spanNulls: false + stacking: + group: A + mode: none + thresholdsStyle: + mode: 'off' + overrides: [] + panel-6: + kind: Panel + spec: + id: 6 + title: Scheduler Runtime (us) + description: '' + links: [] + data: + kind: QueryGroup + spec: + queries: + - kind: PanelQuery + spec: + query: + kind: DataQuery + group: prometheus + version: v0 + datasource: + name: ffo50cmg1o1s0a + spec: + editorMode: builder + expr: cortexbrain_sched_stat_runtime + legendFormat: __auto + range: true + refId: A + hidden: false + transformations: [] + queryOptions: {} + vizConfig: + kind: VizConfig + group: bargauge + version: 13.0.2 + spec: + options: + displayMode: gradient + legend: + calcs: [] + displayMode: list + placement: bottom + showLegend: false + maxVizHeight: 300 + minVizHeight: 16 + minVizWidth: 8 + namePlacement: auto + orientation: horizontal + reduceOptions: + calcs: + - lastNotNull + fields: '' + values: false + showUnfilled: true + sizing: auto + valueMode: color + fieldConfig: + defaults: + thresholds: + mode: absolute + steps: + - value: 0 + color: green + - value: 80 + color: red + color: + mode: thresholds + fieldMinMax: false + overrides: [] + transparent: true + layout: + kind: TabsLayout + spec: + tabs: + - kind: TabsLayoutTab + spec: + title: Events + layout: + kind: AutoGridLayout + spec: + maxColumnCount: 2 + columnWidthMode: standard + rowHeightMode: standard + items: + - kind: AutoGridLayoutItem + spec: + element: + kind: ElementReference + name: panel-1 + - kind: AutoGridLayoutItem + spec: + element: + kind: ElementReference + name: panel-2 + - kind: AutoGridLayoutItem + spec: + element: + kind: ElementReference + name: panel-4 + - kind: AutoGridLayoutItem + spec: + element: + kind: ElementReference + name: panel-5 + - kind: TabsLayoutTab + spec: + title: Latencies + layout: + kind: AutoGridLayout + spec: + maxColumnCount: 3 + columnWidthMode: standard + rowHeightMode: standard + items: + - kind: AutoGridLayoutItem + spec: + element: + kind: ElementReference + name: panel-6 + - kind: AutoGridLayoutItem + spec: + element: + kind: ElementReference + name: panel-3 + links: [] + liveNow: false + preload: false + tags: [] + timeSettings: + timezone: browser + from: now-6h + to: now + autoRefresh: '' + autoRefreshIntervals: + - 5s + - 10s + - 30s + - 1m + - 5m + - 15m + - 30m + - 1h + - 2h + - 1d + hideTimepicker: false + fiscalYearStartMonth: 0 + title: Dashboard + variables: + - kind: CustomVariable + spec: + name: custom0 + query: '' + current: + text: '' + value: '' + options: [] + multi: false + includeAll: false + hide: dontHide + skipUrlSync: false + allowCustomValue: true + valuesFormat: csv + - kind: QueryVariable + spec: + name: query0 + current: + text: '' + value: '' + hide: dontHide + refresh: onDashboardLoad + skipUrlSync: false + query: + kind: DataQuery + group: prometheus + version: v0 + datasource: + name: ffo50cmg1o1s0a + spec: {} + regex: '' + regexApplyTo: value + sort: disabled + options: [] + multi: false + includeAll: false + allowCustomValue: true + - kind: DatasourceVariable + spec: + name: datasource0 + pluginId: prometheus + refresh: onDashboardLoad + regex: '' + current: + text: prometheus + value: ffo50cmg1o1s0a + options: [] + multi: false + includeAll: false + hide: dontHide + skipUrlSync: false + allowCustomValue: true + preferences: + layout: + kind: AutoGridLayout + spec: + maxColumnCount: 3 + columnWidthMode: standard + rowHeightMode: standard + items: [] From 85b28841160044bce2f6451b023f53555912bc5d Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Mon, 22 Jun 2026 22:35:59 +0200 Subject: [PATCH 10/14] [#186]: Added cpu idle metrics in kernel space --- core/src/components/metrics_tracer/src/cpu.rs | 22 ++++++++++++++++++- .../metrics_tracer/src/data_structures.rs | 14 ++++++++++++ .../src/components/metrics_tracer/src/main.rs | 6 ++--- 3 files changed, 38 insertions(+), 4 deletions(-) diff --git a/core/src/components/metrics_tracer/src/cpu.rs b/core/src/components/metrics_tracer/src/cpu.rs index ba545150..9d4b73a0 100644 --- a/core/src/components/metrics_tracer/src/cpu.rs +++ b/core/src/components/metrics_tracer/src/cpu.rs @@ -5,7 +5,7 @@ use aya_ebpf::{EbpfContext, programs::TracePointContext}; use aya_log_ebpf::info; -use crate::data_structures::{CPU_FREQUENCY, CpuFrequency}; +use crate::data_structures::{CPU_FREQUENCY, CPU_IDLE, CPU_IDLE_LAST_STATE, CpuFrequency, CpuIdle}; pub fn cpu_idle(ctx: TracePointContext) -> Result<(), i64> { let state_offset = 8; @@ -13,6 +13,26 @@ pub fn cpu_idle(ctx: TracePointContext) -> Result<(), i64> { let state: u32 = unsafe { ctx.read_at(state_offset) }?; let cpu_id: u32 = unsafe { ctx.read_at(cpu_id_offset) }?; + let map_ptr = unsafe { &raw mut CPU_IDLE_LAST_STATE }; + + // skip the data when: + // - last_state is equal to the current state + // - last_state is equal to 4294967295 or -1. This codes means that the cpu is exiting from the current state and entering a new state + let emit = match unsafe { (*map_ptr).get(&cpu_id) } { + Some(last_state) + if (*last_state == state) || (*last_state == 4294967295) || (*last_state == -1) => + { + false + } + _ => true, + }; + + if emit { + let _ = unsafe { (*map_ptr).insert(&cpu_id, &state, 0) }; + let event = CpuIdle { cpu_id, state }; + unsafe { CPU_IDLE.output(&ctx, &event, 0) }; + } + info!(&ctx, "CPU idle: State: {} cpu_id: {}", state, cpu_id); Ok(()) } diff --git a/core/src/components/metrics_tracer/src/data_structures.rs b/core/src/components/metrics_tracer/src/data_structures.rs index 91249a14..d74ef35a 100644 --- a/core/src/components/metrics_tracer/src/data_structures.rs +++ b/core/src/components/metrics_tracer/src/data_structures.rs @@ -79,6 +79,13 @@ pub struct SchedStatRuntime { pub(crate) command: [u8; 16], } +#[repr(C, packed)] +#[derive(Copy, Clone)] +pub struct CpuIdle { + pub(crate) cpu_id: u32, + pub(crate) state: u32, +} + // Map: connect-start timestamp by socket pointer #[map(name = "time_stamp_start")] pub static mut TIME_STAMP_START: HashMap<*mut core::ffi::c_void, TimeStampStartInfo> = @@ -103,3 +110,10 @@ pub static SCHED_STAT_WAIT: PerfEventArray = PerfEventArray::new( #[map(name = "sched_stat_runtime")] pub static SCHED_STAT_RUNTIME: PerfEventArray = PerfEventArray::new(0); + +#[map(name = "cpu_idle")] +pub static CPU_IDLE: PerfEventArray = PerfEventArray::new(0); + +#[map(name = "cpu_idle_last_state")] +pub static mut CPU_IDLE_LAST_STATE: HashMap = + HashMap::::with_max_entries(256, 0); diff --git a/core/src/components/metrics_tracer/src/main.rs b/core/src/components/metrics_tracer/src/main.rs index 1ff160ad..55faa453 100644 --- a/core/src/components/metrics_tracer/src/main.rs +++ b/core/src/components/metrics_tracer/src/main.rs @@ -12,12 +12,12 @@ use crate::cpu::{cpu_idle, per_cpu_bytes_alloc, sched_stat_runtime, sched_stat_w use crate::data_structures::CpuFrequency; use crate::data_structures::NET_METRICS; use crate::data_structures::{CPU_FREQUENCY, SchedStatWait}; -use crate::data_structures::{MEM_ALLOC, SCHED_STAT_RUNTIME, SCHED_STAT_WAIT}; -use crate::data_structures::{MemAlloc, SchedStatRuntime}; use crate::data_structures::{ - NetworkMetrics, TASK_COMM_LEN, TIME_STAMP_EVENTS, TIME_STAMP_START, TimeStampEvent, + CPU_IDLE, NetworkMetrics, TASK_COMM_LEN, TIME_STAMP_EVENTS, TIME_STAMP_START, TimeStampEvent, TimeStampStartInfo, }; +use crate::data_structures::{MEM_ALLOC, SCHED_STAT_RUNTIME, SCHED_STAT_WAIT}; +use crate::data_structures::{MemAlloc, SchedStatRuntime}; use crate::memory::enter_mmap; use aya_ebpf::EbpfContext; use aya_ebpf::helpers::bpf_get_current_pid_tgid; From 86c88ef7201a9cbe5325068185d321d93d28f578 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Mon, 22 Jun 2026 22:36:59 +0200 Subject: [PATCH 11/14] [#186]: added userspace consumer for the cpu idle events --- core/common/src/buffer_type.rs | 79 +++++++++++++++++++++- core/common/src/otel_metrics.rs | 23 ++++++- core/src/components/metrics/src/helpers.rs | 28 ++++++-- core/src/components/metrics/src/main.rs | 11 +++ 4 files changed, 132 insertions(+), 9 deletions(-) diff --git a/core/common/src/buffer_type.rs b/core/common/src/buffer_type.rs index 887d855a..7124e55f 100644 --- a/core/common/src/buffer_type.rs +++ b/core/common/src/buffer_type.rs @@ -174,6 +174,16 @@ pub struct SchedStatRuntime { #[cfg(feature = "monitoring-structs")] unsafe impl aya::Pod for SchedStatRuntime {} +#[cfg(feature = "monitoring-structs")] +#[repr(C, packed)] +#[derive(Clone, Copy, Zeroable)] +pub struct CpuIdle { + pub cpu_id: u32, + pub state: u32, +} +#[cfg(feature = "monitoring-structs")] +unsafe impl aya::Pod for CpuIdle {} + // docs: // This function perform a byte swap from little-endian to big-endian // It's used to reconstruct the correct IPv4 address from the u32 representation @@ -209,6 +219,8 @@ pub enum BufferType { SchedStatWait, #[cfg(feature = "monitoring-structs")] SchedStatRuntime, + #[cfg(feature = "monitoring-structs")] + CpuIdle, } #[cfg(feature = "buffer-reader")] @@ -709,6 +721,49 @@ impl BufferType { } } } + + #[cfg(feature = "monitoring-structs")] + pub async fn read_cpu_idle( + buffers: &mut [BytesMut], + tot_events: i32, + offset: i32, + exporter: &str, + metrics: Arc, + ) { + for i in offset..tot_events { + let vec_bytes = &buffers[i as usize]; + if vec_bytes.len() < std::mem::size_of::() { + error!( + "Corrupted CpuIdle data. Raw data: {}. Readed {} bytes expected {} bytes", + vec_bytes + .iter() + .map(|b| format!("{:02x}", b)) + .collect::>() + .join(" "), + vec_bytes.len(), + std::mem::size_of::() + ); + continue; + } + if vec_bytes.len() >= std::mem::size_of::() { + let cpu_idle: CpuIdle = + unsafe { std::ptr::read_unaligned(vec_bytes.as_ptr() as *const _) }; + + match exporter { + "otlp" => metrics.record_cpu_idle(&cpu_idle), + _ => continue, + } + + let cpu_id = cpu_idle.cpu_id; + let state = cpu_idle.state; + + info!( + "CpuIdle state changed - cpu_id: {}, state: {}", + cpu_id, state + ); + } + } + } } // docs: read buffer function: @@ -821,7 +876,20 @@ pub async fn read_perf_buffer>( tot_events, offset, "otlp", - metrics.clone().expect("Metric required for SchedStatRuntime"), + metrics + .clone() + .expect("Metric required for SchedStatRuntime"), + ) + .await + } + #[cfg(feature = "monitoring-structs")] + BufferType::CpuIdle => { + BufferType::read_cpu_idle( + &mut buffers, + tot_events, + offset, + "otlp", + metrics.clone().expect("Metric required for CpuIdle"), ) .await } @@ -857,6 +925,8 @@ pub enum BufferSize { SchedStatWait, #[cfg(feature = "monitoring-structs")] SchedStatRuntime, + #[cfg(feature = "monitoring-structs")] + CpuIdle, } #[cfg(feature = "buffer-reader")] impl BufferSize { @@ -880,6 +950,8 @@ impl BufferSize { BufferSize::SchedStatWait => std::mem::size_of::(), #[cfg(feature = "monitoring-structs")] BufferSize::SchedStatRuntime => std::mem::size_of::(), + #[cfg(feature = "monitoring-structs")] + BufferSize::CpuIdle => std::mem::size_of::(), } } pub fn set_buffer(&self) -> Vec { @@ -943,6 +1015,11 @@ impl BufferSize { let capacity = self.get_size() * 1024; return vec![BytesMut::with_capacity(capacity); tot_cpu]; } + #[cfg(feature = "monitoring-structs")] + BufferSize::CpuIdle => { + let capacity = self.get_size() * 1024; + return vec![BytesMut::with_capacity(capacity); tot_cpu]; + } } } } diff --git a/core/common/src/otel_metrics.rs b/core/common/src/otel_metrics.rs index 37611729..79d08b88 100644 --- a/core/common/src/otel_metrics.rs +++ b/core/common/src/otel_metrics.rs @@ -12,7 +12,8 @@ //! telemetry by process. use crate::buffer_type::{ - CpuFrequency, MemAlloc, NetworkMetrics, SchedStatRuntime, SchedStatWait, TimeStampMetrics, + CpuFrequency, CpuIdle, MemAlloc, NetworkMetrics, SchedStatRuntime, SchedStatWait, + TimeStampMetrics, }; use opentelemetry::KeyValue; use opentelemetry::metrics::{Counter, Gauge, Histogram, Meter}; @@ -55,6 +56,9 @@ pub struct Metrics { /// Observed scheduler runtime in nanoseconds (sched_stat_runtime). pub sched_stat_runtime: Gauge, + + /// Current CPU idle C-state per cpu_id, updated only on state change. + pub cpu_idle_state: Gauge, } impl Metrics { @@ -132,6 +136,12 @@ impl Metrics { .with_description("Scheduler runtime in nanoseconds from sched_stat_runtime") .build(); + // current CPU idle C-state per cpu_id + let cpu_idle_state = meter + .i64_gauge("cpu_idle_state") + .with_description("Current CPU idle C-state per cpu_id, updated only on state change") + .build(); + Self { events_total, packets_total, @@ -145,6 +155,7 @@ impl Metrics { enter_mem_alloc, sched_stat_wait, sched_stat_runtime, + cpu_idle_state, } } @@ -253,4 +264,14 @@ impl Metrics { self.sched_stat_runtime.record(m.runtime as i64, attrs); } + + /// Record a single [`CpuIdle`] event. + /// + /// Updates `cpu_idle_state` gauge to the latest C-state for the given + /// `cpu_id`. Events are only emitted by eBPF when the state changes. + pub fn record_cpu_idle(&self, m: &CpuIdle) { + let attrs = &[KeyValue::new("cpu_id", m.cpu_id as i64)]; + + self.cpu_idle_state.record(m.state as i64, attrs); + } } diff --git a/core/src/components/metrics/src/helpers.rs b/core/src/components/metrics/src/helpers.rs index 95232c53..141cad6f 100644 --- a/core/src/components/metrics/src/helpers.rs +++ b/core/src/components/metrics/src/helpers.rs @@ -66,6 +66,10 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a .remove("cpu_frequency") .expect("Cannot create cpu_frequency_perf_buffer"); + let (_cpu_idle_array, cpu_idle_perf_buffer) = maps + .remove("cpu_idle") + .expect("Cannot create cpu_idle perf buffer"); + let (_mem_alloc_array, mem_alloc_perf_buffer) = maps .remove("mem_alloc") .expect("Cannot create mem_alloc perf buffer"); @@ -82,6 +86,7 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a let net_metrics_buffers = BufferSize::NetworkMetricsEvents.set_buffer(); let time_stamp_events_buffers = BufferSize::TimeMetricsEvents.set_buffer(); let cpu_frequency_events_buffers = BufferSize::CpuFrequency.set_buffer(); + let cpu_idle_buffers = BufferSize::CpuIdle.set_buffer(); let mem_alloc_buffers = BufferSize::MemAlloc.set_buffer(); let sched_stat_wait_buffers = BufferSize::SchedStatWait.set_buffer(); let sched_stat_runtime_buffers = BufferSize::SchedStatRuntime.set_buffer(); @@ -135,18 +140,21 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a }) }; + let cpu_idle_metrics = { + let metrics = Arc::clone(&metrics); + let mut array_buffers = cpu_idle_perf_buffer; + let mut buffers = cpu_idle_buffers; + tokio::spawn(async move { + read_perf_buffer(array_buffers, buffers, BufferType::CpuIdle, Some(metrics)).await; + }) + }; + let mem_alloc_metrics = { let metrics = Arc::clone(&metrics); let mut array_buffers = mem_alloc_perf_buffer; let mut buffers = mem_alloc_buffers; tokio::spawn(async move { - read_perf_buffer( - array_buffers, - buffers, - BufferType::MemAlloc, - Some(metrics), - ) - .await; + read_perf_buffer(array_buffers, buffers, BufferType::MemAlloc, Some(metrics)).await; }) }; @@ -201,6 +209,12 @@ pub async fn event_listener(bpf_maps: BpfMapsData, meter: Meter) -> Result<(), a } } + result = cpu_idle_metrics => { + if let Err(e) = result { + error!("CpuIdle events task failed: {:?}", e); + } + } + result = mem_alloc_metrics => { if let Err(e) = result { error!("MemAlloc events task failed: {:?}", e); diff --git a/core/src/components/metrics/src/main.rs b/core/src/components/metrics/src/main.rs index a3657b26..bcf7de72 100644 --- a/core/src/components/metrics/src/main.rs +++ b/core/src/components/metrics/src/main.rs @@ -47,6 +47,7 @@ async fn main() -> Result<(), anyhow::Error> { let tcp_rev_bpf = bpf.clone(); let tcp_v6_bpf = bpf.clone(); let cpu_frequency = bpf.clone(); + let cpu_idle_bpf = bpf.clone(); let mem_alloc_bpf = bpf.clone(); let sched_stat_wait_bpf = bpf.clone(); let sched_stat_runtime_bpf = bpf.clone(); @@ -61,6 +62,7 @@ async fn main() -> Result<(), anyhow::Error> { "time_stamp_events".to_string(), "net_metrics".to_string(), "cpu_frequency".to_string(), + "cpu_idle".to_string(), "mem_alloc".to_string(), "sched_stat_wait".to_string(), "sched_stat_runtime".to_string(), @@ -105,6 +107,15 @@ async fn main() -> Result<(), anyhow::Error> { .context( "An error occurred during the execution of load_program function", )?; + load_tracepoint_program( + cpu_idle_bpf, + "trace_cpu_idle", + "power", + "cpu_idle", + ) + .context( + "An error occurred during the execution of load_program function", + )?; load_tracepoint_program( mem_alloc_bpf, "trace_enter_mmap", From f244487efbba2f9b5b774a10bbc6fe2a22568e30 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Fri, 3 Jul 2026 21:03:17 +0200 Subject: [PATCH 12/14] (example): Added docker example(only metrics). improved dashboard template --- Examples/dashboard-template.yaml | 656 ------------------ Examples/run-with-docker/docker-compose.yaml | 109 +++ .../dashboards/cortexbrain-dashboard.json | 605 ++++++++++++++++ .../provisioning/dashboards/dashboards.yml | 11 + .../provisioning/datasources/prometheus.yml | 9 + .../otel-collector-config.yaml | 34 + Examples/run-with-docker/prometheus.yml | 8 + 7 files changed, 776 insertions(+), 656 deletions(-) delete mode 100644 Examples/dashboard-template.yaml create mode 100644 Examples/run-with-docker/docker-compose.yaml create mode 100644 Examples/run-with-docker/grafana/dashboards/cortexbrain-dashboard.json create mode 100644 Examples/run-with-docker/grafana/provisioning/dashboards/dashboards.yml create mode 100644 Examples/run-with-docker/grafana/provisioning/datasources/prometheus.yml create mode 100644 Examples/run-with-docker/otel-collector-config.yaml create mode 100644 Examples/run-with-docker/prometheus.yml diff --git a/Examples/dashboard-template.yaml b/Examples/dashboard-template.yaml deleted file mode 100644 index 84b6f930..00000000 --- a/Examples/dashboard-template.yaml +++ /dev/null @@ -1,656 +0,0 @@ -apiVersion: dashboard.grafana.app/v2 -kind: Dashboard -metadata: - name: adzxtsp - namespace: default - uid: d2def6af-04c7-43d3-9f90-969b4bc414ee - resourceVersion: '1782049084952992' - generation: 32 - creationTimestamp: '2026-06-04T18:44:45Z' - labels: - grafana.app/deprecatedInternalID: '4461471636946944' - annotations: - grafana.app/createdBy: user:afo4ysywj3xmof - grafana.app/folder: '' - grafana.app/saved-from-ui: Grafana v13.0.2 (3fcdbc5a) - grafana.app/updatedBy: user:afo4ysywj3xmof - grafana.app/updatedTimestamp: '2026-06-21T13:38:04Z' -spec: - annotations: - - kind: AnnotationQuery - spec: - query: - kind: DataQuery - group: grafana - version: v0 - datasource: - name: '-- Grafana --' - spec: {} - enable: true - hide: true - iconColor: rgba(0, 211, 255, 1) - name: Annotations & Alerts - builtIn: true - cursorSync: 'Off' - editable: true - elements: - panel-1: - kind: Panel - spec: - id: 1 - title: Total packets - description: '' - links: [] - data: - kind: QueryGroup - spec: - queries: - - kind: PanelQuery - spec: - query: - kind: DataQuery - group: prometheus - version: v0 - datasource: - name: ffo50cmg1o1s0a - spec: - editorMode: builder - expr: cortexbrain_packets_total - legendFormat: __auto - range: true - refId: A - hidden: false - transformations: [] - queryOptions: {} - vizConfig: - kind: VizConfig - group: timeseries - version: 13.0.2 - spec: - options: - annotations: - clustering: -1 - multiLane: false - legend: - calcs: [] - displayMode: list - placement: bottom - showLegend: true - tooltip: - hideZeros: false - mode: single - sort: none - fieldConfig: - defaults: - thresholds: - mode: absolute - steps: - - value: 0 - color: green - - value: 80 - color: red - color: - mode: palette-classic - custom: - axisBorderShow: false - axisCenteredZero: false - axisColorMode: text - axisLabel: '' - axisPlacement: auto - barAlignment: 0 - barWidthFactor: 0.6 - drawStyle: line - fillOpacity: 0 - gradientMode: none - hideFrom: - legend: false - tooltip: false - viz: false - insertNulls: false - lineInterpolation: linear - lineWidth: 1 - pointSize: 5 - scaleDistribution: - type: linear - showPoints: auto - showValues: false - spanNulls: false - stacking: - group: A - mode: none - thresholdsStyle: - mode: 'off' - overrides: [] - panel-2: - kind: Panel - spec: - id: 2 - title: Cpu bytes allocation - description: '' - links: [] - data: - kind: QueryGroup - spec: - queries: - - kind: PanelQuery - spec: - query: - kind: DataQuery - group: prometheus - version: v0 - datasource: - name: ffo50cmg1o1s0a - spec: - editorMode: builder - expr: cortexbrain_cpu_bytes_alloc - legendFormat: __auto - range: true - refId: A - hidden: false - transformations: [] - queryOptions: {} - vizConfig: - kind: VizConfig - group: timeseries - version: 13.0.2 - spec: - options: - annotations: - clustering: -1 - multiLane: false - legend: - calcs: [] - displayMode: list - placement: bottom - showLegend: true - tooltip: - hideZeros: false - mode: single - sort: none - fieldConfig: - defaults: - thresholds: - mode: absolute - steps: - - value: 0 - color: green - - value: 80 - color: red - color: - mode: palette-classic - custom: - axisBorderShow: false - axisCenteredZero: false - axisColorMode: text - axisLabel: '' - axisPlacement: auto - barAlignment: 0 - barWidthFactor: 0.6 - drawStyle: line - fillOpacity: 25 - gradientMode: none - hideFrom: - legend: false - tooltip: false - viz: false - insertNulls: false - lineInterpolation: linear - lineWidth: 1 - pointSize: 5 - scaleDistribution: - type: linear - showPoints: auto - showValues: false - spanNulls: false - stacking: - group: A - mode: normal - thresholdsStyle: - mode: 'off' - overrides: [] - panel-3: - kind: Panel - spec: - id: 3 - title: Timestamp microseconds histograms - description: '' - links: [] - data: - kind: QueryGroup - spec: - queries: - - kind: PanelQuery - spec: - query: - kind: DataQuery - group: prometheus - version: v0 - datasource: - name: ffo50cmg1o1s0a - spec: - editorMode: builder - expr: cortexbrain_cpu_bytes_alloc - legendFormat: __auto - range: true - refId: A - hidden: false - transformations: [] - queryOptions: {} - vizConfig: - kind: VizConfig - group: bargauge - version: 13.0.2 - spec: - options: - displayMode: gradient - legend: - calcs: [] - displayMode: list - placement: bottom - showLegend: false - maxVizHeight: 300 - minVizHeight: 16 - minVizWidth: 8 - namePlacement: auto - orientation: horizontal - reduceOptions: - calcs: - - lastNotNull - fields: '' - values: false - showUnfilled: true - sizing: auto - valueMode: color - fieldConfig: - defaults: - thresholds: - mode: percentage - steps: - - value: 0 - color: green - - value: 80 - color: red - color: - mode: thresholds - fieldMinMax: false - overrides: [] - transparent: true - panel-4: - kind: Panel - spec: - id: 4 - title: Mem Alloc size - description: '' - links: [] - data: - kind: QueryGroup - spec: - queries: - - kind: PanelQuery - spec: - query: - kind: DataQuery - group: prometheus - version: v0 - datasource: - name: ffo50cmg1o1s0a - spec: - editorMode: builder - expr: cortexbrain_enter_mem_alloc - legendFormat: __auto - range: true - refId: A - hidden: false - transformations: [] - queryOptions: {} - vizConfig: - kind: VizConfig - group: timeseries - version: 13.0.2 - spec: - options: - annotations: - clustering: -1 - multiLane: false - legend: - calcs: [] - displayMode: list - placement: bottom - showLegend: true - tooltip: - hideZeros: false - mode: single - sort: none - fieldConfig: - defaults: - thresholds: - mode: absolute - steps: - - value: 0 - color: green - - value: 80 - color: red - color: - mode: palette-classic - custom: - axisBorderShow: false - axisCenteredZero: false - axisColorMode: text - axisLabel: '' - axisPlacement: auto - barAlignment: 0 - barWidthFactor: 0.6 - drawStyle: line - fillOpacity: 25 - gradientMode: none - hideFrom: - legend: false - tooltip: false - viz: false - insertNulls: false - lineInterpolation: linear - lineWidth: 1 - pointSize: 5 - scaleDistribution: - type: linear - showPoints: auto - showValues: false - spanNulls: false - stacking: - group: A - mode: normal - thresholdsStyle: - mode: 'off' - overrides: [] - panel-5: - kind: Panel - spec: - id: 5 - title: Mem alloc tot events - description: '' - links: [] - data: - kind: QueryGroup - spec: - queries: - - kind: PanelQuery - spec: - query: - kind: DataQuery - group: prometheus - version: v0 - datasource: - name: ffo50cmg1o1s0a - spec: - editorMode: builder - expr: cortexbrain_mem_alloc_events_total - legendFormat: __auto - range: true - refId: A - hidden: false - transformations: [] - queryOptions: {} - vizConfig: - kind: VizConfig - group: timeseries - version: 13.0.2 - spec: - options: - annotations: - clustering: -1 - multiLane: false - legend: - calcs: [] - displayMode: list - placement: bottom - showLegend: true - tooltip: - hideZeros: false - mode: single - sort: none - fieldConfig: - defaults: - thresholds: - mode: absolute - steps: - - value: 0 - color: green - - value: 80 - color: red - color: - mode: palette-classic - custom: - axisBorderShow: false - axisCenteredZero: false - axisColorMode: text - axisLabel: '' - axisPlacement: auto - barAlignment: 0 - barWidthFactor: 0.6 - drawStyle: line - fillOpacity: 0 - gradientMode: none - hideFrom: - legend: false - tooltip: false - viz: false - insertNulls: false - lineInterpolation: linear - lineWidth: 1 - pointSize: 5 - scaleDistribution: - type: linear - showPoints: auto - showValues: false - spanNulls: false - stacking: - group: A - mode: none - thresholdsStyle: - mode: 'off' - overrides: [] - panel-6: - kind: Panel - spec: - id: 6 - title: Scheduler Runtime (us) - description: '' - links: [] - data: - kind: QueryGroup - spec: - queries: - - kind: PanelQuery - spec: - query: - kind: DataQuery - group: prometheus - version: v0 - datasource: - name: ffo50cmg1o1s0a - spec: - editorMode: builder - expr: cortexbrain_sched_stat_runtime - legendFormat: __auto - range: true - refId: A - hidden: false - transformations: [] - queryOptions: {} - vizConfig: - kind: VizConfig - group: bargauge - version: 13.0.2 - spec: - options: - displayMode: gradient - legend: - calcs: [] - displayMode: list - placement: bottom - showLegend: false - maxVizHeight: 300 - minVizHeight: 16 - minVizWidth: 8 - namePlacement: auto - orientation: horizontal - reduceOptions: - calcs: - - lastNotNull - fields: '' - values: false - showUnfilled: true - sizing: auto - valueMode: color - fieldConfig: - defaults: - thresholds: - mode: absolute - steps: - - value: 0 - color: green - - value: 80 - color: red - color: - mode: thresholds - fieldMinMax: false - overrides: [] - transparent: true - layout: - kind: TabsLayout - spec: - tabs: - - kind: TabsLayoutTab - spec: - title: Events - layout: - kind: AutoGridLayout - spec: - maxColumnCount: 2 - columnWidthMode: standard - rowHeightMode: standard - items: - - kind: AutoGridLayoutItem - spec: - element: - kind: ElementReference - name: panel-1 - - kind: AutoGridLayoutItem - spec: - element: - kind: ElementReference - name: panel-2 - - kind: AutoGridLayoutItem - spec: - element: - kind: ElementReference - name: panel-4 - - kind: AutoGridLayoutItem - spec: - element: - kind: ElementReference - name: panel-5 - - kind: TabsLayoutTab - spec: - title: Latencies - layout: - kind: AutoGridLayout - spec: - maxColumnCount: 3 - columnWidthMode: standard - rowHeightMode: standard - items: - - kind: AutoGridLayoutItem - spec: - element: - kind: ElementReference - name: panel-6 - - kind: AutoGridLayoutItem - spec: - element: - kind: ElementReference - name: panel-3 - links: [] - liveNow: false - preload: false - tags: [] - timeSettings: - timezone: browser - from: now-6h - to: now - autoRefresh: '' - autoRefreshIntervals: - - 5s - - 10s - - 30s - - 1m - - 5m - - 15m - - 30m - - 1h - - 2h - - 1d - hideTimepicker: false - fiscalYearStartMonth: 0 - title: Dashboard - variables: - - kind: CustomVariable - spec: - name: custom0 - query: '' - current: - text: '' - value: '' - options: [] - multi: false - includeAll: false - hide: dontHide - skipUrlSync: false - allowCustomValue: true - valuesFormat: csv - - kind: QueryVariable - spec: - name: query0 - current: - text: '' - value: '' - hide: dontHide - refresh: onDashboardLoad - skipUrlSync: false - query: - kind: DataQuery - group: prometheus - version: v0 - datasource: - name: ffo50cmg1o1s0a - spec: {} - regex: '' - regexApplyTo: value - sort: disabled - options: [] - multi: false - includeAll: false - allowCustomValue: true - - kind: DatasourceVariable - spec: - name: datasource0 - pluginId: prometheus - refresh: onDashboardLoad - regex: '' - current: - text: prometheus - value: ffo50cmg1o1s0a - options: [] - multi: false - includeAll: false - hide: dontHide - skipUrlSync: false - allowCustomValue: true - preferences: - layout: - kind: AutoGridLayout - spec: - maxColumnCount: 3 - columnWidthMode: standard - rowHeightMode: standard - items: [] diff --git a/Examples/run-with-docker/docker-compose.yaml b/Examples/run-with-docker/docker-compose.yaml new file mode 100644 index 00000000..5c4cebe9 --- /dev/null +++ b/Examples/run-with-docker/docker-compose.yaml @@ -0,0 +1,109 @@ +services: + cortexflow-metrics: + image: lorenzotettamanti/cortexflow-metrics:otel-metrics-15 + container_name: cortexflow-metrics + stop_signal: SIGINT + restart: unless-stopped + privileged: true + pid: host + init: true + command: ["/usr/local/bin/cortexflow-metrics"] + environment: + - OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317 + - OTEL_EXPORTER_OTLP_PROTOCOL=grpc + volumes: + - type: bind + source: /sys/fs/bpf + target: /sys/fs/bpf + bind: + propagation: shared + - /lib/modules:/lib/modules:ro + - /sys/kernel/debug:/sys/kernel/debug + - /proc:/host/proc:ro + networks: + - cortexflow + + #cortexflow-identity: + # image: lorenzotettamanti/cortexflow-identity:0.1.2 + # container_name: cortexflow-identity + # stop_signal: SIGINT + # restart: unless-stopped + # privileged: true + # pid: host + # init: true + # command: ["/usr/local/bin/cortexflow-identity-service"] + # volumes: + # - type: bind + # source: /sys/fs/bpf + # target: /sys/fs/bpf + # bind: + # propagation: shared + # - /lib/modules:/lib/modules:ro + # - /sys/kernel/debug:/sys/kernel/debug + # - /proc:/host/proc:ro + # networks: + # - cortexflow + + otel-collector: + image: otel/opentelemetry-collector:0.95.0 + container_name: otel-collector + command: + - "--config=/conf/otel-collector-config.yaml" + ports: + - "4317:4317" + - "4318:4318" + - "8889:8889" + environment: + - GOMEMLIMIT=1600MiB + volumes: + - ./otel-collector-config.yaml:/conf/otel-collector-config.yaml:ro + networks: + - cortexflow + depends_on: + # - cortexflow-identity + - cortexflow-metrics + + prometheus: + image: prom/prometheus:v2.51.2 + container_name: prometheus + user: "65534:65534" + command: + - "--config.file=/etc/prometheus/prometheus.yml" + - "--storage.tsdb.path=/prometheus" + - "--web.enable-lifecycle" + ports: + - "9090:9090" + volumes: + - ./prometheus.yml:/etc/prometheus/prometheus.yml:ro + - prometheus-data:/prometheus + networks: + - cortexflow + depends_on: + - otel-collector + + grafana: + image: grafana/grafana:latest + container_name: grafana + user: "472:472" + ports: + - "3000:3000" + environment: + - GF_SECURITY_ADMIN_USER=admin + - GF_SECURITY_ADMIN_PASSWORD=admin + - GF_USERS_ALLOW_SIGN_UP=false + - GF_AUTH_ANONYMOUS_ENABLED=false + volumes: + - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro + - grafana-data:/var/lib/grafana + networks: + - cortexflow + depends_on: + - prometheus + +networks: + cortexflow: + +volumes: + prometheus-data: + grafana-data: diff --git a/Examples/run-with-docker/grafana/dashboards/cortexbrain-dashboard.json b/Examples/run-with-docker/grafana/dashboards/cortexbrain-dashboard.json new file mode 100644 index 00000000..89e182fb --- /dev/null +++ b/Examples/run-with-docker/grafana/dashboards/cortexbrain-dashboard.json @@ -0,0 +1,605 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 1, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "builder", + "expr": "cortexbrain_packets_total", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Total packets", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "builder", + "expr": "cortexbrain_cpu_bytes_alloc", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Cpu bytes allocation", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 8 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "builder", + "expr": "cortexbrain_enter_mem_alloc", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Mem Alloc size", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 8 + }, + "id": 5, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "hideZeros": false, + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "builder", + "expr": "cortexbrain_mem_alloc_events_total", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Mem alloc tot events", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 0, + "y": 16 + }, + "id": 6, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "builder", + "expr": "cortexbrain_sched_stat_runtime", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Scheduler Runtime (us)", + "transparent": true, + "type": "bargauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 16, + "x": 8, + "y": 16 + }, + "id": 3, + "options": { + "displayMode": "gradient", + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": false + }, + "maxVizHeight": 300, + "minVizHeight": 16, + "minVizWidth": 8, + "namePlacement": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showUnfilled": true, + "sizing": "auto", + "valueMode": "color" + }, + "pluginVersion": "10.4.2", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "builder", + "expr": "cortexbrain_cpu_bytes_alloc", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Timestamp microseconds histograms", + "transparent": true, + "type": "bargauge" + } + ], + "preload": false, + "refresh": "", + "schemaVersion": 39, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ] + }, + "timezone": "browser", + "title": "CortexBrain Dashboard", + "uid": "cortexbrain-main", + "version": 1 +} diff --git a/Examples/run-with-docker/grafana/provisioning/dashboards/dashboards.yml b/Examples/run-with-docker/grafana/provisioning/dashboards/dashboards.yml new file mode 100644 index 00000000..0bcf3d81 --- /dev/null +++ b/Examples/run-with-docker/grafana/provisioning/dashboards/dashboards.yml @@ -0,0 +1,11 @@ +apiVersion: 1 + +providers: + - name: 'default' + orgId: 1 + folder: '' + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards diff --git a/Examples/run-with-docker/grafana/provisioning/datasources/prometheus.yml b/Examples/run-with-docker/grafana/provisioning/datasources/prometheus.yml new file mode 100644 index 00000000..bb009bb2 --- /dev/null +++ b/Examples/run-with-docker/grafana/provisioning/datasources/prometheus.yml @@ -0,0 +1,9 @@ +apiVersion: 1 + +datasources: + - name: Prometheus + type: prometheus + access: proxy + url: http://prometheus:9090 + isDefault: true + editable: false diff --git a/Examples/run-with-docker/otel-collector-config.yaml b/Examples/run-with-docker/otel-collector-config.yaml new file mode 100644 index 00000000..91cae017 --- /dev/null +++ b/Examples/run-with-docker/otel-collector-config.yaml @@ -0,0 +1,34 @@ +receivers: + otlp: + protocols: + grpc: + endpoint: 0.0.0.0:4317 + http: + endpoint: 0.0.0.0:4318 + +processors: + memory_limiter: + limit_mib: 1500 + spike_limit_mib: 512 + check_interval: 5s + +exporters: + logging: {} + prometheus: + endpoint: 0.0.0.0:8889 + namespace: cortexbrain + +service: + pipelines: + traces: + receivers: [otlp] + processors: [memory_limiter] + exporters: [logging] + logs: + receivers: [otlp] + processors: [memory_limiter] + exporters: [logging] + metrics: + receivers: [otlp] + processors: [memory_limiter] + exporters: [logging, prometheus] diff --git a/Examples/run-with-docker/prometheus.yml b/Examples/run-with-docker/prometheus.yml new file mode 100644 index 00000000..faeda702 --- /dev/null +++ b/Examples/run-with-docker/prometheus.yml @@ -0,0 +1,8 @@ +global: + scrape_interval: 5s + evaluation_interval: 5s + +scrape_configs: + - job_name: "otel-collector" + static_configs: + - targets: ["otel-collector:8889"] From 46cdb15882f2f8a31eb5de5b04c24a116c0a248c Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Fri, 3 Jul 2026 21:20:14 +0200 Subject: [PATCH 13/14] [#186]: fixed typo in cpu.rs --- core/src/components/metrics_tracer/src/cpu.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/core/src/components/metrics_tracer/src/cpu.rs b/core/src/components/metrics_tracer/src/cpu.rs index 9d4b73a0..374ea36a 100644 --- a/core/src/components/metrics_tracer/src/cpu.rs +++ b/core/src/components/metrics_tracer/src/cpu.rs @@ -19,11 +19,7 @@ pub fn cpu_idle(ctx: TracePointContext) -> Result<(), i64> { // - last_state is equal to the current state // - last_state is equal to 4294967295 or -1. This codes means that the cpu is exiting from the current state and entering a new state let emit = match unsafe { (*map_ptr).get(&cpu_id) } { - Some(last_state) - if (*last_state == state) || (*last_state == 4294967295) || (*last_state == -1) => - { - false - } + Some(last_state) if (*last_state == state) || (*last_state == 4294967295) => false, _ => true, }; From 845c5c417a5e5ce60a64d691f699d799cac7ddf5 Mon Sep 17 00:00:00 2001 From: LorenzoTettamanti Date: Fri, 3 Jul 2026 21:20:40 +0200 Subject: [PATCH 14/14] (chores): updated metrics image --- Examples/run-with-docker/docker-compose.yaml | 2 +- core/src/testing/metrics.yaml | 9 ++++----- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/Examples/run-with-docker/docker-compose.yaml b/Examples/run-with-docker/docker-compose.yaml index 5c4cebe9..a77444cc 100644 --- a/Examples/run-with-docker/docker-compose.yaml +++ b/Examples/run-with-docker/docker-compose.yaml @@ -1,6 +1,6 @@ services: cortexflow-metrics: - image: lorenzotettamanti/cortexflow-metrics:otel-metrics-15 + image: lorenzotettamanti/cortexflow-metrics:otel-test-15 container_name: cortexflow-metrics stop_signal: SIGINT restart: unless-stopped diff --git a/core/src/testing/metrics.yaml b/core/src/testing/metrics.yaml index ce797950..7e748fd6 100644 --- a/core/src/testing/metrics.yaml +++ b/core/src/testing/metrics.yaml @@ -19,7 +19,7 @@ spec: hostNetwork: true containers: - name: metrics - image: lorenzotettamanti/cortexflow-metrics:otel-test-5 + image: lorenzotettamanti/cortexflow-metrics:otel-test-15 command: ["/bin/bash", "-c"] args: - | @@ -62,7 +62,7 @@ spec: - SYS_PTRACE - name: bpftool-control-manager image: danielpacak/bpftool-runner:latest - command: ["/bin/bash", "-c","sleep infinity"] + command: ["/bin/bash", "-c", "sleep infinity"] volumeMounts: - name: bpf mountPath: /sys/fs/bpf @@ -103,7 +103,6 @@ spec: - name: tracefs hostPath: - path: /sys/kernel/debug - type: Directory + path: /sys/kernel/debug + type: Directory - \ No newline at end of file