Skip to content

Commit 28d2d7e

Browse files
committed
implement sched affinity syscalls
1 parent a8aef04 commit 28d2d7e

File tree

5 files changed

+126
-32
lines changed

5 files changed

+126
-32
lines changed

etc/syscalls_linux_aarch64.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,8 +122,8 @@
122122
| 0x77 (119) | sched_setscheduler | (pid_t pid, int policy, struct sched_param *param) | __arm64_sys_sched_setscheduler | false |
123123
| 0x78 (120) | sched_getscheduler | (pid_t pid) | __arm64_sys_sched_getscheduler | false |
124124
| 0x79 (121) | sched_getparam | (pid_t pid, struct sched_param *param) | __arm64_sys_sched_getparam | false |
125-
| 0x7a (122) | sched_setaffinity | (pid_t pid, unsigned int len, unsigned long *user_mask_ptr) | __arm64_sys_sched_setaffinity | false |
126-
| 0x7b (123) | sched_getaffinity | (pid_t pid, unsigned int len, unsigned long *user_mask_ptr) | __arm64_sys_sched_getaffinity | error |
125+
| 0x7a (122) | sched_setaffinity | (pid_t pid, unsigned int len, unsigned long *user_mask_ptr) | __arm64_sys_sched_setaffinity | true |
126+
| 0x7b (123) | sched_getaffinity | (pid_t pid, unsigned int len, unsigned long *user_mask_ptr) | __arm64_sys_sched_getaffinity | true |
127127
| 0x7c (124) | sched_yield | () | __arm64_sys_sched_yield | true |
128128
| 0x7d (125) | sched_get_priority_max | (int policy) | __arm64_sys_sched_get_priority_max | false |
129129
| 0x7e (126) | sched_get_priority_min | (int policy) | __arm64_sys_sched_get_priority_min | false |

src/arch/arm64/exceptions/syscall.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,12 @@ use crate::{
9797
},
9898
threading::{futex::sys_futex, sys_set_robust_list, sys_set_tid_address},
9999
},
100-
sched::{self, current::current_task, sched_task::state::TaskState, sys_sched_yield},
100+
sched::{
101+
self,
102+
current::current_task,
103+
sched_task::state::TaskState,
104+
syscalls::{sys_sched_getaffinity, sys_sched_setaffinity, sys_sched_yield},
105+
},
101106
};
102107
use alloc::boxed::Box;
103108
use libkernel::{
@@ -455,7 +460,8 @@ pub async fn handle_syscall() {
455460
)
456461
.await
457462
}
458-
0x7b => Err(KernelError::NotSupported),
463+
0x7a => sys_sched_setaffinity(arg1 as _, arg2 as _, TUA::from_value(arg3 as _)).await,
464+
0x7b => sys_sched_getaffinity(arg1 as _, arg2 as _, TUA::from_value(arg3 as _)).await,
459465
0x7c => sys_sched_yield(),
460466
0x81 => sys_kill(arg1 as _, arg2.into()),
461467
0x82 => sys_tkill(arg1 as _, arg2.into()),

src/sched/mod.rs

Lines changed: 41 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,14 @@ use crate::drivers::timer::now;
44
use crate::interrupts::cpu_messenger::{Message, message_cpu};
55
use crate::kernel::cpu_id::CpuId;
66
use crate::process::owned::OwnedTask;
7+
use crate::sched::sched_task::{CPU_MASK_SIZE, CpuMask};
78
use crate::{per_cpu_private, per_cpu_shared, process::TASK_LIST};
89
use alloc::{boxed::Box, sync::Arc, vec::Vec};
910
use core::fmt::Debug;
1011
use core::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
1112
use core::task::Waker;
1213
use core::time::Duration;
1314
use current::{CUR_TASK_PTR, current_task};
14-
use libkernel::error::Result;
1515
use log::warn;
1616
use runqueue::RunQueue;
1717
use sched_task::{RunnableTask, Work};
@@ -20,6 +20,7 @@ use waker::create_waker;
2020
pub mod current;
2121
mod runqueue;
2222
pub mod sched_task;
23+
pub mod syscalls;
2324
pub mod uspc_ret;
2425
pub mod waker;
2526

@@ -135,21 +136,30 @@ pub fn spawn_kernel_work(fut: impl Future<Output = ()> + 'static + Send) {
135136
}
136137

137138
#[cfg(feature = "smp")]
138-
fn get_best_cpu() -> CpuId {
139+
fn get_best_cpu(cpu_mask: CpuMask) -> CpuId {
139140
let r = 0..ArchImpl::cpu_count();
140-
r.min_by(|&x, &y| {
141-
// TODO: Find a way to calculate already assigned affinities and account for that
142-
let info_x = SHARED_SCHED_STATE.get_by_cpu(x);
143-
let info_y = SHARED_SCHED_STATE.get_by_cpu(y);
144-
let weight_x = info_x.total_runq_weight.load(Ordering::Relaxed);
145-
let weight_y = info_y.total_runq_weight.load(Ordering::Relaxed);
146-
weight_x.cmp(&weight_y)
147-
})
148-
.map(CpuId::from_value)
149-
.unwrap_or_else(|| {
150-
warn!("No CPUs found when trying to get best CPU! Defaulting to CPU 0");
151-
CpuId::from_value(0)
152-
})
141+
r.enumerate()
142+
// Filter to only CPUs in the mask
143+
.filter(|(i, _)| {
144+
let byte_index = i / 8;
145+
let bit_index = i % 8;
146+
(cpu_mask[byte_index] & (1 << bit_index)) != 0
147+
})
148+
.map(|(_, cpu_id)| cpu_id)
149+
// Find optimal CPU based on least run queue weight
150+
.min_by(|&x, &y| {
151+
// TODO: Find a way to calculate already assigned affinities and account for that
152+
let info_x = SHARED_SCHED_STATE.get_by_cpu(x);
153+
let info_y = SHARED_SCHED_STATE.get_by_cpu(y);
154+
let weight_x = info_x.total_runq_weight.load(Ordering::Relaxed);
155+
let weight_y = info_y.total_runq_weight.load(Ordering::Relaxed);
156+
weight_x.cmp(&weight_y)
157+
})
158+
.map(CpuId::from_value)
159+
.unwrap_or_else(|| {
160+
warn!("No CPUs found when trying to get best CPU! Defaulting to CPU 0");
161+
CpuId::from_value(0)
162+
})
153163
}
154164

155165
/// Insert the given task onto a CPU's run queue.
@@ -159,17 +169,28 @@ pub fn insert_work(work: Arc<Work>) {
159169

160170
#[cfg(feature = "smp")]
161171
pub fn insert_work_cross_cpu(work: Arc<Work>) {
162-
let last_cpu = work
163-
.sched_data
164-
.lock_save_irq()
172+
let sched_data = work.sched_data.lock_save_irq();
173+
let last_cpu = sched_data
165174
.as_ref()
166175
.map(|s| s.last_cpu)
167176
.unwrap_or(usize::MAX);
177+
let mask = sched_data
178+
.as_ref()
179+
.map(|s| s.cpu_mask)
180+
.unwrap_or([u8::MAX; CPU_MASK_SIZE]);
168181
let cpu = if last_cpu == usize::MAX {
169-
get_best_cpu()
182+
get_best_cpu(mask)
170183
} else {
171-
CpuId::from_value(last_cpu)
184+
// Check if the last CPU is still in the affinity mask, and if so, prefer it to improve cache locality.
185+
let byte_index = last_cpu / 8;
186+
let bit_index = last_cpu % 8;
187+
if (mask[byte_index] & (1 << bit_index)) != 0 {
188+
CpuId::from_value(last_cpu)
189+
} else {
190+
get_best_cpu(mask)
191+
}
172192
};
193+
drop(sched_data);
173194
if cpu == CpuId::this() {
174195
SCHED_STATE.borrow_mut().run_q.add_work(work);
175196
} else {
@@ -264,11 +285,6 @@ pub fn sched_init_secondary() {
264285
schedule();
265286
}
266287

267-
pub fn sys_sched_yield() -> Result<usize> {
268-
schedule();
269-
Ok(0)
270-
}
271-
272288
pub fn current_work() -> Arc<Work> {
273289
SCHED_STATE.borrow().run_q.current().task.clone()
274290
}

src/sched/sched_task/mod.rs

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,8 @@ pub struct Work {
2323
}
2424

2525
pub const NR_CPUS: usize = 256;
26-
pub const CPU_MASK_SIZE: usize = NR_CPUS / 64;
26+
pub const CPU_MASK_SIZE: usize = NR_CPUS / 8;
27+
pub type CpuMask = [u8; CPU_MASK_SIZE];
2728

2829
#[derive(Clone)]
2930
pub struct SchedulerData {
@@ -36,7 +37,7 @@ pub struct SchedulerData {
3637
pub deadline: Option<Instant>,
3738
pub last_run: Option<Instant>,
3839
pub last_cpu: usize,
39-
pub cpu_mask: [u64; CPU_MASK_SIZE],
40+
pub cpu_mask: CpuMask,
4041
}
4142

4243
impl SchedulerData {
@@ -49,7 +50,7 @@ impl SchedulerData {
4950
deadline: None,
5051
last_run: None,
5152
last_cpu: usize::MAX,
52-
cpu_mask: [u64::MAX; CPU_MASK_SIZE],
53+
cpu_mask: [u8::MAX; CPU_MASK_SIZE],
5354
}
5455
}
5556
}

src/sched/syscalls/mod.rs

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
use crate::arch::{Arch, ArchImpl};
2+
use crate::memory::uaccess::{copy_from_user_slice, copy_to_user_slice};
3+
use crate::process::thread_group::pid::PidT;
4+
use crate::sched::sched_task::CPU_MASK_SIZE;
5+
use crate::sched::{current_work, schedule};
6+
use alloc::vec;
7+
use libkernel::memory::address::UA;
8+
9+
/// `sched_yield(2)`: voluntarily give up the CPU.
///
/// Runs one pass of the scheduler and always reports success (`0`) back to
/// userspace, matching Linux semantics for this syscall.
pub fn sys_sched_yield() -> libkernel::error::Result<usize> {
    schedule();
    Ok(0)
}
13+
14+
pub async fn sys_sched_getaffinity(
15+
pid: PidT,
16+
size: usize,
17+
mask: UA,
18+
) -> libkernel::error::Result<usize> {
19+
let task = if pid == 0 {
20+
current_work()
21+
} else {
22+
// TODO: Support getting affinity of other tasks if PERM_NICE
23+
return Err(libkernel::error::KernelError::InvalidValue);
24+
};
25+
let cpu_mask = {
26+
let sched_data = task.sched_data.lock_save_irq();
27+
sched_data.as_ref().unwrap().cpu_mask
28+
};
29+
let mut cpu_mask: &[u8] = &cpu_mask;
30+
if CPU_MASK_SIZE > size {
31+
cpu_mask = &cpu_mask[..size];
32+
}
33+
copy_to_user_slice(cpu_mask, mask).await?;
34+
Ok(cpu_mask.len())
35+
}
36+
37+
pub async fn sys_sched_setaffinity(
38+
pid: PidT,
39+
size: usize,
40+
mask: UA,
41+
) -> libkernel::error::Result<usize> {
42+
let mut cpu_set = vec![0u8; size];
43+
copy_from_user_slice(mask, cpu_set.as_mut_slice()).await?;
44+
let task = if pid == 0 {
45+
current_work()
46+
} else {
47+
// TODO: Support setting affinity of other tasks if PERM_NICE
48+
return Err(libkernel::error::KernelError::InvalidValue);
49+
};
50+
let mut sched_data = task.sched_data.lock_save_irq();
51+
if CPU_MASK_SIZE > size {
52+
return Err(libkernel::error::KernelError::InvalidValue);
53+
}
54+
cpu_set.truncate(CPU_MASK_SIZE);
55+
// Check if this turns off all CPUs, which is not allowed.
56+
let mut any_true = false;
57+
for i in 0..ArchImpl::cpu_count() {
58+
let byte_index = i / 8;
59+
let bit_index = i % 8;
60+
if (cpu_set[byte_index] & (1 << bit_index)) != 0 {
61+
any_true = true;
62+
break;
63+
}
64+
}
65+
if !any_true {
66+
return Err(libkernel::error::KernelError::InvalidValue);
67+
}
68+
sched_data.as_mut().unwrap().cpu_mask = cpu_set.try_into().unwrap();
69+
// TODO: apply the new affinity immediately if the current CPU is no longer in the set
70+
Ok(0)
71+
}

0 commit comments

Comments
 (0)