Skip to content
This repository was archived by the owner on Jan 27, 2026. It is now read-only.

Commit add3c1a

Browse files
authored
fix: Docker status code handling (#383)
* handle exit code explicitly
1 parent c9756c6 commit add3c1a

File tree

2 files changed

+23 additions, -81 deletions (lines changed)

2 files changed

+23 additions, -81 deletions (lines changed)

crates/worker/src/docker/docker_manager.rs

Lines changed: 5 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@ pub struct ContainerDetails {
3333
#[allow(unused)]
3434
pub image: String,
3535
pub status: Option<ContainerStateStatusEnum>,
36+
pub status_code: Option<i64>,
3637
#[allow(unused)]
3738
pub names: Vec<String>,
3839
#[allow(unused)]
@@ -454,10 +455,13 @@ impl DockerManager {
454455
) -> Result<ContainerDetails, DockerError> {
455456
debug!("Getting details for container: {}", container_id);
456457
let container = self.docker.inspect_container(container_id, None).await?;
458+
let state = container.state.clone();
459+
457460
let info = ContainerDetails {
458461
id: container.id.unwrap_or_default(),
459462
image: container.image.unwrap_or_default(),
460-
status: container.state.and_then(|s| s.status),
463+
status: state.as_ref().and_then(|s| s.status),
464+
status_code: state.as_ref().and_then(|s| s.exit_code),
461465
names: vec![container.name.unwrap_or_default()],
462466
created: container
463467
.created

crates/worker/src/docker/service.rs

Lines changed: 18 additions & 80 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@ use super::DockerState;
44
use crate::console::Console;
55
use bollard::models::ContainerStateStatusEnum;
66
use chrono::{DateTime, Utc};
7+
use log::debug;
78
use shared::models::node::GpuSpecs;
89
use shared::models::task::Task;
910
use shared::models::task::TaskState;
@@ -267,14 +268,16 @@ impl DockerService {
267268
if status.status == Some(ContainerStateStatusEnum::CREATED) && task_state_current == TaskState::FAILED {
268269
Console::info("DockerService", "Task failed, waiting for new command from manager ...");
269270
} else {
270-
let task_state_live = match status.status {
271-
Some(ContainerStateStatusEnum::RUNNING) => TaskState::RUNNING,
272-
Some(ContainerStateStatusEnum::CREATED) => TaskState::PENDING,
273-
Some(ContainerStateStatusEnum::EXITED) => TaskState::COMPLETED,
274-
Some(ContainerStateStatusEnum::DEAD) => TaskState::FAILED,
275-
Some(ContainerStateStatusEnum::PAUSED) => TaskState::PAUSED,
276-
Some(ContainerStateStatusEnum::RESTARTING) => TaskState::RESTARTING,
277-
Some(ContainerStateStatusEnum::REMOVING) => TaskState::UNKNOWN,
271+
debug!("docker container status: {:?}, status_code: {:?}", status.status, status.status_code);
272+
let task_state_live = match (status.status, status.status_code) {
273+
(Some(ContainerStateStatusEnum::RUNNING), _) => TaskState::RUNNING,
274+
(Some(ContainerStateStatusEnum::CREATED), _) => TaskState::PENDING,
275+
(Some(ContainerStateStatusEnum::EXITED), Some(0)) => TaskState::COMPLETED,
276+
(Some(ContainerStateStatusEnum::EXITED), Some(code)) if code != 0 => TaskState::FAILED,
277+
(Some(ContainerStateStatusEnum::DEAD), _) => TaskState::FAILED,
278+
(Some(ContainerStateStatusEnum::PAUSED), _) => TaskState::PAUSED,
279+
(Some(ContainerStateStatusEnum::RESTARTING), _) => TaskState::RESTARTING,
280+
(Some(ContainerStateStatusEnum::REMOVING), _) => TaskState::UNKNOWN,
278281
_ => TaskState::UNKNOWN,
279282
};
280283

@@ -346,7 +349,6 @@ impl DockerService {
346349
}
347350
}
348351
}
349-
350352
#[cfg(test)]
351353
mod tests {
352354
use super::*;
@@ -357,7 +359,7 @@ mod tests {
357359

358360
#[tokio::test]
359361
#[serial_test::serial]
360-
async fn test_docker_service() {
362+
async fn test_docker_service_basic() {
361363
let cancellation_token = CancellationToken::new();
362364
let docker_service = DockerService::new(
363365
cancellation_token.clone(),
@@ -374,7 +376,7 @@ mod tests {
374376
id: Uuid::new_v4(),
375377
env_vars: None,
376378
command: Some("sleep".to_string()),
377-
args: Some(vec!["100".to_string()]),
379+
args: Some(vec!["5".to_string()]), // Reduced sleep time
378380
state: TaskState::PENDING,
379381
created_at: Utc::now().timestamp_millis(),
380382
..Default::default()
@@ -385,84 +387,20 @@ mod tests {
385387
.state
386388
.set_current_task(Some(task_clone))
387389
.await;
388-
let task_name = task.name.to_string();
389-
assert_eq!(
390-
docker_service.state.get_current_task().await.unwrap().name,
391-
task_name
392-
);
393-
394-
tokio::spawn(async move {
395-
docker_service.run().await.unwrap();
396-
});
397-
tokio::time::sleep(Duration::from_secs(10)).await;
398-
state_clone.set_current_task(None).await;
399-
tokio::time::sleep(Duration::from_secs(10)).await;
400-
Console::info("DockerService", "Cancelling cancellation token");
401-
cancellation_token.cancel();
402-
tokio::time::sleep(Duration::from_secs(10)).await;
403-
Console::info("DockerService", "Cancelling done");
404-
}
405-
406-
#[tokio::test]
407-
#[serial_test::serial]
408-
async fn test_docker_service_idle_on_failure() {
409-
let cancellation_token = CancellationToken::new();
410-
let docker_service = DockerService::new(
411-
cancellation_token.clone(),
412-
None,
413-
Some(1024),
414-
"/tmp/com.prime.miner/metrics.sock".to_string(),
415-
None,
416-
Address::ZERO.to_string(),
417-
None,
418-
);
419-
let state = docker_service.state.clone();
420-
421-
// Create task that will fail
422-
let task = Task {
423-
image: "ubuntu:latest".to_string(),
424-
name: "test-restart".to_string(),
425-
id: Uuid::new_v4(),
426-
env_vars: None,
427-
command: Some("invalid_command".to_string()),
428-
state: TaskState::PENDING,
429-
created_at: Utc::now().timestamp_millis(),
430-
..Default::default()
431-
};
432390

433-
let task_clone = task.clone();
434-
let state_clone = docker_service.state.clone();
435-
docker_service
436-
.state
437-
.set_current_task(Some(task_clone))
438-
.await;
439-
let task_name = task.name.to_string();
440391
assert_eq!(
441392
docker_service.state.get_current_task().await.unwrap().name,
442-
task_name
393+
task.name
443394
);
395+
444396
tokio::spawn(async move {
445397
docker_service.run().await.unwrap();
446398
});
447399

448-
// Wait for initial container start
449-
tokio::time::sleep(Duration::from_secs(5)).await;
450-
451-
// Wait for container to fail and timeout period
452-
tokio::time::sleep(Duration::from_secs(20)).await;
453-
454-
// Get current task state
455-
let current_task = state.get_current_task().await.unwrap();
456-
assert_eq!(current_task.state, TaskState::FAILED);
457-
458-
// Verify new container was created after timeout
459-
let last_started = state.get_last_started().await;
460-
assert!(last_started.is_some());
461-
462-
// Cleanup
400+
// Reduced wait times
401+
tokio::time::sleep(Duration::from_secs(2)).await;
463402
state_clone.set_current_task(None).await;
464-
tokio::time::sleep(Duration::from_secs(10)).await;
403+
tokio::time::sleep(Duration::from_secs(2)).await;
465404
cancellation_token.cancel();
466-
tokio::time::sleep(Duration::from_secs(5)).await;
467405
}
468406
}

0 commit comments

Comments
 (0)