Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 25 additions & 0 deletions doc/ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,31 @@ Change Log

All relevant changes are documented in this file.

[Unreleased]
------------

### Changes

- Restart log now spells out the signal name and flags core dumps,
e.g. `killed by SIGKILL` or `killed by SIGSEGV, core dumped`, in
place of the bare numeric `by signal: N`. Gives operators a much
stronger breadcrumb when a daemon dies unexpectedly

### Fixes

- Remove stale daemon-owned (`pid:!`) pidfiles after unclean exits.
When a daemon dies via SIGKILL/OOM/segfault, or exits early during
startup, its pidfile lingers and can prevent the next instance from
starting -- `dbus-daemon` for example refuses to start when its
pidfile already exists, so Finit's restart loop gives up. Finit now
drops the file when it still names the just-reaped PID and that PID
is no longer alive (the liveness check guards against PID reuse).
This is a controlled exception to the long-standing rule that Finit
does not touch service-owned pidfiles
- Fix misspelled `SIGUNKOWN` returned by `sig_name()` for unknown
signal numbers, now spelled correctly as `SIGUNKNOWN`. Surfaced by
the new restart log above

[4.17][] - 2026-04-28
---------------------

Expand Down
8 changes: 8 additions & 0 deletions doc/config/services.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,14 @@ basename of the binary to guess the PID file to watch for the PID:
`/var/run/ntpd.pid`. If Finit guesses wrong, you have to submit the
full `pid:!/path/to/file.pid`.

With `pid:!/path`, the file belongs to the service: Finit reads it
but does not create or remove it. The one exception is *stale*
cleanup — if the service dies without removing its own pidfile
(SIGKILL, OOM, segfault), and the file still names the just-reaped
PID, Finit removes it before the next retry. This prevents daemons
that refuse to start on an existing pidfile (e.g. `dbus-daemon`)
from getting stuck in a crash-restart loop.

**Example:**

In the case of `ospfd` (below), we omit the `-d` flag (daemonize) to
Expand Down
5 changes: 4 additions & 1 deletion src/pid.c
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,10 @@ int pid_file_set(svc_t *svc, char *file, int not)
* pid:!foo --> !/run/foo.pid
* pid:!/run/foo.pid --> !/run/foo.pid
*
* Note, nothing is created or removed by Finit in this latter form.
* Nothing is created or removed by Finit in this latter form, with one
* exception: a verifiably stale pidfile (still names the just-reaped
* PID, and that PID is no longer alive) is removed so the next instance
* can start. See service_clean_pidfile() in src/service.c.
*/
int pid_file_parse(svc_t *svc, char *arg)
{
Expand Down
55 changes: 47 additions & 8 deletions src/service.c
Original file line number Diff line number Diff line change
Expand Up @@ -1120,6 +1120,35 @@ static void service_notify_stop(svc_t *svc)
}
}

/*
* Drop a daemon-owned (pid:!) pidfile if it still names the just-reaped
* PID and that PID is gone. The liveness check guards against reuse.
*/
static void service_clean_pidfile(svc_t *svc, pid_t reaped)
Comment thread
troglobit marked this conversation as resolved.
{
pid_t pid;
char *fn;

if (reaped <= 1)
return;

fn = pid_file(svc);
Comment thread
troglobit marked this conversation as resolved.
if (!fn)
return;

pid = pid_file_read(fn);
if (pid != reaped || pid_alive(pid))
return;

if (remove(fn) && errno != ENOENT) {
logit(LOG_CRIT, "Failed removing stale service %s pidfile %s",
svc_ident(svc, NULL, 0), fn);
return;
}

dbg("Removed stale service %s pidfile %s", svc_ident(svc, NULL, 0), fn);
}

/*
* Clean up any lingering state from dead/killed services
*/
Expand All @@ -1137,6 +1166,8 @@ static void service_cleanup(svc_t *svc)
if (remove(fn) && errno != ENOENT)
logit(LOG_CRIT, "Failed removing service %s pidfile %s",
svc_ident(svc, NULL, 0), fn);
} else if (svc->pidfile[0] == '!') {
service_clean_pidfile(svc, svc->pid);
}

/*
Expand Down Expand Up @@ -2405,7 +2436,10 @@ void service_monitor(pid_t lost, int status)
if (svc_is_forking(svc)) {
/* Likely start script exiting */
if (svc_is_starting(svc)) {
svc->pid = 0; /* Expect no more activity from this one */
/* Daemon died before clearing 'starting'; drop any stale pidfile. */
service_clean_pidfile(svc, lost);
Comment thread
troglobit marked this conversation as resolved.
svc->oldpid = lost; /* So service_retry() logs the real PID */
svc->pid = 0; /* Expect no more activity from this one */
goto cont;
}

Expand Down Expand Up @@ -2794,13 +2828,18 @@ static void service_retry(svc_t *svc)
timeout = ((*restart_cnt) <= (svc->restart_max / 2)) ? 2000 : 5000;
/* If a longer timeout was specified in the conf, use that instead. */
svc->restart_tmo = max(svc->restart_tmo, timeout);
logit(LOG_CONSOLE|LOG_WARNING, "Service %s[%d] died (%s%d), restarting (retry in %d msec) (attempt: %d/%d)",
svc_ident(svc, NULL, 0), svc->oldpid,
WIFEXITED(svc->status) ? "with exit status: " : "by signal: ",
WIFEXITED(svc->status) ? WEXITSTATUS(svc->status) : WTERMSIG(svc->status),
svc->restart_tmo,
*restart_cnt,
svc->restart_max);
if (WIFEXITED(svc->status))
logit(LOG_CONSOLE|LOG_WARNING,
"Service %s[%d] died (exit status: %d), restarting (retry in %d msec) (attempt: %d/%d)",
svc_ident(svc, NULL, 0), svc->oldpid, WEXITSTATUS(svc->status),
svc->restart_tmo, *restart_cnt, svc->restart_max);
else
logit(LOG_CONSOLE|LOG_WARNING,
"Service %s[%d] died (killed by %s%s), restarting (retry in %d msec) (attempt: %d/%d)",
svc_ident(svc, NULL, 0), svc->oldpid,
sig_name(WTERMSIG(svc->status)),
WCOREDUMP(svc->status) ? ", core dumped" : "",
svc->restart_tmo, *restart_cnt, svc->restart_max);
Comment thread
troglobit marked this conversation as resolved.

svc_unblock(svc);
service_step(svc);
Expand Down
2 changes: 1 addition & 1 deletion src/sig.c
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,7 @@ const char *sig_name(int signo)
return signames[i].name;
}

return "SIGUNKOWN";
return "SIGUNKNOWN";
}

/*
Expand Down
2 changes: 2 additions & 0 deletions test/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ EXTRA_DIST += global-envs.sh
EXTRA_DIST += initctl-status-subset.sh
EXTRA_DIST += notify.sh
EXTRA_DIST += pidfile.sh
EXTRA_DIST += stale-pidfile.sh
EXTRA_DIST += pre-post-serv.sh
EXTRA_DIST += pre-fail.sh
EXTRA_DIST += process-depends.sh
Expand Down Expand Up @@ -84,6 +85,7 @@ TESTS += global-envs.sh
TESTS += initctl-status-subset.sh
TESTS += notify.sh
TESTS += pidfile.sh
TESTS += stale-pidfile.sh
TESTS += pre-post-serv.sh
TESTS += pre-fail.sh
TESTS += process-depends.sh
Expand Down
19 changes: 16 additions & 3 deletions test/src/serv.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ volatile sig_atomic_t reloading = 1;
volatile sig_atomic_t running = 1;
static char *ident = PROGNM;
static char fn[80];
static int exclusive = 0; /* -x: refuse to start if pidfile exists (dbus-style) */

static void verify_env(char *arg)
{
Expand Down Expand Up @@ -122,7 +123,7 @@ static void writefn(char *fn, int val)

static int checkfn(char *fn)
{
return !access(fn, R_OK);
return !access(fn, F_OK);
}

static void mine(char *fn)
Expand All @@ -133,12 +134,18 @@ static void mine(char *fn)

static void pidfile(char *pidfn)
{
static int once = 0;

if (!pidfn) {
if (fn[0] == 0)
snprintf(fn, sizeof(fn), "%s%s.pid", _PATH_VARRUN, ident);
pidfn = fn;
}

if (exclusive && !once && checkfn(pidfn))
errx(EX_SOFTWARE, "PID file %s exists, refusing to start", pidfn);
once = 1;
Comment thread
troglobit marked this conversation as resolved.

if (!checkfn(pidfn)) {
pid_t pid;

Expand All @@ -148,7 +155,7 @@ static void pidfile(char *pidfn)
atexit(cleanup);
} else {
inf("Touching PID file %s", pidfn);
utimensat(0, fn, NULL, 0);
utimensat(AT_FDCWD, pidfn, NULL, 0);
}
}

Expand Down Expand Up @@ -182,6 +189,7 @@ static int usage(int rc)
" -p Create PID file despite running in foreground\n"
" -P FILE Create PID file using FILE\n"
" -r SVC Call initctl to restart service SVC (self)\n"
" -x Refuse to start if PID file already exists (dbus-style)\n"
"\n"
"By default this program daemonizes itself to the background, and,\n"
"when it's done setting up its signal handler(s), creates a PID file\n"
Expand Down Expand Up @@ -212,7 +220,7 @@ int main(int argc, char *argv[])
char cmd[80];
int c;

while ((c = getopt(argc, argv, "cCe:E:f:F:hi:nN:pP:r:")) != EOF) {
while ((c = getopt(argc, argv, "cCe:E:f:F:hi:nN:pP:r:x")) != EOF) {
switch (c) {
case 'c':
do_crash = 1;
Expand Down Expand Up @@ -249,12 +257,17 @@ int main(int argc, char *argv[])
break;
case 'P':
pidfn = optarg;
if ((size_t)snprintf(fn, sizeof(fn), "%s", optarg) >= sizeof(fn))
errx(EX_USAGE, "-P path too long (max %zu)", sizeof(fn) - 1);
do_pidfile++;
break;
Comment thread
troglobit marked this conversation as resolved.
case 'r':
snprintf(cmd, sizeof(cmd), "initctl restart %s", optarg);
do_restart = 1;
break;
case 'x':
exclusive = 1;
break;
default:
return usage(1);
}
Expand Down
38 changes: 38 additions & 0 deletions test/stale-pidfile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/sh
# Regression: Finit must remove a daemon-owned (pid:!) stale pidfile
# left behind by an unclean exit (SIGKILL), so the next instance can
# start. Simulates the dbus-daemon pattern: 'serv -x' refuses to
# start if its pidfile already exists.

set -eu

TEST_DIR=$(dirname "$0")
PIDFN="/run/serv.pid"

test_teardown()
{
say "Running test teardown."
run "rm -f $FINIT_CONF $PIDFN"
}

# shellcheck source=/dev/null
. "$TEST_DIR/lib/setup.sh"

say "Add service stanza '$FINIT_CONF' with pid:!$PIDFN"
run "echo 'service pid:!$PIDFN serv -np -P $PIDFN -x' > $FINIT_CONF"
run "initctl reload"

retry "assert_num_children 1 serv"
assert_is_pidfile "serv" "$PIDFN"

PID=$(texec cat "$PIDFN")
say "SIGKILL serv ($PID) -- leaves stale pidfile"
run "kill -9 $PID"

# Without the fix, 'serv -x' refuses to start on each retry until
# Finit hits restart_max and marks the service crashed. With the fix
# Finit removes the stale pidfile and the next instance comes up.
retry "assert_pidiff serv $PID"
retry "assert_num_children 1 serv"
Comment thread
troglobit marked this conversation as resolved.

sep
Loading