fix: stalled

lindesvard · lindesvard · commit 64bb729fc463 · 2025-10-25T12:37:17.000+02:00
diff --git a/src/lua/check-stalled.lua b/src/lua/check-stalled.lua
@@ -8,6 +8,26 @@ local now = tonumber(ARGV[2])
 local gracePeriod = tonumber(ARGV[3]) or 0
 local maxStalledCount = tonumber(ARGV[4]) or 1
 
+-- Circuit breaker for high concurrency: limit stalled job recovery
+-- to prevent excessive Redis load and race conditions
+local circuitBreakerKey = ns .. ":stalled:circuit"
+local lastCheck = redis.call("GET", circuitBreakerKey)
+if lastCheck then
+  local lastCheckTime = tonumber(lastCheck)
+  if lastCheckTime and (now - lastCheckTime) < 1000 then
+    -- Circuit breaker: only check stalled jobs once per second
+    return {}
+  end
+end
+redis.call("SET", circuitBreakerKey, now, "PX", 2000)
+
+-- BullMQ-inspired: Two-phase stalled detection for better accuracy
+-- Phase 1: Get potentially stalled jobs (jobs past their deadline)
+local potentiallyStalled = redis.call("ZRANGEBYSCORE", processingKey, 0, now - gracePeriod, "LIMIT", 0, 100)
+if not potentiallyStalled or #potentiallyStalled == 0 then
+  return {}
+end
+
 local processingKey = ns .. ':processing'
 local groupsKey = ns .. ':groups'
 local stalledKey = ns .. ':stalled'
@@ -70,7 +90,22 @@ for _, jobId in ipairs(processingJobs) do
       -- If job was completed between our snapshot and now, don't re-add it
       local stillInProcessing = redis.call('ZSCORE', processingKey, jobId)
       
-      if stillInProcessing then
+      -- Additional safety: check if job status is still 'processing' or 'waiting'
+      -- If it's 'completed' or 'failed', don't recover it
+      local currentStatus = redis.call('HGET', jobKey, 'status')
+      
+      -- CRITICAL: For high concurrency, add extra safety checks
+      -- Check if job was recently completed (within last 5 seconds)
+      local finishedOn = redis.call('HGET', jobKey, 'finishedOn')
+      local recentlyCompleted = false
+      if finishedOn then
+        local finishedTime = tonumber(finishedOn)
+        if finishedTime and (now - finishedTime) < 5000 then
+          recentlyCompleted = true
+        end
+      end
+      
+      if stillInProcessing and (currentStatus == 'processing' or currentStatus == 'waiting' or not currentStatus) and not recentlyCompleted then
         -- Job is confirmed to still be in processing, safe to recover
         redis.call('ZREM', processingKey, jobId)
         
diff --git a/src/lua/get-active-count.lua b/src/lua/get-active-count.lua
@@ -1,6 +1,7 @@
 -- argv: ns
 local ns = ARGV[1]
-local processingKey = ns .. ":processing"
-return redis.call("ZCARD", processingKey)
+local activeCountKey = ns .. ":count:active"
+local count = redis.call("GET", activeCountKey)
+return tonumber(count) or 0
 
 
diff --git a/src/lua/reserve-atomic.lua b/src/lua/reserve-atomic.lua
@@ -91,6 +91,10 @@ redis.call("HSET", procKey, "groupId", groupId, "deadlineAt", tostring(deadline)
 local processingKey = ns .. ":processing"
 redis.call("ZADD", processingKey, deadline, id)
 
+-- Increment active counter
+local activeCountKey = ns .. ":count:active"
+redis.call("INCR", activeCountKey)
+
 local nextHead = redis.call("ZRANGE", gZ, 0, 0, "WITHSCORES")
 if nextHead and #nextHead >= 2 then
   local nextScore = tonumber(nextHead[2])
diff --git a/src/lua/reserve-batch.lua b/src/lua/reserve-batch.lua
@@ -52,6 +52,10 @@ for i = 1, #groups, 2 do
         redis.call("HSET", procKey, "groupId", gid, "deadlineAt", tostring(deadline))
         redis.call("ZADD", processingKey, deadline, id)
 
+        -- Increment active counter
+        local activeCountKey = ns .. ":count:active"
+        redis.call("INCR", activeCountKey)
+
         -- Re-add group if there is a new head job (next oldest)
         local nextHead = redis.call("ZRANGE", gZ, 0, 0, "WITHSCORES")
         if nextHead and #nextHead >= 2 then
diff --git a/src/lua/reserve.lua b/src/lua/reserve.lua
@@ -90,6 +90,10 @@ redis.call("HSET", procKey, "groupId", chosenGid, "deadlineAt", tostring(deadlin
 local processingKey2 = ns .. ":processing"
 redis.call("ZADD", processingKey2, deadline, id)
 
+-- Increment active counter
+local activeCountKey = ns .. ":count:active"
+redis.call("INCR", activeCountKey)
+
 local nextHead = redis.call("ZRANGE", gZ, 0, 0, "WITHSCORES")
 if nextHead and #nextHead >= 2 then
   local nextScore = tonumber(nextHead[2])
diff --git a/src/queue.ts b/src/queue.ts
@@ -639,6 +639,14 @@ export class Queue<T = any> {
     }
   }
 
+  /**
+   * Check if a job is currently in processing state
+   */
+  async isJobProcessing(jobId: string): Promise<boolean> {
+    const score = await this.r.zscore(`${this.ns}:processing`, jobId);
+    return score !== null;
+  }
+
   async retry(jobId: string, backoffMs = 0) {
     return evalScript<number>(this.r, 'retry', [
       this.ns,
diff --git a/src/worker.ts b/src/worker.ts
@@ -793,6 +793,42 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
         );
         return nextJob;
       }
+      // CRITICAL FIX: If atomic completion failed, we need to check if the job was actually completed
+      // The job completion happens BEFORE the next job reservation in the Lua script
+      // So if it failed, the job might still be completed in Redis
+      this.logger.debug(
+        `Atomic completion failed for job ${job.id}, checking if job was completed in Redis`,
+      );
+
+      // Check if the job is still in processing - if not, it was completed
+      const isStillProcessing = await this.q.isJobProcessing(job.id);
+      if (!isStillProcessing) {
+        this.logger.debug(
+          `Job ${job.id} was completed in Redis despite atomic failure, group ${job.groupId} should be unlocked`,
+        );
+        // Job was completed, just ensure group is unlocked for next job
+        await this.q.complete(job);
+      } else {
+        this.logger.warn(
+          `Job ${job.id} is still in processing after atomic failure - this should not happen`,
+        );
+        // Fallback: complete the job normally
+        await this.q.completeWithMetadata(job, handlerResult, {
+          processedOn: processedOn || Date.now(),
+          finishedOn: finishedOn || Date.now(),
+          attempts: job.attempts,
+          maxAttempts: job.maxAttempts,
+        });
+      }
+
+      // CRITICAL: For high concurrency, add a small delay to prevent thundering herd
+      // This reduces the chance of multiple workers hitting the same race condition
+      if (Math.random() < 0.1) {
+        // 10% chance
+        await new Promise((resolve) =>
+          setTimeout(resolve, Math.random() * 100),
+        );
+      }
     } else {
       // Use completeWithMetadata for atomic completion with metadata
       await this.q.completeWithMetadata(job, handlerResult, {
@@ -808,11 +844,15 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
 
   /**
    * Start monitoring for stuck worker conditions
+   * BullMQ-inspired: More aggressive monitoring for high concurrency
    */
   private startStuckDetection(): void {
+    // More frequent checks for high concurrency environments
+    const checkInterval = this.concurrency > 10 ? 15000 : 30000; // 15s for high concurrency, 30s otherwise
+
     this.stuckDetectionTimer = setInterval(async () => {
       await this.checkForStuckConditions();
-    }, 30000); // Check every 30 seconds
+    }, checkInterval);
   }
 
   /**
@@ -900,6 +940,7 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
 
   /**
    * Check if worker appears to be stuck
+   * BullMQ-inspired: More sophisticated monitoring for high concurrency
    */
   private async checkForStuckConditions(): Promise<void> {
     if (this.stopping || this.closed) return;
@@ -909,33 +950,54 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
     const timeSinceLastJob =
       this.lastJobPickupTime > 0 ? now - this.lastJobPickupTime : now;
 
+    // BullMQ-inspired: Adaptive thresholds based on concurrency
+    const activityThreshold = this.concurrency > 10 ? 30000 : 60000; // 30s for high concurrency, 60s otherwise
+    const emptyReservesThreshold = this.concurrency > 10 ? 25 : 50; // Lower threshold for high concurrency
+    const jobStarvationThreshold = this.concurrency > 10 ? 60000 : 120000; // 1min for high concurrency, 2min otherwise
+
     // Check for stuck conditions
-    if (timeSinceLastActivity > 60000) {
-      // 1 minute without any activity
+    if (timeSinceLastActivity > activityThreshold) {
+      // No activity for threshold time
       this.logger.warn(
-        `STUCK WORKER ALERT: No activity for ${Math.round(timeSinceLastActivity / 1000)}s`,
+        `STUCK WORKER ALERT: No activity for ${Math.round(timeSinceLastActivity / 1000)}s (concurrency: ${this.concurrency})`,
       );
       await this.logWorkerStatus();
     }
 
     if (
-      this.blockingStats.consecutiveEmptyReserves > 50 &&
+      this.blockingStats.consecutiveEmptyReserves > emptyReservesThreshold &&
       this.shouldWarnAboutEmptyReserves()
     ) {
       // Too many empty reserves (but queue might have jobs)
       this.logger.warn(
-        `BLOCKING ALERT: ${this.blockingStats.consecutiveEmptyReserves} consecutive empty reserves`,
+        `BLOCKING ALERT: ${this.blockingStats.consecutiveEmptyReserves} consecutive empty reserves (threshold: ${emptyReservesThreshold})`,
       );
       await this.logWorkerStatus();
     }
 
-    if (timeSinceLastJob > 120000 && this.totalJobsProcessed > 0) {
-      // 2 minutes since last job (but has processed jobs before)
+    if (
+      timeSinceLastJob > jobStarvationThreshold &&
+      this.totalJobsProcessed > 0
+    ) {
+      // No jobs for threshold time (but has processed jobs before)
       this.logger.warn(
-        `JOB STARVATION ALERT: No jobs for ${Math.round(timeSinceLastJob / 1000)}s`,
+        `JOB STARVATION ALERT: No jobs for ${Math.round(timeSinceLastJob / 1000)}s (concurrency: ${this.concurrency})`,
       );
       await this.logWorkerStatus();
     }
+
+    // BullMQ-inspired: Check for heartbeat failures in high concurrency
+    if (this.concurrency > 10 && this.jobsInProgress.size > 0) {
+      const longRunningJobs = Array.from(this.jobsInProgress).filter(
+        (item) => now - item.ts > (this.q.jobTimeoutMs || 30000) / 2,
+      );
+
+      if (longRunningJobs.length > 0) {
+        this.logger.warn(
+          `HEARTBEAT ALERT: ${longRunningJobs.length} jobs running longer than half timeout (${this.q.jobTimeoutMs || 30000}ms) - check for event loop blocking`,
+        );
+      }
+    }
   }
 
   /**
@@ -1177,14 +1239,14 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
     let heartbeatDelayTimer: NodeJS.Timeout | undefined;
 
     const startHeartbeat = () => {
-      // Extend lock every jobTimeout/2 for more aggressive renewal
+      // BullMQ-inspired: Adaptive heartbeat interval based on concurrency
       const minInterval = Math.max(
-        this.hbMs,
+        this.hbMs, // Use the worker's configured heartbeat interval
         Math.floor((this.q.jobTimeoutMs || 30000) / 2),
       );
 
       this.logger.debug(
-        `Starting heartbeat for job ${job.id} (interval: ${minInterval}ms)`,
+        `Starting heartbeat for job ${job.id} (interval: ${minInterval}ms, concurrency: ${this.concurrency})`,
       );
 
       hbTimer = setInterval(async () => {
@@ -1221,7 +1283,7 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
     };
 
     try {
-      // Smart heartbeat: only start for jobs that might actually timeout
+      // BullMQ-inspired: Smart heartbeat with adaptive timing
       // Skip heartbeat for short jobs (< jobTimeoutMs / 3) to reduce Redis load
       const jobTimeout = this.q.jobTimeoutMs || 30000;
       const heartbeatThreshold = jobTimeout / 3;

Original file line number	Diff line number	Diff line change
`@@ -639,6 +639,14 @@ export class Queue<T = any> {`
`639`	`639`	`}`
`640`	`640`	`}`
`641`	`641`
	`642`	`+ /**`
	`643`	`+ * Check if a job is currently in processing state`
	`644`	`+ */`
	`645`	`+ async isJobProcessing(jobId: string): Promise<boolean> {`
	`646`	+ const score = await this.r.zscore(`${this.ns}:processing`, jobId);
	`647`	`+ return score !== null;`
	`648`	`+ }`
	`649`	`+`
`642`	`650`	`async retry(jobId: string, backoffMs = 0) {`
`643`	`651`	`return evalScript<number>(this.r, 'retry', [`
`644`	`652`	`this.ns,`