@@ -793,6 +793,42 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
793793 ) ;
794794 return nextJob ;
795795 }
796+ // CRITICAL FIX: If atomic completion failed, we need to check if the job was actually completed
797+ // The job completion happens BEFORE the next job reservation in the Lua script
798+ // So if it failed, the job might still be completed in Redis
799+ this . logger . debug (
800+ `Atomic completion failed for job ${ job . id } , checking if job was completed in Redis` ,
801+ ) ;
802+
803+ // Check if the job is still in processing - if not, it was completed
804+ const isStillProcessing = await this . q . isJobProcessing ( job . id ) ;
805+ if ( ! isStillProcessing ) {
806+ this . logger . debug (
807+ `Job ${ job . id } was completed in Redis despite atomic failure, group ${ job . groupId } should be unlocked` ,
808+ ) ;
809+ // Job was completed, just ensure group is unlocked for next job
810+ await this . q . complete ( job ) ;
811+ } else {
812+ this . logger . warn (
813+ `Job ${ job . id } is still in processing after atomic failure - this should not happen` ,
814+ ) ;
815+ // Fallback: complete the job normally
816+ await this . q . completeWithMetadata ( job , handlerResult , {
817+ processedOn : processedOn || Date . now ( ) ,
818+ finishedOn : finishedOn || Date . now ( ) ,
819+ attempts : job . attempts ,
820+ maxAttempts : job . maxAttempts ,
821+ } ) ;
822+ }
823+
824+ // CRITICAL: For high concurrency, add a small delay to prevent thundering herd
825+ // This reduces the chance of multiple workers hitting the same race condition
826+ if ( Math . random ( ) < 0.1 ) {
827+ // 10% chance
828+ await new Promise ( ( resolve ) =>
829+ setTimeout ( resolve , Math . random ( ) * 100 ) ,
830+ ) ;
831+ }
796832 } else {
797833 // Use completeWithMetadata for atomic completion with metadata
798834 await this . q . completeWithMetadata ( job , handlerResult , {
@@ -808,11 +844,15 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
808844
/**
 * Start monitoring for stuck worker conditions.
 * BullMQ-inspired: more aggressive monitoring for high concurrency.
 *
 * Schedules a periodic check via {@link checkForStuckConditions}. The
 * interval adapts to the worker's concurrency: stalls at high concurrency
 * have a larger blast radius, so we poll twice as often there.
 */
private startStuckDetection(): void {
  // 15s for high concurrency (>10), 30s otherwise.
  const checkInterval = this.concurrency > 10 ? 15_000 : 30_000;

  this.stuckDetectionTimer = setInterval(() => {
    // setInterval discards the callback's return value, so an `async`
    // callback whose promise rejects would surface as an unhandled
    // promise rejection (fatal by default in modern Node). Catch and
    // log instead of letting a monitoring failure take the worker down.
    void this.checkForStuckConditions().catch((err: unknown) => {
      const msg = err instanceof Error ? err.message : String(err);
      this.logger.warn(`Stuck-condition check failed: ${msg}`);
    });
  }, checkInterval);
}
817857
818858 /**
@@ -900,6 +940,7 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
900940
901941 /**
902942 * Check if worker appears to be stuck
943+ * BullMQ-inspired: More sophisticated monitoring for high concurrency
903944 */
904945 private async checkForStuckConditions ( ) : Promise < void > {
905946 if ( this . stopping || this . closed ) return ;
@@ -909,33 +950,54 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
909950 const timeSinceLastJob =
910951 this . lastJobPickupTime > 0 ? now - this . lastJobPickupTime : now ;
911952
953+ // BullMQ-inspired: Adaptive thresholds based on concurrency
954+ const activityThreshold = this . concurrency > 10 ? 30000 : 60000 ; // 30s for high concurrency, 60s otherwise
955+ const emptyReservesThreshold = this . concurrency > 10 ? 25 : 50 ; // Lower threshold for high concurrency
956+ const jobStarvationThreshold = this . concurrency > 10 ? 60000 : 120000 ; // 1min for high concurrency, 2min otherwise
957+
912958 // Check for stuck conditions
913- if ( timeSinceLastActivity > 60000 ) {
914- // 1 minute without any activity
959+ if ( timeSinceLastActivity > activityThreshold ) {
960+ // No activity for threshold time
915961 this . logger . warn (
916- `STUCK WORKER ALERT: No activity for ${ Math . round ( timeSinceLastActivity / 1000 ) } s` ,
962+ `STUCK WORKER ALERT: No activity for ${ Math . round ( timeSinceLastActivity / 1000 ) } s (concurrency: ${ this . concurrency } ) ` ,
917963 ) ;
918964 await this . logWorkerStatus ( ) ;
919965 }
920966
921967 if (
922- this . blockingStats . consecutiveEmptyReserves > 50 &&
968+ this . blockingStats . consecutiveEmptyReserves > emptyReservesThreshold &&
923969 this . shouldWarnAboutEmptyReserves ( )
924970 ) {
925971 // Too many empty reserves (but queue might have jobs)
926972 this . logger . warn (
927- `BLOCKING ALERT: ${ this . blockingStats . consecutiveEmptyReserves } consecutive empty reserves` ,
973+ `BLOCKING ALERT: ${ this . blockingStats . consecutiveEmptyReserves } consecutive empty reserves (threshold: ${ emptyReservesThreshold } ) ` ,
928974 ) ;
929975 await this . logWorkerStatus ( ) ;
930976 }
931977
932- if ( timeSinceLastJob > 120000 && this . totalJobsProcessed > 0 ) {
933- // 2 minutes since last job (but has processed jobs before)
978+ if (
979+ timeSinceLastJob > jobStarvationThreshold &&
980+ this . totalJobsProcessed > 0
981+ ) {
982+ // No jobs for threshold time (but has processed jobs before)
934983 this . logger . warn (
935- `JOB STARVATION ALERT: No jobs for ${ Math . round ( timeSinceLastJob / 1000 ) } s` ,
984+ `JOB STARVATION ALERT: No jobs for ${ Math . round ( timeSinceLastJob / 1000 ) } s (concurrency: ${ this . concurrency } ) ` ,
936985 ) ;
937986 await this . logWorkerStatus ( ) ;
938987 }
988+
989+ // BullMQ-inspired: Check for heartbeat failures in high concurrency
990+ if ( this . concurrency > 10 && this . jobsInProgress . size > 0 ) {
991+ const longRunningJobs = Array . from ( this . jobsInProgress ) . filter (
992+ ( item ) => now - item . ts > ( this . q . jobTimeoutMs || 30000 ) / 2 ,
993+ ) ;
994+
995+ if ( longRunningJobs . length > 0 ) {
996+ this . logger . warn (
997+ `HEARTBEAT ALERT: ${ longRunningJobs . length } jobs running longer than half timeout (${ this . q . jobTimeoutMs || 30000 } ms) - check for event loop blocking` ,
998+ ) ;
999+ }
1000+ }
9391001 }
9401002
9411003 /**
@@ -1177,14 +1239,14 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
11771239 let heartbeatDelayTimer : NodeJS . Timeout | undefined ;
11781240
11791241 const startHeartbeat = ( ) => {
1180- // Extend lock every jobTimeout/2 for more aggressive renewal
1242+ // BullMQ-inspired: Adaptive heartbeat interval based on concurrency
11811243 const minInterval = Math . max (
1182- this . hbMs ,
1244+ this . hbMs , // Use the worker's configured heartbeat interval
11831245 Math . floor ( ( this . q . jobTimeoutMs || 30000 ) / 2 ) ,
11841246 ) ;
11851247
11861248 this . logger . debug (
1187- `Starting heartbeat for job ${ job . id } (interval: ${ minInterval } ms)` ,
1249+ `Starting heartbeat for job ${ job . id } (interval: ${ minInterval } ms, concurrency: ${ this . concurrency } )` ,
11881250 ) ;
11891251
11901252 hbTimer = setInterval ( async ( ) => {
@@ -1221,7 +1283,7 @@ class _Worker<T = any> extends TypedEventEmitter<WorkerEvents<T>> {
12211283 } ;
12221284
12231285 try {
1224- // Smart heartbeat: only start for jobs that might actually timeout
1286+ // BullMQ-inspired: Smart heartbeat with adaptive timing
12251287 // Skip heartbeat for short jobs (< jobTimeoutMs / 3) to reduce Redis load
12261288 const jobTimeout = this . q . jobTimeoutMs || 30000 ;
12271289 const heartbeatThreshold = jobTimeout / 3 ;
0 commit comments