Skip to content

Commit 815a8d2

Browse files
NarayanamurtyNmaddy-kerneldev
authored and committed
powerpc/eeh: fix recursive pci_lock_rescan_remove locking in EEH event handling
The recent commit 1010b4c ("powerpc/eeh: Make EEH driver device hotplug safe") restructured the EEH driver to improve synchronization with the PCI hotplug layer. However, it inadvertently moved pci_lock_rescan_remove() outside its intended scope in eeh_handle_normal_event(), leading to broken PCI error reporting and improper EEH event triggering. Specifically, eeh_handle_normal_event() acquired pci_lock_rescan_remove() before calling eeh_pe_bus_get(), but eeh_pe_bus_get() itself attempts to acquire the same lock internally, causing nested locking and disrupting normal EEH event handling paths. This patch adds a boolean parameter do_lock to _eeh_pe_bus_get(), with two public wrappers: eeh_pe_bus_get(), which has locking enabled, and eeh_pe_bus_get_nolock(), which skips locking. Callers that already hold pci_lock_rescan_remove() now use eeh_pe_bus_get_nolock() to avoid recursive lock acquisition. Additionally, pci_lock_rescan_remove() calls are restored to the correct position—after eeh_pe_bus_get() and immediately before iterating affected PEs and devices. This ensures EEH-triggered PCI removes occur under proper bus rescan locking without recursive lock contention. The eeh_pe_loc_get() function has been split into two functions: eeh_pe_loc_get(struct eeh_pe *pe), which retrieves the location code for the given PE, and eeh_pe_loc_get_bus(struct pci_bus *bus), which retrieves the location code for the given bus. 
This resolves lockdep warnings such as: <snip> [ 84.964298] [ T928] ============================================ [ 84.964304] [ T928] WARNING: possible recursive locking detected [ 84.964311] [ T928] 6.18.0-rc3 #51 Not tainted [ 84.964315] [ T928] -------------------------------------------- [ 84.964320] [ T928] eehd/928 is trying to acquire lock: [ 84.964324] [ T928] c000000003b29d58 (pci_rescan_remove_lock){+.+.}-{3:3}, at: pci_lock_rescan_remove+0x28/0x40 [ 84.964342] [ T928] but task is already holding lock: [ 84.964347] [ T928] c000000003b29d58 (pci_rescan_remove_lock){+.+.}-{3:3}, at: pci_lock_rescan_remove+0x28/0x40 [ 84.964357] [ T928] other info that might help us debug this: [ 84.964363] [ T928] Possible unsafe locking scenario: [ 84.964367] [ T928] CPU0 [ 84.964370] [ T928] ---- [ 84.964373] [ T928] lock(pci_rescan_remove_lock); [ 84.964378] [ T928] lock(pci_rescan_remove_lock); [ 84.964383] [ T928] *** DEADLOCK *** [ 84.964388] [ T928] May be due to missing lock nesting notation [ 84.964393] [ T928] 1 lock held by eehd/928: [ 84.964397] [ T928] #0: c000000003b29d58 (pci_rescan_remove_lock){+.+.}-{3:3}, at: pci_lock_rescan_remove+0x28/0x40 [ 84.964408] [ T928] stack backtrace: [ 84.964414] [ T928] CPU: 2 UID: 0 PID: 928 Comm: eehd Not tainted 6.18.0-rc3 #51 VOLUNTARY [ 84.964417] [ T928] Hardware name: IBM,9080-HEX POWER10 (architected) 0x800200 0xf000006 of:IBM,FW1060.00 (NH1060_022) hv:phyp pSeries [ 84.964419] [ T928] Call Trace: [ 84.964420] [ T928] [c0000011a7157990] [c000000001705de4] dump_stack_lvl+0xc8/0x130 (unreliable) [ 84.964424] [ T928] [c0000011a71579d0] [c0000000002f66e0] print_deadlock_bug+0x430/0x440 [ 84.964428] [ T928] [c0000011a7157a70] [c0000000002fd0c0] __lock_acquire+0x1530/0x2d80 [ 84.964431] [ T928] [c0000011a7157ba0] [c0000000002fea54] lock_acquire+0x144/0x410 [ 84.964433] [ T928] [c0000011a7157cb0] [c0000011a7157cb0] __mutex_lock+0xf4/0x1050 [ 84.964436] [ T928] [c0000011a7157e00] [c000000000de21d8] 
pci_lock_rescan_remove+0x28/0x40 [ 84.964439] [ T928] [c0000011a7157e20] [c00000000004ed98] eeh_pe_bus_get+0x48/0xc0 [ 84.964442] [ T928] [c0000011a7157e50] [c000000000050434] eeh_handle_normal_event+0x64/0xa60 [ 84.964446] [ T928] [c0000011a7157f30] [c000000000051de8] eeh_event_handler+0xf8/0x190 [ 84.964450] [ T928] [c0000011a7157f90] [c0000000002747ac] kthread+0x16c/0x180 [ 84.964453] [ T928] [c0000011a7157fe0] [c00000000000ded8] start_kernel_thread+0x14/0x18 </snip> Fixes: 1010b4c ("powerpc/eeh: Make EEH driver device hotplug safe") Signed-off-by: Narayana Murty N <nnmlinux@linux.ibm.com> Reviewed-by: Sourabh Jain <sourabhjain@linux.ibm.com> Reviewed-by: Mahesh Salgaonkar <mahesh@linux.ibm.com> Signed-off-by: Madhavan Srinivasan <maddy@linux.ibm.com> Link: https://patch.msgid.link/20251210142559.8874-1-nnmlinux@linux.ibm.com
1 parent c0215e2 commit 815a8d2

File tree

3 files changed

+78
-9
lines changed

3 files changed

+78
-9
lines changed

arch/powerpc/include/asm/eeh.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,8 @@ void eeh_pe_dev_traverse(struct eeh_pe *root,
289289
void eeh_pe_restore_bars(struct eeh_pe *pe);
290290
const char *eeh_pe_loc_get(struct eeh_pe *pe);
291291
struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe);
292+
const char *eeh_pe_loc_get_bus(struct pci_bus *bus);
293+
struct pci_bus *eeh_pe_bus_get_nolock(struct eeh_pe *pe);
292294

293295
void eeh_show_enabled(void);
294296
int __init eeh_init(struct eeh_ops *ops);

arch/powerpc/kernel/eeh_driver.c

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -846,7 +846,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
846846

847847
pci_lock_rescan_remove();
848848

849-
bus = eeh_pe_bus_get(pe);
849+
bus = eeh_pe_bus_get_nolock(pe);
850850
if (!bus) {
851851
pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n",
852852
__func__, pe->phb->global_number, pe->addr);
@@ -886,14 +886,15 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
886886
/* Log the event */
887887
if (pe->type & EEH_PE_PHB) {
888888
pr_err("EEH: Recovering PHB#%x, location: %s\n",
889-
pe->phb->global_number, eeh_pe_loc_get(pe));
889+
pe->phb->global_number, eeh_pe_loc_get_bus(bus));
890890
} else {
891891
struct eeh_pe *phb_pe = eeh_phb_pe_get(pe->phb);
892892

893893
pr_err("EEH: Recovering PHB#%x-PE#%x\n",
894894
pe->phb->global_number, pe->addr);
895895
pr_err("EEH: PE location: %s, PHB location: %s\n",
896-
eeh_pe_loc_get(pe), eeh_pe_loc_get(phb_pe));
896+
eeh_pe_loc_get_bus(bus),
897+
eeh_pe_loc_get_bus(eeh_pe_bus_get_nolock(phb_pe)));
897898
}
898899

899900
#ifdef CONFIG_STACKTRACE
@@ -1098,7 +1099,7 @@ void eeh_handle_normal_event(struct eeh_pe *pe)
10981099
eeh_pe_state_clear(pe, EEH_PE_PRI_BUS, true);
10991100
eeh_pe_dev_mode_mark(pe, EEH_DEV_REMOVED);
11001101

1101-
bus = eeh_pe_bus_get(pe);
1102+
bus = eeh_pe_bus_get_nolock(pe);
11021103
if (bus)
11031104
pci_hp_remove_devices(bus);
11041105
else
@@ -1222,7 +1223,7 @@ void eeh_handle_special_event(void)
12221223
(phb_pe->state & EEH_PE_RECOVERING))
12231224
continue;
12241225

1225-
bus = eeh_pe_bus_get(phb_pe);
1226+
bus = eeh_pe_bus_get_nolock(phb_pe);
12261227
if (!bus) {
12271228
pr_err("%s: Cannot find PCI bus for "
12281229
"PHB#%x-PE#%x\n",

arch/powerpc/kernel/eeh_pe.c

Lines changed: 70 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -812,6 +812,24 @@ void eeh_pe_restore_bars(struct eeh_pe *pe)
812812
const char *eeh_pe_loc_get(struct eeh_pe *pe)
813813
{
814814
struct pci_bus *bus = eeh_pe_bus_get(pe);
815+
return eeh_pe_loc_get_bus(bus);
816+
}
817+
818+
/**
819+
* eeh_pe_loc_get_bus - Retrieve location code binding to the given PCI bus
820+
* @bus: PCI bus
821+
*
822+
* Retrieve the location code associated with the given PCI bus. If the bus
823+
* is a root bus, the location code is fetched from the PHB device tree node
824+
* or root port. Otherwise, the location code is obtained from the device
825+
* tree node of the upstream bridge of the bus. The function walks up the
826+
* bus hierarchy if necessary, checking each node for the appropriate
827+
* location code property ("ibm,io-base-loc-code" for root buses,
828+
* "ibm,slot-location-code" for others). If no location code is found,
829+
* returns "N/A".
830+
*/
831+
const char *eeh_pe_loc_get_bus(struct pci_bus *bus)
832+
{
815833
struct device_node *dn;
816834
const char *loc = NULL;
817835

@@ -838,16 +856,17 @@ const char *eeh_pe_loc_get(struct eeh_pe *pe)
838856
}
839857

840858
/**
841-
* eeh_pe_bus_get - Retrieve PCI bus according to the given PE
859+
* _eeh_pe_bus_get - Retrieve PCI bus according to the given PE
842860
* @pe: EEH PE
861+
* @do_lock: true to take pci_lock_rescan_remove() here; false if the caller already holds it
843862
*
844863
* Retrieve the PCI bus according to the given PE. Basically,
845864
* there're 3 types of PEs: PHB/Bus/Device. For PHB PE, the
846865
* primary PCI bus will be retrieved. The parent bus will be
847866
* returned for BUS PE. However, we don't have associated PCI
848867
* bus for DEVICE PE.
849868
*/
850-
struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
869+
static struct pci_bus *_eeh_pe_bus_get(struct eeh_pe *pe, bool do_lock)
851870
{
852871
struct eeh_dev *edev;
853872
struct pci_dev *pdev;
@@ -862,11 +881,58 @@ struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
862881

863882
/* Retrieve the parent PCI bus of first (top) PCI device */
864883
edev = list_first_entry_or_null(&pe->edevs, struct eeh_dev, entry);
865-
pci_lock_rescan_remove();
884+
if (do_lock)
885+
pci_lock_rescan_remove();
866886
pdev = eeh_dev_to_pci_dev(edev);
867887
if (pdev)
868888
bus = pdev->bus;
869-
pci_unlock_rescan_remove();
889+
if (do_lock)
890+
pci_unlock_rescan_remove();
870891

871892
return bus;
872893
}
894+
895+
/**
896+
* eeh_pe_bus_get - Retrieve PCI bus associated with the given EEH PE, locking
897+
* if needed
898+
* @pe: Pointer to the EEH PE
899+
*
900+
* This function is a wrapper around _eeh_pe_bus_get(), which retrieves the PCI
901+
* bus associated with the provided EEH PE structure. It acquires the PCI
902+
* rescan/remove lock to ensure safe access to shared data during the retrieval
903+
* process. This function should be used when the caller does not already hold
904+
* the rescan/remove lock, typically when inspecting PCIe device state in a safe
905+
* manner; callers that already hold the lock must use eeh_pe_bus_get_nolock().
906+
*
907+
* RETURNS:
908+
* A pointer to the PCI bus associated with the EEH PE, or NULL if none found.
909+
*/
910+
911+
struct pci_bus *eeh_pe_bus_get(struct eeh_pe *pe)
912+
{
913+
return _eeh_pe_bus_get(pe, true);
914+
}
915+
916+
/**
917+
* eeh_pe_bus_get_nolock - Retrieve PCI bus associated with the given EEH PE
918+
* without locking
919+
* @pe: Pointer to the EEH PE
920+
*
921+
* This function is a variant of _eeh_pe_bus_get() that retrieves the PCI bus
922+
* associated with the specified EEH PE without acquiring the
923+
* pci_lock_rescan_remove lock. It should only be used when the caller can
924+
* guarantee safe access to PE structures without the need for that lock,
925+
* typically in contexts where the lock is already held or locking is otherwise
926+
* managed.
927+
*
928+
* RETURNS:
929+
* pointer to the PCI bus associated with the EEH PE, or NULL if none is found.
930+
*
931+
* NOTE:
932+
* Use this function carefully to avoid race conditions and data corruption.
933+
*/
934+
935+
struct pci_bus *eeh_pe_bus_get_nolock(struct eeh_pe *pe)
936+
{
937+
return _eeh_pe_bus_get(pe, false);
938+
}

0 commit comments

Comments
 (0)