deepmodeling · AsymmetryChou · Jun 14, 2026 · Jun 13, 2026 · Jun 13, 2026 · Jun 13, 2026
diff --git a/dpnegf/entrypoints/run.py b/dpnegf/entrypoints/run.py
@@ -63,7 +63,10 @@ def run(
     in_common_options = {}
     if jdata.get("device", None):
         in_common_options.update({"device": jdata["device"]})
-
+    # Otherwise build on the default device (CPU). The NEGF pipeline runs
+    # hamiltonian init and Fermi-level calc on CPU; only RGF runs on
+    # RGF_device, and DeviceProperty handles CPU→RGF_device transfer.
+
     if jdata.get("dtype", None):
         in_common_options.update({"dtype": jdata["dtype"]})
 

diff --git a/dpnegf/negf/density.py b/dpnegf/negf/density.py
@@ -190,7 +190,7 @@ def integrate(self, deviceprop, kpoint, eta_lead=1e-5, eta_device=0.,Vbias=None,
                 deviceprop.lead_L.self_energy(kpoint=kpoint, energy=e, eta_lead=eta_lead)
                 deviceprop.lead_R.self_energy(kpoint=kpoint, energy=e, eta_lead=eta_lead)
                 deviceprop.cal_green_function(energy=e, kpoint=kpoint, block_tridiagonal=block_tridiagonal, eta_device=eta_device)
-                gr_gamma_ga = torch.mm(torch.mm(deviceprop.grd[0], deviceprop.lead_R.gamma), deviceprop.grd[0].conj().T).real
+                gr_gamma_ga = torch.mm(torch.mm(deviceprop.grd[0], deviceprop.lead_R.gamma.to(deviceprop.grd[0].device)), deviceprop.grd[0].conj().T).real
                 gr_gamma_ga = gr_gamma_ga * (deviceprop.lead_R.fermi_dirac(e+deviceprop.mu) - deviceprop.lead_L.fermi_dirac(e+deviceprop.mu))
                 DM_neq = DM_neq + wlg[i] * gr_gamma_ga
         else:
@@ -292,10 +292,11 @@ def density_integrate_Fiori(self,e_grid,kpoint,Vbias,block_tridiagonal,subblocks
             x0 = min(lx, tx)
             x1 = min(rx, ty)
 
-            gammaL = torch.zeros(size=(tx, tx), dtype=torch.complex128)
-            gammaL[:x0, :x0] += deviceprop.lead_L.gamma[:x0, :x0]
-            gammaR = torch.zeros(size=(ty, ty), dtype=torch.complex128)
-            gammaR[-x1:, -x1:] += deviceprop.lead_R.gamma[-x1:, -x1:]
+            gf_device = deviceprop.g_trans.device
+            gammaL = torch.zeros(size=(tx, tx), dtype=torch.complex128, device=gf_device)
+            gammaL[:x0, :x0] += deviceprop.lead_L.gamma[:x0, :x0].to(gf_device)
+            gammaR = torch.zeros(size=(ty, ty), dtype=torch.complex128, device=gf_device)
+            gammaR[-x1:, -x1:] += deviceprop.lead_R.gamma[-x1:, -x1:].to(gf_device)
 
             if not block_tridiagonal:
                 A_Rd = [torch.mm(torch.mm(deviceprop.grd[i],gammaR),deviceprop.grd[i].conj().T) for i in range(len(deviceprop.grd))]

diff --git a/dpnegf/negf/device_property.py b/dpnegf/negf/device_property.py
@@ -2,6 +2,7 @@
 import logging
 import torch
 import os
+from typing import Union
 from dpnegf.negf.negf_utils import update_kmap, update_temp_file,gauss_xw, leggauss
 from dpnegf.negf.density import Ozaki
 from dpnegf.utils.constants import Boltzmann, eV2J,pi
@@ -24,7 +25,7 @@ def _build_s_in_batched(hd, seinL, seinR, idx0, idy0, idx1, idy1):
     here are the already-scaled 1j*(seL - seL.mH) * f tensors of shape [B,n,n]).
     '''
     B = seinL.shape[0]
-    s_in = [torch.zeros((B,) + tuple(blk.shape), dtype=torch.complex128) for blk in hd]
+    s_in = [torch.zeros((B,) + tuple(blk.shape), dtype=torch.complex128, device=blk.device) for blk in hd]
     s_in[0][:, :idx0, :idy0] = s_in[0][:, :idx0, :idy0] + seinL[:, :idx0, :idy0]
     s_in[-1][:, -idx1:, -idy1:] = s_in[-1][:, -idx1:, -idy1:] + seinR[:, -idx1:, -idy1:]
     return s_in
@@ -90,14 +91,17 @@ class DeviceProperty(object):
             calculate density matrix     
 
     '''    
-    def __init__(self, hamiltonian, structure, results_path, e_T=300, 
-                 efermi: dict=None, chemiPot: dict=None, E_ref: float=None ) -> None:
+    def __init__(self, hamiltonian, structure, results_path, e_T=300,
+                 efermi: dict=None, chemiPot: dict=None, E_ref: float=None,
+                 rgf_device: Union[str, torch.device]="cpu") -> None:
         self.greenfuncs = 0
         self.hamiltonian = hamiltonian
         self.structure = structure # ase Atoms
         self.results_path = results_path
         self.cdtype = torch.complex128
-        self.device = "cpu"
+        if isinstance(rgf_device, str):
+            rgf_device = torch.device(rgf_device)
+        self.rgf_device = rgf_device
         self.kBT = Boltzmann * e_T / eV2J
         self.e_T = e_T
         # self.efermi = efermi
@@ -164,7 +168,7 @@ def cal_green_function(self, energy, kpoint, eta_device=0., block_tridiagonal=Tr
             A boolean parameter that indicates whether the last column blocks of the retarded Green's function are needed.
         '''
         assert len(np.array(kpoint).reshape(-1)) == 3
-        energy = torch.as_tensor(energy, dtype=torch.complex128)
+        energy = torch.as_tensor(energy, dtype=torch.complex128, device=self.rgf_device)
         if energy.ndim == 0:
             energy = energy.reshape(1)
         assert energy.ndim == 1, f"energy must be 0-d, scalar, or 1-D [B]; got shape {tuple(energy.shape)}"
@@ -197,8 +201,9 @@ def cal_green_function(self, energy, kpoint, eta_device=0., block_tridiagonal=Tr
                 self.V = torch.tensor(0.)
         else:
             self.V = Vbias
-        
+
         assert torch.is_tensor(self.V)
+        self.V = self.V.to(self.rgf_device)
         if not self.oldV is None:
             if torch.abs(self.V - self.oldV).sum() > 1e-5:
                 self.newV_flag = True
@@ -207,17 +212,25 @@ def cal_green_function(self, energy, kpoint, eta_device=0., block_tridiagonal=Tr
         else:
             self.newV_flag = True  # for the first time to run cal_green_function in Poisson-NEGF SCF
 
-        if (not (hasattr(self, "hd") and hasattr(self, "sd"))) or (self.newK_flag or self.newV_flag):               
+        if (not (hasattr(self, "hd") and hasattr(self, "sd"))) or (self.newK_flag or self.newV_flag):
             self.hd, self.sd, self.hl, self.su, self.sl, self.hu = self.hamiltonian.get_hs_device(self.kpoint, self.V, block_tridiagonal)
+            # TODO: if all blocks transferred to GPU, OOM may happen for large systems. 
+            # Optimization should be implemented here.
+            self.hd = [b.to(self.rgf_device) for b in self.hd]
+            self.sd = [b.to(self.rgf_device) for b in self.sd]
+            self.hl = [b.to(self.rgf_device) for b in self.hl]
+            self.su = [b.to(self.rgf_device) for b in self.su]
+            self.sl = [b.to(self.rgf_device) for b in self.sl]
+            self.hu = [b.to(self.rgf_device) for b in self.hu]
 
 
         tags = ["g_trans","gr_lc", \
                "grd", "grl", "gru", "gr_left", \
                "gnd", "gnl", "gnu", "gin_left", \
                "gpd", "gpl", "gpu", "gip_left"]
 
-        seL = self.lead_L.se
-        seR = self.lead_R.se
+        seL = self.lead_L.se.to(self.rgf_device)
+        seR = self.lead_R.se.to(self.rgf_device)
         if batched_mode:
             assert seL.ndim == 3 and seR.ndim == 3, f"In batched mode, the self-energy should have shape [B,n,n], but got {seL.shape} and {seR.shape}"
         else:
@@ -254,7 +267,7 @@ def cal_green_function(self, energy, kpoint, eta_device=0., block_tridiagonal=Tr
             else:
                 seinL = 1j*(seL-seL.conj().T) * self.lead_L.fermi_dirac(energy+self.E_ref).reshape(-1)
                 seinR = 1j*(seR-seR.conj().T) * self.lead_R.fermi_dirac(energy+self.E_ref).reshape(-1)
-                s_in = [torch.zeros(i.shape).cdouble() for i in self.hd]
+                s_in = [torch.zeros(i.shape, dtype=torch.complex128, device=self.rgf_device) for i in self.hd]
                 s_in[0][:idx0,:idy0] = s_in[0][:idx0,:idy0] + seinL[:idx0,:idy0]
                 s_in[-1][-idx1:,-idy1:] = s_in[-1][-idx1:,-idy1:] + seinR[-idx1:,-idy1:]
         else:
@@ -363,17 +376,17 @@ def _cal_tc_(self):
         g_trans = self.g_trans
         batched = g_trans.ndim == 3
         tx, ty = g_trans.shape[-2], g_trans.shape[-1]
-        gammaL_full = self.lead_L.gamma
-        gammaR_full = self.lead_R.gamma
+        gammaL_full = self.lead_L.gamma.to(self.rgf_device)
+        gammaR_full = self.lead_R.gamma.to(self.rgf_device)
         lx = gammaL_full.shape[-2]
         rx = gammaR_full.shape[-2]
         x0 = min(lx, tx)
         x1 = min(rx, ty)
 
         gL_shape = (g_trans.shape[0], tx, tx) if batched else (tx, tx)
         gR_shape = (g_trans.shape[0], ty, ty) if batched else (ty, ty)
-        gammaL = torch.zeros(size=gL_shape, dtype=self.cdtype, device=self.device)
-        gammaR = torch.zeros(size=gR_shape, dtype=self.cdtype, device=self.device)
+        gammaL = torch.zeros(size=gL_shape, dtype=self.cdtype, device=self.rgf_device)
+        gammaR = torch.zeros(size=gR_shape, dtype=self.cdtype, device=self.rgf_device)
         if batched:
             gammaL[:, :x0, :x0] = gammaL[:, :x0, :x0] + gammaL_full[:, :x0, :x0]
             gammaR[:, -x1:, -x1:] = gammaR[:, -x1:, -x1:] + gammaR_full[:, -x1:, -x1:]
@@ -394,9 +407,16 @@ def _cal_dos_(self):
         '''
         dos = 0
         #TODO: transfer cal_dos to static method for any k and energy
-        if (not(hasattr(self, "hd") and hasattr(self, "sd"))) or (self.newK_flag or self.newV_flag):               
+        if (not(hasattr(self, "hd") and hasattr(self, "sd"))) or (self.newK_flag or self.newV_flag):
             self.hd, self.sd, self.hl, self.su, self.sl, self.hu = \
                 self.hamiltonian.get_hs_device(self.kpoint, self.V, self.block_tridiagonal)
+            # defensive .to(self.device) in case the blocks came back from a cached/legacy path on CPU.
+            self.hd = [b.to(self.rgf_device) for b in self.hd]
+            self.sd = [b.to(self.rgf_device) for b in self.sd]
+            self.hl = [b.to(self.rgf_device) for b in self.hl]
+            self.su = [b.to(self.rgf_device) for b in self.su]
+            self.sl = [b.to(self.rgf_device) for b in self.sl]
+            self.hu = [b.to(self.rgf_device) for b in self.hu]
 
         for jj in range(len(self.grd)):
             if not self.block_tridiagonal or len(self.gru) == 0:
@@ -460,8 +480,9 @@ def _cal_local_current_(self):
         # check the energy grid satisfied the requirement
 
         na = len(self.norbs_per_atom)
-        local_current = torch.zeros(na, na)
+        local_current = torch.zeros(na, na, device=self.rgf_device)
         hd = self.hamiltonian.get_hs_device(kpoint=self.kpoint, V=self.V, block_tridiagonal=self.block_tridiagonal)[0][0]
+        hd = hd.to(self.rgf_device)  # defensive .to(self.device) in case the block came back from a cached/legacy path on CPU.
 
         for i in range(na):
             for j in range(na):
@@ -490,7 +511,7 @@ def _cal_density_(self, dm_options):
 
         '''
         dm = Ozaki(**dm_options)
-        DM_eq, DM_neq = dm.integrate(deviceprop=self.device, kpoint=self.kpoint)
+        DM_eq, DM_neq = dm.integrate(deviceprop=self.rgf_device, kpoint=self.kpoint)
 
         return DM_eq, DM_neq
 

diff --git a/dpnegf/negf/lead_property.py b/dpnegf/negf/lead_property.py
@@ -246,7 +246,7 @@ def _load_self_energy(save_path: str, kpoint, energy, save_format: str):
         elif save_format == "h5":
             try:
                 data = read_from_hdf5(save_path, kpoint, energy)
-                se = torch.as_tensor(data, dtype=torch.complex128)  # 自能一般是复数
+                se = torch.as_tensor(data, dtype=torch.complex128)  # self-energy should be complex
             except KeyError as e:
                 kx, ky, kz = np.asarray(kpoint, dtype=float).reshape(3)
                 ev = energy.item() if hasattr(energy, "item") else float(energy)

diff --git a/dpnegf/negf/negf_hamiltonian_init.py b/dpnegf/negf/negf_hamiltonian_init.py
@@ -70,8 +70,9 @@ def __init__(self,
 
         if isinstance(torch_device, str):
             torch_device = torch.device(torch_device)
-        self.torch_device = torch_device   
+        self.torch_device = torch_device
         self.model = model
+        self.model.to(self.torch_device)
         self.AtomicData_options = AtomicData_options
         self.model.eval()
 
@@ -417,8 +418,8 @@ def Hamiltonian_initialized(self,kpoints:List[List[float]],useBloch:bool,bloch_f
                                 f.create_dataset(f"{key}", data=np.array("None", dtype=h5py.string_dtype()))
                         elif key in ["HL", "SL", "HDL", "SDL", "HLL", "SLL"]:
                             # TODO: HDL,SDL can be saved in reduced form by eliminating zero rows
-                            f.create_dataset(f"{key}_real", data=items.real.numpy().astype(numpy_dtype))
-                            f.create_dataset(f"{key}_imag", data=items.imag.numpy().astype(numpy_dtype))
+                            f.create_dataset(f"{key}_real", data=items.real.cpu().numpy().astype(numpy_dtype))
+                            f.create_dataset(f"{key}_imag", data=items.imag.cpu().numpy().astype(numpy_dtype))
                         else:
                             raise ValueError(f"Unsupported key {key} in HS_leads")
 
@@ -462,11 +463,11 @@ def Hamiltonian_initialized(self,kpoints:List[List[float]],useBloch:bool,bloch_f
                             sub_block_len = len(item)
                             for idx_block in range(sub_block_len):  
                                 # group.create_dataset(f"{key}_k{idx_k}_b{idx_block}", data=item[idx_block].numpy())
-                                group.create_dataset(f"{key}_k{idx_k}_b{idx_block}_real", data=item[idx_block].real.numpy())
-                                group.create_dataset(f"{key}_k{idx_k}_b{idx_block}_imag", data=item[idx_block].imag.numpy())
+                                group.create_dataset(f"{key}_k{idx_k}_b{idx_block}_real", data=item[idx_block].real.cpu().numpy())
+                                group.create_dataset(f"{key}_k{idx_k}_b{idx_block}_imag", data=item[idx_block].imag.cpu().numpy())
                     else:
                         assert key in ["HD", "SD", "Hall", "Sall"], f"Unsupported key {key} for not block_tridiagnal case"
-                        f.create_dataset(f"{key}", data=items.numpy())
+                        f.create_dataset(f"{key}", data=items.cpu().numpy())
                 else:
                     raise ValueError(f"Unsupported key {key} in HS_device")
 
@@ -846,6 +847,15 @@ def get_hs_device(self, kpoint=[0,0,0], V=None, block_tridiagonal=False, only_su
                                                  HS_device["hl"][ik], HS_device["su"][ik], \
                                                  HS_device["sl"][ik], HS_device["hu"][ik]
 
+            # move blocks onto torch_device so downstream RGF runs on the chosen device.
+            hd_k = [b.to(self.torch_device) for b in hd_k]
+            sd_k = [b.to(self.torch_device) for b in sd_k]
+            hl_k = [b.to(self.torch_device) for b in hl_k]
+            su_k = [b.to(self.torch_device) for b in su_k]
+            sl_k = [b.to(self.torch_device) for b in sl_k]
+            hu_k = [b.to(self.torch_device) for b in hu_k]
+            V = torch.as_tensor(V).to(self.torch_device)
+
             if V.shape == torch.Size([]):
                 allorb = sum([hd_k[i].shape[0] for i in range(len(hd_k))])
                 V = V.repeat(allorb)
@@ -855,15 +865,18 @@ def get_hs_device(self, kpoint=[0,0,0], V=None, block_tridiagonal=False, only_su
                 l_slice = slice(counted, counted+hd_k[i].shape[0])
                 V_sub = V[l_slice].view(-1,1).cdouble()
                 hd_k[i] = hd_k[i] - V_sub * sd_k[i]
-                if i<len(hd_k)-1: 
+                if i<len(hd_k)-1:
                     hu_k[i] = hu_k[i] - V_sub * su_k[i]
                 if i > 0:
                     hl_k[i-1] = hl_k[i-1] - V_sub * sl_k[i-1]
                 counted += hd_k[i].shape[0]
-            
+
             return hd_k , sd_k, hl_k , su_k, sl_k, hu_k
         else:
             HD_k, SD_k = HS_device["HD"][ik], HS_device["SD"][ik]
+            HD_k = HD_k.to(self.torch_device)
+            SD_k = SD_k.to(self.torch_device)
+            V = torch.as_tensor(V).to(self.torch_device)
             return HD_k - V*SD_k, SD_k, [], [], [], []
 
     def get_hs_lead(self, kpoint, tab, v):