ddPCR select plate file as last modified, added arg specifying whether to extract metadata from plate

kaseylove · kaseylove · commit 6d1cfa36faf6 · 2026-01-26T18:36:02.000-05:00
diff --git a/src/rushd/ddpcr.py b/src/rushd/ddpcr.py
@@ -1,5 +1,5 @@
 """
-Common function for analyzing ddPCR data in Pandas Dataframes.
+Common functions for analyzing ddPCR data in Pandas Dataframes.
 
 Extracts data and metadata from .ddpcr files.
 Allows users to specify custom metadata applied via well mapping.
@@ -34,20 +34,20 @@ class DataPathError(RuntimeError):
     """Error raised when the path to the data is not specified correctly."""
 
 
-def load_ddpcr_metadata(tmp_data_path: Path) -> Dict[Any, Any]:
+def load_ddpcr_metadata(unzipped_path: Path) -> Dict[Any, Any]:
     """
     Load well metadata from an unzipped .ddpcr file.
 
     Generates a metadata dict in the same format as the YAML well mapping,
     i.e., key -> {well -> value}. The columns are a subset of the 
     metadata associated with each well in the BioRad software, namely 
     sample names (numbered 'Sample description' fields, returned as 
-    numbered 'condition' keys) and targets for each channel/dye (returned
-    as '[channel]_target' keys).
+    numbered 'sample_description' keys) and targets for each channel/dye 
+    (returned as '[channel]_target' keys).
 
     Parameters
     ----------
-    tmp_data_path: Path
+    unzipped_path: Path
         Path to unzipped .ddpcr file
 
     Returns
@@ -60,24 +60,27 @@ def load_ddpcr_metadata(tmp_data_path: Path) -> Dict[Any, Any]:
         
     # Create map of well index -> ID 
     well_id_map = {}
-    for f in (tmp_data_path/'PeakMetaData').glob("*.ddmetajson"):
+    for f in (unzipped_path/'PeakMetaData').glob("*.ddmetajson"):
         with open(f, 'r') as file:
             d = json.load(file)
             well_id_map[d['WellIndex']] = re.compile(filename_regex).match(file.name).group("well")
     
-    # Get plate file name
-    # TODO: change to glob all .plt files and choose latest modified one
+    # Get plate file name from last modified .ddplt file
     plate_file = ''
-    with open(tmp_data_path/'RunInfo.json', 'r') as f:
-        plate_file = Path(f['PlateFileName'])
+    last_mod_time = 0
+    for f in unzipped_path.glob("*.ddplt"):
+        mtime = f.stat().st_mtime
+        if mtime > last_mod_time:
+            last_mod_time = mtime
+            plate_file = f.name
     
     # Load metadata from plate file
     metadata_from_plt = {}
-    with open(tmp_data_path/plate_file, 'r') as file:
+    with open(unzipped_path/plate_file, 'r') as file:
         f = json.load(file)
         for w in f['WellSamples']:
             well = well_id_map[w['WellIndex']]
-            condition_map = {f'condition{i}': val for i,val in enumerate(w['SampleIds'])}
+            condition_map = {f'sample_description_{i+1}': val for i,val in enumerate(w['SampleIds'])}
             target_map = {p['Dye']['DyeName']+'_target': p['TargetName'] for p in w['Panel']['Targets']}
             metadata_from_plt[well] = condition_map | target_map
     
@@ -88,6 +91,8 @@ def load_ddpcr_metadata(tmp_data_path: Path) -> Dict[Any, Any]:
 def load_ddpcr(
     data_path: Union[str, Path],
     yaml_path: Union[str, Path],
+    *,
+    extract_metadata: Optional[bool] = True,
 ) -> pd.DataFrame:
     """
     Load ddPCR data into DataFrame with associated metadata.
@@ -105,6 +110,12 @@ def load_ddpcr(
     yaml_path: str or Path
         Path to .yaml file to use for associating metadata with well IDs.
         All metadata must be contained under the header 'metadata'.
+    extract_metadata: Optional bool, default True
+        Whether to extract metadata from the .ddpcr file. If True,
+        adds a subset of the metadata associated with each well in the 
+        BioRad software, namely sample names (numbered 'Sample description' fields,
+        returned as numbered 'condition' keys) and targets for each channel/dye 
+        (returned as '[channel]_target' keys).
 
     Returns
     -------
@@ -121,15 +132,18 @@ def load_ddpcr(
     with py7zr.SevenZipFile(data_path, 'r', password='1b53402e-503a-4303-bf86-71af1f3178dd') as experiment:
         experiment.extractall(path=tmp_data_path)
 
+    metadata_map = {}
+
+    # Load metadata from .yaml file
     if yaml_path is not None:
         try:
             metadata_map = flow.load_well_metadata(yaml_path)
         except FileNotFoundError as err:
             raise YamlError("Specified metadata YAML file does not exist!") from err
     
     # Load metadata from .ddpcr file
-    else:
-        metadata_map = load_ddpcr_metadata(tmp_data_path)
+    if extract_metadata:
+        metadata_map = metadata_map | load_ddpcr_metadata(tmp_data_path)
 
     # Load data for each well
     data_list = []
@@ -144,8 +158,10 @@ def load_ddpcr(
             channel_map = {c['Channel']-1: c['Dye'] for c in d["DataAcquisitionInfo"]['ChannelMap']}
             df = pd.DataFrame(np.transpose(d['PeakInfo']['Amplitudes'])).rename(columns=channel_map)
 
+            well = f.stem
+            df.insert(0, 'well', [well]*len(df))
+
             # Add metadata to DataFrame
-            well = file.stem
             index = 0
             for k, v in metadata_map.items():
                 # Replace custom metadata keys with <NA> if not present
@@ -154,9 +170,10 @@ def load_ddpcr(
 
             data_list.append(df)
 
+    # Fill empty values with <NA> and drop empty columns
+    data = pd.concat(data_list, ignore_index=True).replace([float('nan'), np.nan, ''], pd.NA).dropna(axis='columns', how='all')
+
     # Delete unzipped files
     shutil.rmtree(tmp_data_path)
-
-    data = pd.concat(data_list, ignore_index=True)
     
     return data