11"""
2- Common function for analyzing ddPCR data in Pandas Dataframes.
2+ Common functions for analyzing ddPCR data in pandas DataFrames.
33
44Extracts data and metadata from .ddpcr files.
55Allows users to specify custom metadata applied via well mapping.
@@ -34,20 +34,20 @@ class DataPathError(RuntimeError):
3434 """Error raised when the path to the data is not specified correctly."""
3535
3636
37- def load_ddpcr_metadata (tmp_data_path : Path ) -> Dict [Any , Any ]:
37+ def load_ddpcr_metadata (unzipped_path : Path ) -> Dict [Any , Any ]:
3838 """
3939 Load well metadata from an unzipped .ddpcr file.
4040
4141 Generates a metadata dict in the same format as the YAML well mapping,
4242 i.e., key -> {well -> value}. The columns are a subset of the
4343 metadata associated with each well in the BioRad software, namely
4444 sample names (numbered 'Sample description' fields, returned as
45- numbered 'condition ' keys) and targets for each channel/dye (returned
46- as '[channel]_target' keys).
45+ numbered 'sample_description ' keys) and targets for each channel/dye
46+ (returned as '[channel]_target' keys).
4747
4848 Parameters
4949 ----------
50- tmp_data_path : Path
50+ unzipped_path : Path
5151 Path to unzipped .ddpcr file
5252
5353 Returns
@@ -60,24 +60,27 @@ def load_ddpcr_metadata(tmp_data_path: Path) -> Dict[Any, Any]:
6060
6161 # Create map of well index -> ID
6262 well_id_map = {}
63- for f in (tmp_data_path / 'PeakMetaData' ).glob ("*.ddmetajson" ):
63+ for f in (unzipped_path / 'PeakMetaData' ).glob ("*.ddmetajson" ):
6464 with open (f , 'r' ) as file :
6565 d = json .load (file )
6666 well_id_map [d ['WellIndex' ]] = re .compile (filename_regex ).match (file .name ).group ("well" )
6767
68- # Get plate file name
69- # TODO: change to glob all .plt files and choose latest modified one
68+ # Get plate file name from last modified .ddplt file
7069 plate_file = ''
71- with open (tmp_data_path / 'RunInfo.json' , 'r' ) as f :
72- plate_file = Path (f ['PlateFileName' ])
70+ last_mod_time = 0
71+ for f in unzipped_path .glob ("*.ddplt" ):
72+ mtime = f .stat ().st_mtime
73+ if mtime > last_mod_time :
74+ last_mod_time = mtime
75+ plate_file = f .name
7376
7477 # Load metadata from plate file
7578 metadata_from_plt = {}
76- with open (tmp_data_path / plate_file , 'r' ) as file :
79+ with open (unzipped_path / plate_file , 'r' ) as file :
7780 f = json .load (file )
7881 for w in f ['WellSamples' ]:
7982 well = well_id_map [w ['WellIndex' ]]
80- condition_map = {f'condition { i } ' : val for i ,val in enumerate (w ['SampleIds' ])}
83+ condition_map = {f'sample_description_ { i + 1 } ' : val for i ,val in enumerate (w ['SampleIds' ])}
8184 target_map = {p ['Dye' ]['DyeName' ]+ '_target' : p ['TargetName' ] for p in w ['Panel' ]['Targets' ]}
8285 metadata_from_plt [well ] = condition_map | target_map
8386
@@ -88,6 +91,8 @@ def load_ddpcr_metadata(tmp_data_path: Path) -> Dict[Any, Any]:
8891def load_ddpcr (
8992 data_path : Union [str , Path ],
9093 yaml_path : Union [str , Path ],
94+ * ,
95+ extract_metadata : Optional [bool ] = True ,
9196) -> pd .DataFrame :
9297 """
9398 Load ddPCR data into DataFrame with associated metadata.
@@ -105,6 +110,12 @@ def load_ddpcr(
105110 yaml_path: str or Path
106111 Path to .yaml file to use for associating metadata with well IDs.
107112 All metadata must be contained under the header 'metadata'.
113+ extract_metadata: Optional bool, default True
114+ Whether to extract metadata from the .ddpcr file. If True,
115+ adds a subset of the metadata associated with each well in the
116+ BioRad software, namely sample names (numbered 'Sample description' fields,
117+ returned as numbered 'sample_description' keys) and targets for each channel/dye
118+ (returned as '[channel]_target' keys).
108119
109120 Returns
110121 -------
@@ -121,15 +132,18 @@ def load_ddpcr(
121132 with py7zr .SevenZipFile (data_path , 'r' , password = '1b53402e-503a-4303-bf86-71af1f3178dd' ) as experiment :
122133 experiment .extractall (path = tmp_data_path )
123134
135+ metadata_map = {}
136+
137+ # Load metadata from .yaml file
124138 if yaml_path is not None :
125139 try :
126140 metadata_map = flow .load_well_metadata (yaml_path )
127141 except FileNotFoundError as err :
128142 raise YamlError ("Specified metadata YAML file does not exist!" ) from err
129143
130144 # Load metadata from .ddpcr file
131- else :
132- metadata_map = load_ddpcr_metadata (tmp_data_path )
145+ if extract_metadata :
146+ metadata_map = metadata_map | load_ddpcr_metadata (tmp_data_path )
133147
134148 # Load data for each well
135149 data_list = []
@@ -144,8 +158,10 @@ def load_ddpcr(
144158 channel_map = {c ['Channel' ]- 1 : c ['Dye' ] for c in d ["DataAcquisitionInfo" ]['ChannelMap' ]}
145159 df = pd .DataFrame (np .transpose (d ['PeakInfo' ]['Amplitudes' ])).rename (columns = channel_map )
146160
161+ well = f .stem
162+ df .insert (0 , 'well' , [well ]* len (df ))
163+
147164 # Add metadata to DataFrame
148- well = file .stem
149165 index = 0
150166 for k , v in metadata_map .items ():
151167 # Replace custom metadata keys with <NA> if not present
@@ -154,9 +170,10 @@ def load_ddpcr(
154170
155171 data_list .append (df )
156172
173+ # Fill empty values with <NA> and drop empty columns
174+ data = pd .concat (data_list , ignore_index = True ).replace ([float ('nan' ), np .nan , '' ], pd .NA ).dropna (axis = 'columns' , how = 'all' )
175+
157176 # Delete unzipped files
158177 shutil .rmtree (tmp_data_path )
159-
160- data = pd .concat (data_list , ignore_index = True )
161178
162179 return data
0 commit comments