Skip to content

Commit 6d1cfa3

Browse files
committed
ddPCR: select the most recently modified plate file; add an argument specifying whether to extract metadata from the plate file
1 parent e217d5d commit 6d1cfa3

File tree

1 file changed

+34
-17
lines changed

1 file changed

+34
-17
lines changed

src/rushd/ddpcr.py

Lines changed: 34 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
"""
2-
Common function for analyzing ddPCR data in Pandas Dataframes.
2+
Common functions for analyzing ddPCR data in Pandas Dataframes.
33
44
Extracts data and metadata from .ddpcr files.
55
Allows users to specify custom metadata applied via well mapping.
@@ -34,20 +34,20 @@ class DataPathError(RuntimeError):
3434
"""Error raised when the path to the data is not specified correctly."""
3535

3636

37-
def load_ddpcr_metadata(tmp_data_path: Path) -> Dict[Any, Any]:
37+
def load_ddpcr_metadata(unzipped_path: Path) -> Dict[Any, Any]:
3838
"""
3939
Load well metadata from an unzipped .ddpcr file.
4040
4141
Generates a metadata dict in the same format as the YAML well mapping,
4242
i.e., key -> {well -> value}. The columns are a subset of the
4343
metadata associated with each well in the BioRad software, namely
4444
sample names (numbered 'Sample description' fields, returned as
45-
numbered 'condition' keys) and targets for each channel/dye (returned
46-
as '[channel]_target' keys).
45+
numbered 'sample_description' keys) and targets for each channel/dye
46+
(returned as '[channel]_target' keys).
4747
4848
Parameters
4949
----------
50-
tmp_data_path: Path
50+
unzipped_path: Path
5151
Path to unzipped .ddpcr file
5252
5353
Returns
@@ -60,24 +60,27 @@ def load_ddpcr_metadata(tmp_data_path: Path) -> Dict[Any, Any]:
6060

6161
# Create map of well index -> ID
6262
well_id_map = {}
63-
for f in (tmp_data_path/'PeakMetaData').glob("*.ddmetajson"):
63+
for f in (unzipped_path/'PeakMetaData').glob("*.ddmetajson"):
6464
with open(f, 'r') as file:
6565
d = json.load(file)
6666
well_id_map[d['WellIndex']] = re.compile(filename_regex).match(file.name).group("well")
6767

68-
# Get plate file name
69-
# TODO: change to glob all .plt files and choose latest modified one
68+
# Get plate file name from last modified .ddplt file
7069
plate_file = ''
71-
with open(tmp_data_path/'RunInfo.json', 'r') as f:
72-
plate_file = Path(f['PlateFileName'])
70+
last_mod_time = 0
71+
for f in unzipped_path.glob("*.ddplt"):
72+
mtime = f.stat().st_mtime
73+
if mtime > last_mod_time:
74+
last_mod_time = mtime
75+
plate_file = f.name
7376

7477
# Load metadata from plate file
7578
metadata_from_plt = {}
76-
with open(tmp_data_path/plate_file, 'r') as file:
79+
with open(unzipped_path/plate_file, 'r') as file:
7780
f = json.load(file)
7881
for w in f['WellSamples']:
7982
well = well_id_map[w['WellIndex']]
80-
condition_map = {f'condition{i}': val for i,val in enumerate(w['SampleIds'])}
83+
condition_map = {f'sample_description_{i+1}': val for i,val in enumerate(w['SampleIds'])}
8184
target_map = {p['Dye']['DyeName']+'_target': p['TargetName'] for p in w['Panel']['Targets']}
8285
metadata_from_plt[well] = condition_map | target_map
8386

@@ -88,6 +91,8 @@ def load_ddpcr_metadata(tmp_data_path: Path) -> Dict[Any, Any]:
8891
def load_ddpcr(
8992
data_path: Union[str, Path],
9093
yaml_path: Union[str, Path],
94+
*,
95+
extract_metadata: Optional[bool] = True,
9196
) -> pd.DataFrame:
9297
"""
9398
Load ddPCR data into DataFrame with associated metadata.
@@ -105,6 +110,12 @@ def load_ddpcr(
105110
yaml_path: str or Path
106111
Path to .yaml file to use for associating metadata with well IDs.
107112
All metadata must be contained under the header 'metadata'.
113+
extract_metadata: Optional bool, default True
114+
Whether to extract metadata from the .ddpcr file. If True,
115+
adds a subset of the metadata associated with each well in the
116+
BioRad software, namely sample names (numbered 'Sample description' fields,
117+
returned as numbered 'sample_description' keys) and targets for each channel/dye
118+
(returned as '[channel]_target' keys).
108119
109120
Returns
110121
-------
@@ -121,15 +132,18 @@ def load_ddpcr(
121132
with py7zr.SevenZipFile(data_path, 'r', password='1b53402e-503a-4303-bf86-71af1f3178dd') as experiment:
122133
experiment.extractall(path=tmp_data_path)
123134

135+
metadata_map = {}
136+
137+
# Load metadata from .yaml file
124138
if yaml_path is not None:
125139
try:
126140
metadata_map = flow.load_well_metadata(yaml_path)
127141
except FileNotFoundError as err:
128142
raise YamlError("Specified metadata YAML file does not exist!") from err
129143

130144
# Load metadata from .ddpcr file
131-
else:
132-
metadata_map = load_ddpcr_metadata(tmp_data_path)
145+
if extract_metadata:
146+
metadata_map = metadata_map | load_ddpcr_metadata(tmp_data_path)
133147

134148
# Load data for each well
135149
data_list = []
@@ -144,8 +158,10 @@ def load_ddpcr(
144158
channel_map = {c['Channel']-1: c['Dye'] for c in d["DataAcquisitionInfo"]['ChannelMap']}
145159
df = pd.DataFrame(np.transpose(d['PeakInfo']['Amplitudes'])).rename(columns=channel_map)
146160

161+
well = f.stem
162+
df.insert(0, 'well', [well]*len(df))
163+
147164
# Add metadata to DataFrame
148-
well = file.stem
149165
index = 0
150166
for k, v in metadata_map.items():
151167
# Replace custom metadata keys with <NA> if not present
@@ -154,9 +170,10 @@ def load_ddpcr(
154170

155171
data_list.append(df)
156172

173+
# Fill empty values with <NA> and drop empty columns
174+
data = pd.concat(data_list, ignore_index=True).replace([float('nan'), np.nan, ''], pd.NA).dropna(axis='columns', how='all')
175+
157176
# Delete unzipped files
158177
shutil.rmtree(tmp_data_path)
159-
160-
data = pd.concat(data_list, ignore_index=True)
161178

162179
return data

0 commit comments

Comments
 (0)