Skip to content

Commit 0ece700

Browse files
committed
beginning to add analysis functions, will remove for next release
1 parent 3ec0cb6 commit 0ece700

File tree

3 files changed

+238
-11
lines changed

3 files changed

+238
-11
lines changed

src/rushd/ddpcr.py

Lines changed: 55 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
class YamlError(RuntimeError):
3030
"""Error raised when there is an issue with the provided .yaml file."""
3131

32-
3332
class DataPathError(RuntimeError):
3433
"""Error raised when the path to the data is not specified correctly."""
3534

@@ -90,7 +89,7 @@ def load_ddpcr_metadata(unzipped_path: Path) -> Dict[Any, Any]:
9089

9190
def load_ddpcr(
9291
data_path: Union[str, Path],
93-
yaml_path: Union[str, Path],
92+
yaml_path: Optional[Union[str, Path]] = None,
9493
*,
9594
extract_metadata: Optional[bool] = True,
9695
) -> pd.DataFrame:
@@ -106,11 +105,11 @@ def load_ddpcr(
106105
Parameters
107106
----------
108107
data_path: str or Path
109-
Path to .ddpcr file
110-
yaml_path: str or Path
108+
Path to .ddpcr file.
109+
yaml_path: str or Path (optional)
111110
Path to .yaml file to use for associating metadata with well IDs.
112111
All metadata must be contained under the header 'metadata'.
113-
extract_metadata: Optional bool, default True
112+
extract_metadata: bool, default True
114113
Whether to extract metadata from the .ddpcr file. If True,
115114
adds a subset of the metadata associated with each well in the
116115
BioRad software, namely sample names (numbered 'Sample description' fields,
@@ -176,4 +175,54 @@ def load_ddpcr(
176175
# Delete unzipped files
177176
shutil.rmtree(tmp_data_path)
178177

179-
return data
178+
return data
179+
180+
181+
def calculate_copy_number(
182+
df: pd.DataFrame,
183+
exp_channel: str,
184+
ref_channel: str,
185+
gates: Dict[str, float],
186+
*,
187+
ref_copy_num: float = 2.0,
188+
) -> pd.DataFrame:
189+
"""
190+
Calculates copy number of an experimental target relative to a
191+
reference target.
192+
193+
Adds a column to the DataFrame with this computed value.
194+
Math is based on ... TODO
195+
196+
Parameters
197+
----------
198+
df: pandas DataFrame
199+
Data on which to calculate. Must contain columns corresponding
200+
to the experimental and reference channels.
201+
exp_channel: str
202+
Column in df containing measurements for the experimental target.
203+
ref_channel: str
204+
Column in df containing measurements for the reference target.
205+
gates: dict of (str: float) pairs
206+
Gates specifying threshold for positive droplets, one for each
207+
experimental and reference channel.
208+
ref_copy_num: float, default 2.0
209+
Known copy number of the reference gene. If not specified, defaults
210+
to 2.0 (diploid).
211+
212+
Returns
213+
-------
214+
The original DataFrame with a new column 'copy_num' containing the computed
215+
values.
216+
"""
217+
# TODO: throw error if channels not in df
218+
# TODO: check if there are no (negative) droplets before calculating
219+
data_exp = df[exp_channel]
220+
data_ref = df[ref_channel]
221+
222+
# Compute copies per droplet: -ln(num_negative / num_total)
223+
copies_exp = -np.log((data_exp < gates[exp_channel]).sum() / len(data_exp))
224+
copies_ref = -np.log((data_ref < gates[ref_channel]).sum() / len(data_ref))
225+
226+
# Normalize copies to the reference gene
227+
df['copy_num'] = copies_exp / copies_ref * ref_copy_num
228+
return df

src/rushd/qpcr.py

Lines changed: 168 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,11 @@
1515
except ImportError:
1616
from typing_extensions import Literal
1717

18-
import matplotlib.pyplot as plt
18+
import matplotlib
1919
import numpy as np
2020
import pandas as pd
2121
import yaml
22+
import scipy.stats
2223
from scipy.optimize import curve_fit
2324

2425
from . import flow
@@ -28,7 +29,7 @@ class YamlError(RuntimeError):
2829
"""Error raised when there is an issue with the provided .yaml file."""
2930

3031
class ColumnError(RuntimeError):
31-
"""Error raised when the data is missing a column specifying well IDs."""
32+
"""Error raised when the data is missing a required column."""
3233

3334
class DataPathError(RuntimeError):
3435
"""Error raised when the path to the data is not specified correctly."""
@@ -39,6 +40,9 @@ class GroupsError(RuntimeError):
3940
class RegexError(RuntimeError):
4041
"""Error raised when there is an issue with the file name regular expression."""
4142

43+
class InputError(RuntimeError):
44+
"""Error raised when there is an issue with an argument type."""
45+
4246

4347
def load_single_csv_with_metadata(
4448
data_path: Union[str, Path],
@@ -218,4 +222,165 @@ def load_plates_with_metadata(
218222

219223
# Concatenate all the data into a single DataFrame
220224
data = pd.concat(group_list, ignore_index=True).replace([float('nan'),np.nan], pd.NA)
221-
return data
225+
return data
226+
227+
228+
def calculate_standard(
229+
df: pd.DataFrame,
230+
amt_col: str,
231+
cp_col: str,
232+
ax: Optional[matplotlib.axes] = None
233+
) -> List[scipy.stats._stats_py.LinregressResult, float]:
234+
"""
235+
Calculate a standard curve for qPCR data.
236+
237+
For the given data, treats the values in 'amt_col' as
238+
input amounts and values in 'cp_col' as the corresponding
239+
cycle counts (Cp, aka Ct) from the qPCR output. Computes a linear
240+
regression on log10(amount) vs Cp, and returns this fit as well
241+
as the efficiency.
242+
243+
If axes are passed, plots the linear fit on the data, annotating
244+
the R^2 value and efficiency.
245+
246+
Parameters
247+
----------
248+
df: pandas DataFrame
249+
Data to use to fit.
250+
amt_col: str
251+
Name of column containing input amounts.
252+
cp_col: str
253+
Name of the column containing Cp values.
254+
ax: matplotlib.axes (optional)
255+
Axes on which to plot the data and fit.
256+
257+
Returns
258+
-------
259+
A tuple of the fit (output of a call to scipy.stats.linregress)
260+
and the calculated efficiency (float).
261+
"""
262+
if amt_col not in df.columns:
263+
raise ColumnError(f"Data is missing the 'amt_col' column {amt_col}")
264+
if cp_col not in df.columns:
265+
raise ColumnError(f"Data is missing the 'cp_col' column {cp_col}")
266+
267+
# Remove zero values and log10-transform input amounts
268+
df_subset = df[df[amt_col]>0].copy()
269+
df_subset['log10_'+amt_col] = df_subset[amt_col].astype(float).apply(np.log10)
270+
271+
# Fit data
272+
x = df_subset['log10_'+amt_col]
273+
y = df_subset[cp_col].astype(float)
274+
fit = scipy.stats.linregress(x, y)
275+
efficiency = (10**(-1/fit.slope) - 1)*100 # percentage
276+
277+
# Plot result
278+
if ax is not None:
279+
ax.scatter(df[amt_col], df[cp_col], label='data', ec='white', lw=0.75)
280+
xs = np.logspace(min(df_subset['log10_'+amt_col]), max(df_subset['log10_'+amt_col]), 1000)
281+
ys = fit.slope * np.log10(xs) + fit.intercept
282+
ax.plot(xs, ys, color='crimson', label='linear\nregression')
283+
ax.set_xscale('symlog', linthresh=min(df_subset[amt_col]))
284+
pad = 0.01
285+
ax.legend(loc='upper right', bbox_to_anchor=(1-pad, 1-pad))
286+
ax.annotate(f'$R^2$: {abs(fit.rvalue):0.3f}', (0+pad*2, 0.1), xycoords='axes fraction',
287+
ha='left', va='bottom', size='medium')
288+
ax.annotate(f'Efficiency: {efficiency:0.1f}%', (0+pad*2, 0+pad*2), xycoords='axes fraction',
289+
ha='left', va='bottom', size='medium')
290+
291+
return fit, efficiency
292+
293+
294+
def calculate_input_amount(
295+
y: float, # TODO: list of float
296+
fit: Union[scipy.stats._stats_py.LinregressResult, List[float]],
297+
) -> float:
298+
"""
299+
Given a cycle count (Cp, aka Ct value) and a linear regression fit,
300+
compute the amount of input.
301+
302+
Note that the linear regression fit is expected to have been performed
303+
on the log10-transform of the input amounts. Units of the returned value
304+
match those of the non-transformed input amount data.
305+
306+
Parameters
307+
----------
308+
y: float
309+
Cycle count (Cp, aka Ct value).
310+
fit: scipy LinregressResult object or list of two floats
311+
Linear fit to use. Accepts either the output of a call
312+
to scipy.stats.linregress or a list of the fit values
313+
[slope, intercept].
314+
315+
Returns
316+
-------
317+
A float of the calculated amount.
318+
"""
319+
if type(fit) is scipy.stats._stats_py.LinregressResult:
320+
return 10**((float(y)-fit.intercept)/fit.slope)
321+
if len(fit) < 2:
322+
raise InputError("'fit' is expected to be a list containing [slope, intercept]. Alternatively, pass a scipy LinregressResult object.")
323+
return 10**((float(y)-fit[1])/fit[0])
324+
325+
# TODO: add 'type' arg for dsDNA, ssDNA, ssRNA
326+
def convert_moles_to_mass(
327+
moles: Union[float, List[float]],
328+
length: Union[float, List[float]]
329+
) -> Union[float, List[float]]:
330+
"""
331+
For a given amount of DNA in moles, use its length
332+
to calculate its mass.
333+
334+
Formula from NEB:
335+
g = mol x (bp x 615.94 g/mol/bp + 36.04 g/mol)
336+
- mass of dsDNA (g) = moles dsDNA x (molecular weight of dsDNA (g/mol))
337+
- molecular weight of dsDNA = (number of base pairs of dsDNA x average molecular weight of a base pair) + 36.04 g/mol
338+
- average molecular weight of a base pair = 615.94 g/mol, excluding the water molecule removed during polymerization
339+
and assuming deprotonated phosphate hydroxyls
340+
- the additional 36.04 g/mol accounts for the 2 -OH and 2 -H added back to the ends
341+
- bases are assumed to be unmodified
342+
343+
Parameters
344+
----------
345+
moles: float or list of float
346+
Amount of dsDNA in moles.
347+
length: float or list of float
348+
Number of base pairs of the dsDNA (or average length of a heterogeneous sample).
349+
350+
Returns
351+
-------
352+
A float or list of floats of the calculated mass in grams.
353+
"""
354+
return np.array(moles) * (np.array(length) * 615.96 + 36.04)
355+
356+
357+
# TODO: add 'type' arg for dsDNA, ssDNA, ssRNA
358+
def convert_mass_to_moles(
359+
mass: Union[float, List[float]],
360+
length: Union[float, List[float]]
361+
) -> Union[float, List[float]]:
362+
"""
363+
For a given amount of DNA in moles, use its length
364+
to calculate its mass.
365+
366+
Formula from NEB:
367+
mol = g / (bp x 615.94 g/mol/bp + 36.04 g/mol)
368+
- moles dsDNA = mass of dsDNA (g) / (molecular weight of dsDNA (g/mol))
369+
- molecular weight of dsDNA = (number of base pairs of dsDNA x average molecular weight of a base pair) + 36.04 g/mol
370+
- average molecular weight of a base pair = 615.94 g/mol, excluding the water molecule removed during polymerization
371+
and assuming deprotonated phosphate hydroxyls
372+
- the additional 36.04 g/mol accounts for the 2 -OH and 2 -H added back to the ends
373+
- bases are assumed to be unmodified
374+
375+
Parameters
376+
----------
377+
mass: float or list of float
378+
Mass of dsDNA in grams.
379+
length: float or list of float
380+
Number of base pairs of the dsDNA (or average length of a heterogeneous sample).
381+
382+
Returns
383+
-------
384+
A float or list of floats of the calculated amount in moles.
385+
"""
386+
return np.array(mass) / (np.array(length) * 615.96 + 36.04)

tests/test_ddpcr.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,17 @@
1+
import os
2+
from pathlib import Path
13

4+
import pandas as pd
5+
import pytest
26

3-
# Test ddPCR
4-
# Test ddPCR bad file type
7+
from rushd import ddpcr
8+
9+
# test_ddpcr
10+
# test_ddpcr_no_metadata
11+
# test_no_yaml
12+
# test_no_yaml_metadata
13+
14+
# test_invalid_data_path
15+
# test_invalid_yaml
16+
17+
# test_multiple_plates

0 commit comments

Comments
 (0)