1515except ImportError :
1616 from typing_extensions import Literal
1717
18- import matplotlib . pyplot as plt
18+ import matplotlib
1919import numpy as np
2020import pandas as pd
2121import yaml
22+ import scipy .stats
2223from scipy .optimize import curve_fit
2324
2425from . import flow
@@ -28,7 +29,7 @@ class YamlError(RuntimeError):
2829 """Error raised when there is an issue with the provided .yaml file."""
2930
3031class ColumnError (RuntimeError ):
31- """Error raised when the data is missing a column specifying well IDs ."""
32+ """Error raised when the data is missing a required column ."""
3233
3334class DataPathError (RuntimeError ):
3435 """Error raised when the path to the data is not specified correctly."""
@@ -39,6 +40,9 @@ class GroupsError(RuntimeError):
3940class RegexError (RuntimeError ):
4041 """Error raised when there is an issue with the file name regular expression."""
4142
43+ class InputError (RuntimeError ):
44+ """Error raised when there is an issue with an argument type."""
45+
4246
4347def load_single_csv_with_metadata (
4448 data_path : Union [str , Path ],
@@ -218,4 +222,165 @@ def load_plates_with_metadata(
218222
219223 # Concatenate all the data into a single DataFrame
220224 data = pd .concat (group_list , ignore_index = True ).replace ([float ('nan' ),np .nan ], pd .NA )
221- return data
225+ return data
226+
227+
228+ def calculate_standard (
229+ df : pd .DataFrame ,
230+ amt_col : str ,
231+ cp_col : str ,
232+ ax : Optional [matplotlib .axes ] = None
233+ ) -> List [scipy .stats ._stats_py .LinregressResult , float ]:
234+ """
235+ Calculate a standard curve for qPCR data.
236+
237+ For the given data, treats the values in 'amt_col' as
238+ input amounts and values in 'cp_col' as the corresponding
239+ cycle counts (Cp, aka Ct) from the qPCR output. Computes a linear
240+ regression on log10(amount) vs Cp, and returns this fit as well
241+ as the efficiency.
242+
243+ If axes are passed, plots the linear fit on the data, annotating
244+ the R^2 value and efficiency.
245+
246+ Parameters
247+ ----------
248+ df: pandas DataFrame
249+ Data to use to fit.
250+ amt_col: str
251+ Name of column containing input amounts.
252+ cp_col: str
253+ Name of the column containing Cp values.
254+ ax: matplotlib.axes (optional)
255+ Axes on which to plot the data and fit.
256+
257+ Returns
258+ -------
259+ A tuple of the fit (output of a call to scipy.stats.linregress)
260+ and the calculated efficiency (float).
261+ """
262+ if amt_col not in df .columns :
263+ raise ColumnError (f"Data is missing the 'amt_col' column { amt_col } " )
264+ if cp_col not in df .columns :
265+ raise ColumnError (f"Data is missing the 'cp_col' column { cp_col } " )
266+
267+ # Remove zero values and log10-transform input amounts
268+ df_subset = df [df [amt_col ]> 0 ].copy ()
269+ df_subset ['log10_' + amt_col ] = df_subset [amt_col ].astype (float ).apply (np .log10 )
270+
271+ # Fit data
272+ x = df_subset ['log10_' + amt_col ]
273+ y = df_subset [cp_col ].astype (float )
274+ fit = scipy .stats .linregress (x , y )
275+ efficiency = (10 ** (- 1 / fit .slope ) - 1 )* 100 # percentage
276+
277+ # Plot result
278+ if ax is not None :
279+ ax .scatter (df [amt_col ], df [cp_col ], label = 'data' , ec = 'white' , lw = 0.75 )
280+ xs = np .logspace (min (df_subset ['log10_' + amt_col ]), max (df_subset ['log10_' + amt_col ]), 1000 )
281+ ys = fit .slope * np .log10 (xs ) + fit .intercept
282+ ax .plot (xs , ys , color = 'crimson' , label = 'linear\n regression' )
283+ ax .set_xscale ('symlog' , linthresh = min (df_subset [amt_col ]))
284+ pad = 0.01
285+ ax .legend (loc = 'upper right' , bbox_to_anchor = (1 - pad , 1 - pad ))
286+ ax .annotate (f'$R^2$: { abs (fit .rvalue ):0.3f} ' , (0 + pad * 2 , 0.1 ), xycoords = 'axes fraction' ,
287+ ha = 'left' , va = 'bottom' , size = 'medium' )
288+ ax .annotate (f'Efficiency: { efficiency :0.1f} %' , (0 + pad * 2 , 0 + pad * 2 ), xycoords = 'axes fraction' ,
289+ ha = 'left' , va = 'bottom' , size = 'medium' )
290+
291+ return fit , efficiency
292+
293+
294+ def calculate_input_amount (
295+ y : float , # TODO: list of float
296+ fit : Union [scipy .stats ._stats_py .LinregressResult , List [float ]],
297+ ) -> float :
298+ """
299+ Given a cycle count (Cp, aka Ct value) and a linear regression fit,
300+ compute the amount of input.
301+
302+ Note that the linear regression fit is expected to have been performed
303+ on the log10-transform of the input amounts. Units of the returned value
304+ match those of the non-transformed input amount data.
305+
306+ Parameters
307+ ----------
308+ y: float
309+ Cycle count (Cp, aka Ct value).
310+ fit: scipy LinregressResult object or list of two floats
311+ Linear fit to use. Accepts either the output of a call
312+ to scipy.stats.linregress or a list of the fit values
313+ [slope, intercept].
314+
315+ Returns
316+ -------
317+ A float of the calculated amount.
318+ """
319+ if type (fit ) is scipy .stats ._stats_py .LinregressResult :
320+ return 10 ** ((float (y )- fit .intercept )/ fit .slope )
321+ if len (fit ) < 2 :
322+ raise InputError ("'fit' is expected to be a list containing [slope, intercept]. Alternatively, pass a scipy LinregressResult object." )
323+ return 10 ** ((float (y )- fit [1 ])/ fit [0 ])
324+
325+ # TODO: add 'type' arg for dsDNA, ssDNA, ssRNA
326+ def convert_moles_to_mass (
327+ moles : Union [float , List [float ]],
328+ length : Union [float , List [float ]]
329+ ) -> Union [float , List [float ]]:
330+ """
331+ For a given amount of DNA in moles, use its length
332+ to calculate its mass.
333+
334+ Formula from NEB:
335+ g = mol x (bp x 615.94 g/mol/bp + 36.04 g/mol)
336+ - mass of dsDNA (g) = moles dsDNA x (molecular weight of dsDNA (g/mol))
337+ - molecular weight of dsDNA = (number of base pairs of dsDNA x average molecular weight of a base pair) + 36.04 g/mol
338+ - average molecular weight of a base pair = 615.94 g/mol, excluding the water molecule removed during polymerization
339+ and assuming deprotonated phosphate hydroxyls
340+ - the additional 36.04 g/mol accounts for the 2 -OH and 2 -H added back to the ends
341+ - bases are assumed to be unmodified
342+
343+ Parameters
344+ ----------
345+ moles: float or list of float
346+ Amount of dsDNA in moles.
347+ length: float or list of float
348+ Number of base pairs of the dsDNA (or average length of a heterogeneous sample).
349+
350+ Returns
351+ -------
352+ A float or list of floats of the calculated mass in grams.
353+ """
354+ return np .array (moles ) * (np .array (length ) * 615.96 + 36.04 )
355+
356+
357+ # TODO: add 'type' arg for dsDNA, ssDNA, ssRNA
358+ def convert_mass_to_moles (
359+ mass : Union [float , List [float ]],
360+ length : Union [float , List [float ]]
361+ ) -> Union [float , List [float ]]:
362+ """
363+ For a given amount of DNA in moles, use its length
364+ to calculate its mass.
365+
366+ Formula from NEB:
367+ mol = g / (bp x 615.94 g/mol/bp + 36.04 g/mol)
368+ - moles dsDNA = mass of dsDNA (g) / (molecular weight of dsDNA (g/mol))
369+ - molecular weight of dsDNA = (number of base pairs of dsDNA x average molecular weight of a base pair) + 36.04 g/mol
370+ - average molecular weight of a base pair = 615.94 g/mol, excluding the water molecule removed during polymerization
371+ and assuming deprotonated phosphate hydroxyls
372+ - the additional 36.04 g/mol accounts for the 2 -OH and 2 -H added back to the ends
373+ - bases are assumed to be unmodified
374+
375+ Parameters
376+ ----------
377+ mass: float or list of float
378+ Mass of dsDNA in grams.
379+ length: float or list of float
380+ Number of base pairs of the dsDNA (or average length of a heterogeneous sample).
381+
382+ Returns
383+ -------
384+ A float or list of floats of the calculated amount in moles.
385+ """
386+ return np .array (mass ) / (np .array (length ) * 615.96 + 36.04 )
0 commit comments