beginning to add analysis functions, will remove for next release

kaseylove · kaseylove · commit 0ece7000060a · 2026-01-27T09:36:53.000-05:00
diff --git a/src/rushd/ddpcr.py b/src/rushd/ddpcr.py
@@ -29,7 +29,6 @@
 class YamlError(RuntimeError):
     """Error raised when there is an issue with the provided .yaml file."""
 
-
 class DataPathError(RuntimeError):
     """Error raised when the path to the data is not specified correctly."""
 
@@ -90,7 +89,7 @@ def load_ddpcr_metadata(unzipped_path: Path) -> Dict[Any, Any]:
 
 def load_ddpcr(
     data_path: Union[str, Path],
-    yaml_path: Union[str, Path],
+    yaml_path: Optional[Union[str, Path]] = None,
     *,
     extract_metadata: Optional[bool] = True,
 ) -> pd.DataFrame:
@@ -106,11 +105,11 @@ def load_ddpcr(
     Parameters
     ----------
     data_path: str or Path
-        Path to .ddpcr file
-    yaml_path: str or Path
+        Path to .ddpcr file.
+    yaml_path: str or Path (optional)
         Path to .yaml file to use for associating metadata with well IDs.
         All metadata must be contained under the header 'metadata'.
-    extract_metadata: Optional bool, default True
+    extract_metadata: bool, default True
         Whether to extract metadata from the .ddpcr file. If True,
         adds a subset of the metadata associated with each well in the 
         BioRad software, namely sample names (numbered 'Sample description' fields,
@@ -176,4 +175,54 @@ def load_ddpcr(
     # Delete unzipped files
     shutil.rmtree(tmp_data_path)
     
-    return data
+    return data
+
+
+def calculate_copy_number(
+    df: pd.DataFrame,
+    exp_channel: str,
+    ref_channel: str,
+    gates: Dict[str, float],
+    *,
+    ref_copy_num: float = 2.0,
+) -> pd.DataFrame:
+    """
+    Calculate copy number of an experimental target relative to a reference target.
+
+    Droplets below a channel's gate count as negative; copies per droplet are
+    -ln(num_negative / num_total) and copy_num = copies_exp / copies_ref * ref_copy_num.
+
+    Parameters
+    ----------
+    df: pandas DataFrame
+        Data on which to calculate, one column per channel. Modified in place.
+    exp_channel: str
+        Column in df containing measurements for the experimental target.
+    ref_channel: str
+        Column in df containing measurements for the reference target.
+    gates: dict of (str: float) pairs
+        Threshold for positive droplets, keyed by channel name.
+    ref_copy_num: float, default 2.0
+        Known copy number of the reference gene (2.0 corresponds to diploid).
+
+    Returns
+    -------
+    The original DataFrame with a new column 'copy_num' holding the computed value.
+
+    Raises
+    ------
+    KeyError if a channel is missing from df or gates. ValueError if a channel has
+    no negative droplets (including empty data) or the reference has no positives.
+    """
+    copies = {}
+    for channel in (exp_channel, ref_channel):
+        if channel not in df.columns or channel not in gates:
+            raise KeyError(f"Channel '{channel}' must be both a column of df and a key of gates")
+        num_negative = (df[channel] < gates[channel]).sum()
+        if num_negative == 0:  # log(0) (or 0/0 for empty data) would give inf/nan
+            raise ValueError(f"Channel '{channel}' has no negative droplets")
+        copies[channel] = -np.log(num_negative / len(df))
+    if copies[ref_channel] == 0:  # all reference droplets negative: division by zero
+        raise ValueError(f"Reference channel '{ref_channel}' has no positive droplets")
+    df['copy_num'] = copies[exp_channel] / copies[ref_channel] * ref_copy_num
+    return df
diff --git a/src/rushd/qpcr.py b/src/rushd/qpcr.py
@@ -15,10 +15,11 @@
 except ImportError:
     from typing_extensions import Literal
 
-import matplotlib.pyplot as plt
+import matplotlib
 import numpy as np
 import pandas as pd
 import yaml
+import scipy.stats
 from scipy.optimize import curve_fit
 
 from . import flow
@@ -28,7 +29,7 @@ class YamlError(RuntimeError):
     """Error raised when there is an issue with the provided .yaml file."""
 
 class ColumnError(RuntimeError):
-    """Error raised when the data is missing a column specifying well IDs."""
+    """Error raised when the data is missing a required column."""
 
 class DataPathError(RuntimeError):
     """Error raised when the path to the data is not specified correctly."""
@@ -39,6 +40,9 @@ class GroupsError(RuntimeError):
 class RegexError(RuntimeError):
     """Error raised when there is an issue with the file name regular expression."""
 
+class InputError(RuntimeError):
+    """Error raised when an argument is not in the expected form (e.g. a 'fit' list that is too short)."""
+
 
 def load_single_csv_with_metadata(
     data_path: Union[str, Path],
@@ -218,4 +222,165 @@ def load_plates_with_metadata(
 
     # Concatenate all the data into a single DataFrame
     data = pd.concat(group_list, ignore_index=True).replace([float('nan'),np.nan], pd.NA)
-    return data
+    return data
+
+
+def calculate_standard(
+    df: pd.DataFrame,
+    amt_col: str,
+    cp_col: str,
+    ax: "Optional[matplotlib.axes.Axes]" = None
+) -> "Tuple[scipy.stats._stats_py.LinregressResult, float]":
+    """
+    Calculate a standard curve for qPCR data.
+
+    For the given data, treats the values in 'amt_col' as
+    input amounts and values in 'cp_col' as the corresponding
+    cycle counts (Cp, aka Ct) from the qPCR output. Computes a linear
+    regression of Cp on log10(amount), and returns this fit as well
+    as the efficiency. Rows with amounts <= 0 are left out of the fit.
+
+    If axes are passed, plots the linear fit on the data, annotating
+    the R^2 value and efficiency.
+
+    Parameters
+    ----------
+    df: pandas DataFrame
+        Data to use to fit.
+    amt_col: str
+        Name of column containing input amounts.
+    cp_col: str
+        Name of the column containing Cp values.
+    ax: matplotlib Axes (optional)
+        Axes on which to plot the data and fit.
+
+    Returns
+    -------
+    A tuple of the fit (output of a call to scipy.stats.linregress)
+    and the calculated efficiency as a percentage (float).
+
+    Raises
+    ------
+    ColumnError if 'amt_col' or 'cp_col' is not a column of df.
+    """
+    # The annotations above are quoted so they are never evaluated at import time.
+    if amt_col not in df.columns:
+        raise ColumnError(f"Data is missing the 'amt_col' column {amt_col}")
+    if cp_col not in df.columns:
+        raise ColumnError(f"Data is missing the 'cp_col' column {cp_col}")
+
+    # Keep only positive amounts, since log10 is undefined otherwise
+    positive = df[df[amt_col] > 0]
+    log_amt = np.log10(positive[amt_col].astype(float))
+    fit = scipy.stats.linregress(log_amt, positive[cp_col].astype(float))
+    efficiency = (10**(-1/fit.slope) - 1)*100 # percentage
+    if ax is not None:
+        # Scatter uses every row of df; the fitted line spans only the fitted range
+        ax.scatter(df[amt_col], df[cp_col], label='data', ec='white', lw=0.75)
+        xs = np.logspace(log_amt.min(), log_amt.max(), 1000)
+        ax.plot(xs, fit.slope * np.log10(xs) + fit.intercept, color='crimson', label='linear\nregression')
+        ax.set_xscale('symlog', linthresh=positive[amt_col].min())
+        pad = 0.01
+        ax.legend(loc='upper right', bbox_to_anchor=(1-pad, 1-pad))
+        # R^2 is the squared correlation coefficient (not its absolute value)
+        ax.annotate(f'$R^2$: {fit.rvalue**2:0.3f}', (0+pad*2, 0.1), xycoords='axes fraction',
+                    ha='left', va='bottom', size='medium')
+        ax.annotate(f'Efficiency: {efficiency:0.1f}%', (0+pad*2, 0+pad*2), xycoords='axes fraction',
+                    ha='left', va='bottom', size='medium')
+
+    return fit, efficiency
+
+
+def calculate_input_amount(
+    y: float, # TODO: list of float
+    fit: "Union[scipy.stats._stats_py.LinregressResult, List[float]]",
+) -> float:
+    """
+    Compute the input amount for a cycle count (Cp, aka Ct value) from a linear fit.
+
+    The fit is expected to be Cp vs log10(input amount), so the result is
+    10**((y - intercept) / slope), in the units of the non-transformed amounts.
+
+    Parameters
+    ----------
+    y: float
+        Cycle count (Cp, aka Ct value).
+    fit: scipy LinregressResult object or list of two floats
+        Linear fit to use: any object with 'slope' and 'intercept' attributes
+        (e.g. the output of scipy.stats.linregress) or a list [slope, intercept].
+
+    Returns
+    -------
+    A float of the calculated amount. Raises InputError if 'fit' has neither form.
+    """
+    # Duck-type the fit rather than comparing against scipy's private result class
+    if hasattr(fit, 'slope') and hasattr(fit, 'intercept'):
+        return 10**((float(y)-fit.intercept)/fit.slope)
+    try:
+        slope, intercept = fit[0], fit[1]
+    except (TypeError, IndexError, KeyError):
+        raise InputError("'fit' is expected to be a list containing [slope, intercept]. Alternatively, pass a scipy LinregressResult object.") from None
+    return 10**((float(y)-intercept)/slope)
+
+# TODO: add 'type' arg for dsDNA, ssDNA, ssRNA
+def convert_moles_to_mass(
+    moles: Union[float, List[float]],
+    length: Union[float, List[float]]
+) -> Union[float, List[float]]:
+    """
+    Convert an amount of dsDNA in moles to its mass in grams.
+
+    Formula from NEB:
+    g = mol x (bp x 615.96 g/mol/bp + 36.04 g/mol)
+     - molecular weight of dsDNA = (number of base pairs x average molecular weight of a base pair) + 36.04 g/mol
+     - the base-pair weight excludes the water molecule removed during polymerization and assumes deprotonated phosphate hydroxyls
+     - the additional 36.04 g/mol accounts for the 2 -OH and 2 -H added back to the ends
+     - bases are assumed to be unmodified
+    NOTE(review): 615.94 g/mol/bp has also been quoted for the base-pair weight;
+    the code uses 615.96 -- confirm the constant against the NEB source.
+
+    Parameters
+    ----------
+    moles: float or list of float
+        Amount of dsDNA in moles.
+    length: float or list of float
+        Number of base pairs of the dsDNA (or average length of a heterogeneous sample).
+
+    Returns
+    -------
+    The calculated mass in grams: a numpy scalar for scalar inputs, an array for lists.
+    """
+    grams_per_mole = np.array(length) * 615.96 + 36.04
+    return np.array(moles) * grams_per_mole
+
+
+# TODO: add 'type' arg for dsDNA, ssDNA, ssRNA
+def convert_mass_to_moles(
+    mass: Union[float, List[float]],
+    length: Union[float, List[float]]
+) -> Union[float, List[float]]:
+    """
+    Convert a mass of dsDNA in grams to an amount in moles.
+
+    Formula from NEB:
+    mol = g / (bp x 615.96 g/mol/bp + 36.04 g/mol)
+     - molecular weight of dsDNA = (number of base pairs x average molecular weight of a base pair) + 36.04 g/mol
+     - the base-pair weight excludes the water molecule removed during polymerization and assumes deprotonated phosphate hydroxyls
+     - the additional 36.04 g/mol accounts for the 2 -OH and 2 -H added back to the ends
+     - bases are assumed to be unmodified
+    NOTE(review): 615.94 g/mol/bp has also been quoted for the base-pair weight;
+    the code uses 615.96 -- confirm the constant against the NEB source.
+
+    Parameters
+    ----------
+    mass: float or list of float
+        Mass of dsDNA in grams.
+    length: float or list of float
+        Number of base pairs of the dsDNA (or average length of a heterogeneous sample).
+
+    Returns
+    -------
+    The calculated amount in moles: a numpy scalar for scalar inputs, an array for lists.
+    """
+    grams_per_mole = np.array(length) * 615.96 + 36.04
+    return np.array(mass) / grams_per_mole
diff --git a/tests/test_ddpcr.py b/tests/test_ddpcr.py
@@ -1,4 +1,17 @@
+import os
+from pathlib import Path
 
+import pandas as pd
+import pytest
 
-# Test ddPCR
-# Test ddPCR bad file type
+from rushd import ddpcr
+
+# test_ddpcr
+# test_ddpcr_no_metadata
+# test_no_yaml
+# test_no_yaml_metadata
+
+# test_invalid_data_path
+# test_invalid_yaml
+
+# test_multiple_plates