Merge pull request #42 from AllenCell/feature/bootstrap

smishra3 · web-flow · commit accd29c60a94 · 2026-01-28T11:39:56.000-08:00
Feature/bootstrap
diff --git a/EMT_data_analysis/analysis_scripts/Analysis_tools.py b/EMT_data_analysis/analysis_scripts/Analysis_tools.py
@@ -20,6 +20,37 @@
 
 warnings.filterwarnings("ignore")
 
+rng = np.random.default_rng(42) 
+
+def bootstrap_corr(x, y, method="Pearson", confidence_interval=0.95, n_bootstraps=2000, seed=42, verbose=True):
+    x = np.asarray(x)
+    y = np.asarray(y)
+
+    corr_func = pearsonr
+    if method == "Spearman":
+        corr_func = spearmanr
+
+    r_observed, p_value = corr_func(x, y)
+    n_samples = len(x)
+
+    inds = np.arange(n_samples)
+    boot_corrs = []
+    for _ in range(n_bootstraps):
+        resampled_inds = rng.choice(inds, size=n_samples, replace=True)
+        rx = x[resampled_inds]
+        ry = y[resampled_inds]
+        boot_r, _ = corr_func(rx, ry)
+        boot_corrs.append(boot_r)
+    alpha = (1.0-confidence_interval)/2.0
+    ci_low = np.nanpercentile(boot_corrs, 100*alpha)
+    ci_high = np.nanpercentile(boot_corrs, 100*(1.0-alpha))
+
+    if verbose:
+        print(f'Number of samples: {n_samples}')
+        print(f'{method} Correlation: {r_observed:.3g} | 95% CI: [{ci_low:.2f}, {ci_high:.2f}] | p-value: {p_value:.3g}')
+
+    return r_observed, p_value, ci_low, ci_high
+
 def run_all_analyses():
     """
     Run all analysis functions
@@ -29,6 +60,8 @@ def run_all_analyses():
 
     df = io.load_image_analysis_extracted_features()
     
+    print("Loaded image analysis extracted features and started running analyses...")
+
     plot_area_at_glass_all_data(df, FIGS_DIR, OUT_TYPE)
     plot_area_at_glass_h2b(df, FIGS_DIR, OUT_TYPE)
     plot_migration_timing_all_data(df, FIGS_DIR, OUT_TYPE)
@@ -611,10 +644,8 @@ def plot_gene_expression_experiments(df, figs_dir, out_type):
             migration = df_g['Migration Onset Time (Footprint Area Based)'][[cond in val for val in df_g['Experimental Condition'].values]].dropna()
             metric = df_g['gene_metric'][[cond in val for val in df_g['Experimental Condition'].values]].dropna()
         
-            pearson, p_pvalue = pearsonr(migration, metric)
-            spearman, s_pvalue = spearmanr(migration, metric)
-            print(f'Pearson Correlation: {pearson:.3g} | p-value: {p_pvalue:.3g}')
-            print(f'Spearman Correlation: {spearman:.3g} | p-value: {s_pvalue:.3g}')
+            pearson, p_pvalue, _, _ = bootstrap_corr(migration, metric, "Pearson")
+            spearman, s_pvalue, _, _ = bootstrap_corr(migration, metric, "Spearman")
 
             X = df_g['Migration Onset Time (Footprint Area Based)'][[cond in val for val in df_g['Experimental Condition'].values]].dropna()
             Y = df_g['gene_metric'][[cond in val for val in df_g['Experimental Condition'].values]].dropna()
@@ -638,11 +669,9 @@ def plot_gene_expression_experiments(df, figs_dir, out_type):
         migration = df_g['Migration Onset Time (Footprint Area Based)']
         metric = df_g['gene_metric']
 
-        pearson, p_pvalue = pearsonr(migration, metric)
-        spearman, s_pvalue = spearmanr(migration, metric)
-        print(f'Pearson Correlation: {pearson:.3g} | p-value: {p_pvalue:.3g}')
-        print(f'Spearman Correlation: {spearman:.3g} | p-value: {s_pvalue:.3g}')
-        
+        pearson, p_pvalue, _, _ = bootstrap_corr(migration, metric, "Pearson")
+        spearman, s_pvalue, _, _ = bootstrap_corr(migration, metric, "Spearman")
+
         X = df_g['Migration Onset Time (Footprint Area Based)']
         Y = df_g['gene_metric']
 
@@ -719,6 +748,8 @@ def plot_collagenase_analysis(df, figs_dir, out_type):
         X = df_gene['Collagenease concentration (ug/mL)']
         Y = df_gene['Migration Onset Time (Footprint Area Based)']
 
+        _ = bootstrap_corr(X, Y, "Spearman")
+
         # It's important to add a constant (intercept) to the model
         X = sm.add_constant(X)
 
@@ -747,7 +778,6 @@ def plot_collagenase_analysis(df, figs_dir, out_type):
         else:
             print("\nConclusion: The p-value for the slope is not less than 0.05, so we cannot conclude there is a significant linear relationship.")
 
-
 def plot_mmp_inhibitor_migration(df, figs_dir, out_type):
     """
     Parameters
@@ -1015,8 +1045,8 @@ def plot_inside_outside_migration_timing(df, figs_dir, out_type):
     X = dfio_scatter['Migration Onset Time (Footprint Area Based)']
     Y = dfio_scatter['Migration Onset Time (Inside/Outside Basement Membrane Based)']
 
-    p_results = pearsonr(X.values, Y.values)
-    r_results = spearmanr(X.values, Y.values)
+    p_results = bootstrap_corr(X.values, Y.values, "Pearson")
+    r_results = bootstrap_corr(X.values, Y.values, "Spearman")
     print('n: {0:d}'.format(n_movies_io))
     print('Pearson Correlation: {0:.3g} | p-Value: {1:.3g}'.format(p_results.statistic, p_results.pvalue))
     print('Spearman Correlation: {0:.3g} | p-Value: {1:.3g}'.format(r_results.statistic, r_results.pvalue))
diff --git a/EMT_data_analysis/tools/io.py b/EMT_data_analysis/tools/io.py
@@ -12,12 +12,15 @@ def load_imaging_and_segmentation_dataset():
     return df
 
 def load_image_analysis_extracted_features(load_from_aws: bool = True):
-    path = "https://allencell.s3.amazonaws.com/aics/emt_timelapse_dataset/manifests/Image_analysis_extracted_features.csv?versionId=ehxRXxC0FpidcpgXU_z.51T.nkWB0Yuj"
-    if not load_from_aws:
-        # Or read from local if the user decides to run Metric_computation.py
-        metric_comp_results_dir = get_results_directory_name() / "metric_computation"
-        path = metric_comp_results_dir / "Image_analysis_extracted_features.csv"
-    df = pd.read_csv(path)
+    metric_comp_results_dir = get_results_directory_name() / "metric_computation"
+    path = metric_comp_results_dir / "Image_analysis_extracted_features.csv"
+    try:
+        print('Trying to load features from local path.')
+        df = pd.read_csv(path)
+    except Exception:
+        print(f'Features not found at {path}. Loading from AWS instead. This may take a while...')
+        path = "https://allencell.s3.amazonaws.com/aics/emt_timelapse_dataset/manifests/Image_analysis_extracted_features.csv?versionId=ehxRXxC0FpidcpgXU_z.51T.nkWB0Yuj"
+        df = pd.read_csv(path)
     return df
 
 def load_inside_outside_classification(load_from_aws: bool = True):