Skip to content

Commit accd29c

Browse files
authored
Merge pull request #42 from AllenCell/feature/bootstrap
Feature/bootstrap
2 parents 403839b + 148c486 commit accd29c

2 files changed

Lines changed: 51 additions & 18 deletions

File tree

EMT_data_analysis/analysis_scripts/Analysis_tools.py

Lines changed: 42 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,37 @@
2020

2121
warnings.filterwarnings("ignore")
2222

23+
rng = np.random.default_rng(42)
24+
25+
def bootstrap_corr(x, y, method="Pearson", confidence_interval=0.95, n_bootstraps=2000, seed=42, verbose=True):
26+
x = np.asarray(x)
27+
y = np.asarray(y)
28+
29+
corr_func = pearsonr
30+
if method == "Spearman":
31+
corr_func = spearmanr
32+
33+
r_observed, p_value = corr_func(x, y)
34+
n_samples = len(x)
35+
36+
inds = np.arange(n_samples)
37+
boot_corrs = []
38+
for _ in range(n_bootstraps):
39+
resampled_inds = rng.choice(inds, size=n_samples, replace=True)
40+
rx = x[resampled_inds]
41+
ry = y[resampled_inds]
42+
boot_r, _ = corr_func(rx, ry)
43+
boot_corrs.append(boot_r)
44+
alpha = (1.0-confidence_interval)/2.0
45+
ci_low = np.nanpercentile(boot_corrs, 100*alpha)
46+
ci_high = np.nanpercentile(boot_corrs, 100*(1.0-alpha))
47+
48+
if verbose:
49+
print(f'Number of samples: {n_samples}')
50+
print(f'{method} Correlation: {r_observed:.3g} | 95% CI: [{ci_low:.2f}, {ci_high:.2f}] | p-value: {p_value:.3g}')
51+
52+
return r_observed, p_value, ci_low, ci_high
53+
2354
def run_all_analyses():
2455
"""
2556
Run all analysis functions
@@ -29,6 +60,8 @@ def run_all_analyses():
2960

3061
df = io.load_image_analysis_extracted_features()
3162

63+
print("Loaded image analysis extracted features and started running analyses...")
64+
3265
plot_area_at_glass_all_data(df, FIGS_DIR, OUT_TYPE)
3366
plot_area_at_glass_h2b(df, FIGS_DIR, OUT_TYPE)
3467
plot_migration_timing_all_data(df, FIGS_DIR, OUT_TYPE)
@@ -611,10 +644,8 @@ def plot_gene_expression_experiments(df, figs_dir, out_type):
611644
migration = df_g['Migration Onset Time (Footprint Area Based)'][[cond in val for val in df_g['Experimental Condition'].values]].dropna()
612645
metric = df_g['gene_metric'][[cond in val for val in df_g['Experimental Condition'].values]].dropna()
613646

614-
pearson, p_pvalue = pearsonr(migration, metric)
615-
spearman, s_pvalue = spearmanr(migration, metric)
616-
print(f'Pearson Correlation: {pearson:.3g} | p-value: {p_pvalue:.3g}')
617-
print(f'Spearman Correlation: {spearman:.3g} | p-value: {s_pvalue:.3g}')
647+
pearson, p_pvalue, _, _ = bootstrap_corr(migration, metric, "Pearson")
648+
spearman, s_pvalue, _, _ = bootstrap_corr(migration, metric, "Spearman")
618649

619650
X = df_g['Migration Onset Time (Footprint Area Based)'][[cond in val for val in df_g['Experimental Condition'].values]].dropna()
620651
Y = df_g['gene_metric'][[cond in val for val in df_g['Experimental Condition'].values]].dropna()
@@ -638,11 +669,9 @@ def plot_gene_expression_experiments(df, figs_dir, out_type):
638669
migration = df_g['Migration Onset Time (Footprint Area Based)']
639670
metric = df_g['gene_metric']
640671

641-
pearson, p_pvalue = pearsonr(migration, metric)
642-
spearman, s_pvalue = spearmanr(migration, metric)
643-
print(f'Pearson Correlation: {pearson:.3g} | p-value: {p_pvalue:.3g}')
644-
print(f'Spearman Correlation: {spearman:.3g} | p-value: {s_pvalue:.3g}')
645-
672+
pearson, p_pvalue, _, _ = bootstrap_corr(migration, metric, "Pearson")
673+
spearman, s_pvalue, _, _ = bootstrap_corr(migration, metric, "Spearman")
674+
646675
X = df_g['Migration Onset Time (Footprint Area Based)']
647676
Y = df_g['gene_metric']
648677

@@ -719,6 +748,8 @@ def plot_collagenase_analysis(df, figs_dir, out_type):
719748
X = df_gene['Collagenease concentration (ug/mL)']
720749
Y = df_gene['Migration Onset Time (Footprint Area Based)']
721750

751+
_ = bootstrap_corr(X, Y, "Spearman")
752+
722753
# It's important to add a constant (intercept) to the model
723754
X = sm.add_constant(X)
724755

@@ -747,7 +778,6 @@ def plot_collagenase_analysis(df, figs_dir, out_type):
747778
else:
748779
print("\nConclusion: The p-value for the slope is not less than 0.05, so we cannot conclude there is a significant linear relationship.")
749780

750-
751781
def plot_mmp_inhibitor_migration(df, figs_dir, out_type):
752782
"""
753783
Parameters
@@ -1015,8 +1045,8 @@ def plot_inside_outside_migration_timing(df, figs_dir, out_type):
10151045
X = dfio_scatter['Migration Onset Time (Footprint Area Based)']
10161046
Y = dfio_scatter['Migration Onset Time (Inside/Outside Basement Membrane Based)']
10171047

1018-
p_results = pearsonr(X.values, Y.values)
1019-
r_results = spearmanr(X.values, Y.values)
1048+
p_results = bootstrap_corr(X.values, Y.values, "Pearson")
1049+
r_results = bootstrap_corr(X.values, Y.values, "Spearman")
10201050
print('n: {0:d}'.format(n_movies_io))
10211051
print('Pearson Correlation: {0:.3g} | p-Value: {1:.3g}'.format(p_results.statistic, p_results.pvalue))
10221052
print('Spearman Correlation: {0:.3g} | p-Value: {1:.3g}'.format(r_results.statistic, r_results.pvalue))

EMT_data_analysis/tools/io.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,12 +12,15 @@ def load_imaging_and_segmentation_dataset():
1212
return df
1313

1414
def load_image_analysis_extracted_features(load_from_aws: bool = True):
15-
path = "https://allencell.s3.amazonaws.com/aics/emt_timelapse_dataset/manifests/Image_analysis_extracted_features.csv?versionId=ehxRXxC0FpidcpgXU_z.51T.nkWB0Yuj"
16-
if not load_from_aws:
17-
# Or read from local if the user decides to run Metric_computation.py
18-
metric_comp_results_dir = get_results_directory_name() / "metric_computation"
19-
path = metric_comp_results_dir / "Image_analysis_extracted_features.csv"
20-
df = pd.read_csv(path)
15+
metric_comp_results_dir = get_results_directory_name() / "metric_computation"
16+
path = metric_comp_results_dir / "Image_analysis_extracted_features.csv"
17+
try:
18+
print('Trying to load features from local path.')
19+
df = pd.read_csv(path)
20+
except Exception:
21+
print(f'Features not found at {path}. Loading from AWS instead. This may take a while...')
22+
path = "https://allencell.s3.amazonaws.com/aics/emt_timelapse_dataset/manifests/Image_analysis_extracted_features.csv?versionId=ehxRXxC0FpidcpgXU_z.51T.nkWB0Yuj"
23+
df = pd.read_csv(path)
2124
return df
2225

2326
def load_inside_outside_classification(load_from_aws: bool = True):

0 commit comments

Comments
 (0)