-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsim_framework.py
More file actions
337 lines (251 loc) · 14.5 KB
/
sim_framework.py
File metadata and controls
337 lines (251 loc) · 14.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
import numpy as np
from scipy.stats import multivariate_normal
import statsmodels.api as sm
from scipy import stats
from scipy.stats import pearsonr
def simulate_phenotypes_and_biomarkers_with_correlations(N, n, m, k, n_B, n_P, n_BP, p, sigma_B, sigma_P, sigma_BP, sigma_epsilon_B, sigma_epsilon_P):
"""
Simulate phenotypes and biomarkers with specified levels of heritability and genetic correlation,
and calculate genetic correlations for each biomarker, each phenotype, and each pair of biomarker/phenotype.
Parameters:
N (int): Number of samples
n (int): Number of genetic variants.
m (int): Number of biomarkers.
k (int): Number of phenotypes.
n_B (int): Number of causal variants for biomarkers.
n_P (int): Number of causal variants for phenotypes.
n_BP (int): Number of causal variants shared between biomarkers and phenotypes.
p (float): Minor Allele Frequence.
sigma_B (float): Average genetic variance of causal variants for biomarkers.
sigma_P (float): Average genetic variance of causal variants for phenotypes.
sigma_BP (float): Average covariance between the effect sizes of biomarkers and phenotypes due to shared causal variants.
sigma_epsilon_B (float): Environmental variance for biomarkers.
sigma_epsilon_P (float): Environmental variance for phenotypes.
Returns:
B (numpy.ndarray): Simulated biomarkers (N x m matrix).
P (numpy.ndarray): Simulated phenotypes (N x k matrix).
h2_B (float): Heritability of biomarkers.
h2_P (float): Heritability of phenotypes.
rho_B (numpy.ndarray): Genetic correlation matrix for biomarkers (m x m matrix).
rho_P (numpy.ndarray): Genetic correlation matrix for phenotypes (k x k matrix).
rho_BP (numpy.ndarray): Genetic correlation matrix between biomarkers and phenotypes (m x k matrix).
G (numpy.ndarray): Genetic variant matrix (N x n matrix)
causal_indices_B (numpy.ndarray): Indices of causal variants for biomarkers
causal_indices_P (numpy.ndarray): Indices of causal variants for phenotypes
E_B (numpy.ndarray): Biomarker effect sizes
E_P (numpy.ndarray): Phenotype effect sizes
env_B (numpy.ndarray): Simulated environmental effect on biomarkers
env_P (numpy.ndarray): Simulated environmental effect on phenotypes
"""
# Generate genetic variants matrix for N samples and n variants
G = np.random.binomial(2, p, (N, n))
# Initialize phenotype and biomarker matrices
B = np.zeros((N, m)) if m > 0 else None
P = np.zeros((N, k)) if k > 0 else None
# Initialize genetic effects matrices for biomarkers and phenotypes
E_B = np.zeros((m, n)) if m > 0 else None
E_P = np.zeros((k, n)) if k > 0 else None
# Select causal indices for shared biomarkers and phenotypes
causal_indices_shared = np.random.choice(n, n_BP, replace=False)
# Select additional unique causal indices for biomarkers
causal_indices_B_unique = np.random.choice(np.setdiff1d(np.arange(n), causal_indices_shared), n_B - n_BP, replace=False)
# Select additional unique causal indices for phenotypes
causal_indices_P_unique = np.random.choice(np.setdiff1d(np.arange(n),
np.concatenate((causal_indices_shared,
causal_indices_B_unique))),
n_P - n_BP, replace=False)
# Combine the shared and unique causal indices for biomarkers and phenotypes
causal_indices_B = np.concatenate((causal_indices_shared, causal_indices_B_unique))
causal_indices_P = np.concatenate((causal_indices_shared, causal_indices_P_unique))
env_B = np.random.normal(0, np.sqrt(sigma_epsilon_B), (N, m))
env_P = np.random.normal(0, np.sqrt(sigma_epsilon_P), (N, k))
# Generate causal effects for biomarkers
if m > 0:
for i in causal_indices_B:
E_B[:, i] = multivariate_normal.rvs(mean=np.zeros(m), cov=sigma_B * np.eye(m))
B = np.dot(G[:, causal_indices_B], E_B[:, causal_indices_B].T) + env_B
# Generate causal effects for phenotypes
if k > 0:
for j in causal_indices_P:
E_P[:, j] = multivariate_normal.rvs(mean=np.zeros(k), cov=sigma_P * np.eye(k))
P = np.dot(G[:, causal_indices_P], E_P[:, causal_indices_P].T) + env_P
# Calculate heritability for biomarkers and phenotypes
h2_B = np.var(np.dot(G[:, causal_indices_B], E_B[:, causal_indices_B].T), axis=0) / np.var(B, axis=0) if m > 0 else None
h2_P = np.var(np.dot(G[:, causal_indices_P], E_P[:, causal_indices_P].T), axis=0) / np.var(P, axis=0) if k > 0 else None
# Calculate genetic correlation matrices
if m > 0:
rho_B = np.corrcoef(E_B)
else:
rho_B = None
if k > 0:
rho_P = np.corrcoef(E_P)
else:
rho_P = None
# Genetic correlation matrix between biomarkers and phenotypes
if m > 0 and k > 0:
shared_indices = np.intersect1d(causal_indices_B, causal_indices_P)
rho_BP = np.zeros((m, k))
if shared_indices.size > 0:
EB_shared = E_B[:, shared_indices]
EP_shared = E_P[:, shared_indices]
# Compute the correlations between the effects of biomarkers and phenotypes for shared causal variants
for i in range(m): # Loop through each biomarker
for j in range(k): # Loop through each phenotype
# Get the effects of the ith biomarker and jth phenotype for the shared indices
effects_B = EB_shared[i, :]
effects_P = EP_shared[j, :]
# Calculate the correlation between the effects
correlation = np.corrcoef(effects_B, effects_P)[0, 1]
rho_BP[i, j] = correlation
else:
rho_BP = np.empty((m, k)).fill(np.nan)
else:
rho_BP = np.empty((m, k)).fill(np.nan)
return B, P, h2_B, h2_P, rho_B, rho_P, rho_BP, G, causal_indices_B, causal_indices_P, E_B, E_P, env_B, env_P
def simulate_downstream_phenotypes(biomarkers, correlations, min_env_variance):
"""
Simulate downstream phenotypes from biomarkers with specified degree of environmental influence
and assuming that the standard deviation of the phenotypes is 1.
:param biomarkers: numpy array (N x m), where m = number of biomarkers, N = number of samples
:param correlations: numpy array (k x m), where k is the number of downstream phenotypes,
containing the desired correlations between biomarkers and phenotypes
:param min_env_variance: numpy array (k,), containing the minimum environmental variance for each phenotype
:return: numpy array (N x k), simulated downstream phenotypes
"""
N = biomarkers.shape[0]
k, m = correlations.shape
std_biomarkers = np.std(biomarkers, axis=0)
# Construct the A matrix to reflect genetic influence
A = correlations / std_biomarkers
# Calculate genetic contribution to variance
genetic_variance = np.sum((A ** 2) * (std_biomarkers ** 2), axis=1)
# Calculate the minimum environmental variance needed to reach a phenotype variance of 1
base_env_variance = 1 - genetic_variance
# Ensure that environmental variance is at least the provided minimum
corrected_env_variance = np.maximum(base_env_variance, min_env_variance)
# Simulate environmental effects with the proper variance
epsilon = np.random.multivariate_normal(np.zeros(k), np.diag(corrected_env_variance), N)
# Generate downstream phenotypes
P_downstream = biomarkers @ A.T + epsilon
# Normalize the final phenotypes to have standard deviation of 1 (optional)
P_downstream = (P_downstream - np.mean(P_downstream, axis=0)) / np.std(P_downstream, axis=0)
return P_downstream
def run_gwas_return_metrics(phenotype, Genotype_matrix, causal_indices, effect_size=None):
"""
Perform per variant linear regression based association between Phenotype and Genotype matrix.
Then generate standard metrics including the correlation between true effect size and estimated effect size.
Parameters:
phenotype: numpy array (N x 1), N = number of samples.
Genotype_matrix: numpy array (N x m), m = number of SNPs.
causal_indices: numpy array (m x 1). Each element is the index of the causal variant.
effect_size: numpy array (m x 1), where each element is the true effect size for each variant.
Returns:
P_values: numpy array (m x 1), p-values of each SNP
Beta_estimates: numpy array (m x 1), estimated effect size of each SNP
TPR: float, True Positive Rate after correction
FPR: float, False Positive Rate after correction
FDR: float, False Discovery Rate
Effect_correlation: float, correlation between true effect size and GWAS estimated effect size
"""
n, m = Genotype_matrix.shape
P_values = np.zeros(m)
Beta_estimates = np.zeros(m)
# Perform the per variant linear regression
for i in range(m):
# Prepare the design matrix for linear regression
X = sm.add_constant(Genotype_matrix[:, i])
# Perform linear regression
model = sm.OLS(phenotype, X).fit()
# Store the coefficient (effect size) and p-value for the SNP
Beta_estimates[i] = model.params[1]
P_values[i] = model.pvalues[1]
# Apply Bonferroni correction
bonferroni_alpha = 0.05 / m
significant_indices = P_values < bonferroni_alpha
# Calculate TPR and FPR
true_positives = significant_indices[causal_indices].sum()
TPR = true_positives / len(causal_indices)
true_negatives = (~significant_indices[~np.isin(range(m), causal_indices)]).sum()
false_positives = significant_indices[~np.isin(range(m), causal_indices)].sum()
FPR = false_positives / (false_positives + true_negatives)
# Calculate FDR
predicted_positives = significant_indices.sum()
FDR = false_positives / predicted_positives if predicted_positives > 0 else 0
# Calculate the correlation between estimated effect sizes and true effect sizes if effect_sizes are provided
if effect_size is not None:
Effect_correlation, _ = pearsonr(Beta_estimates, effect_size)
else:
Effect_correlation = None
return P_values, Beta_estimates, TPR, FPR, Effect_correlation, FDR
def fit_regression(phenotype, biomarkers):
"""
Fit a linear regression model: phenotype ~ biomarkers
Parameters:
phenotype: numpy array (N x 1), N = number of samples.
biomarkers: numpy array (N x m), m = number of biomarkers.
Returns:
pheno_pred: numpy array (N x 1), predicted value of phenotype using biomarkers
"""
# Add a constant (intercept) to the biomarkers matrix
biomarkers_with_intercept = sm.add_constant(biomarkers)
# Fit the Ordinary Least Squares (OLS) model
model = sm.OLS(phenotype, biomarkers_with_intercept)
results = model.fit()
# Make predictions using the biomarkers
pheno_pred = results.predict(biomarkers_with_intercept)
return pheno_pred
def simulate_downstream_phenotypes_with_direct_genetic_effects(N, G, biomarkers, correlations,
env_variance, n_pd, sigma_pd,
causal_indices_B):
"""
Simulate downstream phenotypes from biomarkers and direct genetic effects with specified
degree of environmental influence.
Parameters:
N (int): Number of samples
G (numpy.ndarray): Genetic variant matrix (N x n matrix)
biomarkers (numpy.ndarray): Simulated biomarkers (N x m matrix)
correlations (numpy.ndarray): Desired correlations between biomarkers and phenotypes (k x m matrix)
min_env_variance (numpy.ndarray): (k,) dim array containing the minimum environmental variance for each phenotype
n_pd (int): Number of causal variants directly affecting downstream phenotypes
sigma_pd (float): Variance of the direct genetic effect sizes for downstream phenotypes
causal_indices_B (numpy.ndarray): Indices of causal variants for biomarkers
Returns:
P_downstream (numpy.ndarray): Simulated downstream phenotypes (N x k matrix)
direct_h2 (numpy.ndarray): Direct heritability of downstream phenotypes (k,)
indirect_h2 (numpy.ndarray): Indirect heritability of downstream phenotypes (k,)
causal_indices_pd (numpy.ndarray): Indices of causal variants directly affecting downstream phenotypes
"""
N, n = G.shape
k, m = correlations.shape
# Calculate the standard deviations of the biomarkers
std_biomarkers = np.std(biomarkers, axis=0)
# Construct the A matrix to reflect genetic influence
A = correlations / std_biomarkers
# Calculate genetic contribution to variance through biomarkers
genetic_variance = np.sum((A ** 2) * (std_biomarkers ** 2), axis=1)
# Calculate environmental variance needed to reach a phenotype variance of 1, given indirect effects
base_env_variance = 1 - genetic_variance
# Ensure that environmental variance is at least the given minimum
corrected_env_variance = np.maximum(base_env_variance, env_variance)
# Select causal variants for direct genetic effects on downstream phenotypes
causal_indices_pd = np.random.choice(np.setdiff1d(np.arange(n), causal_indices_B), n_pd, replace=False)
# Generate direct genetic effects for downstream phenotypes
E_pd = np.zeros((n, k))
for i in causal_indices_pd:
E_pd[i] = multivariate_normal.rvs(mean=np.zeros(k), cov=sigma_pd * np.eye(k))
# Calculate direct genetic effects
direct_genetic_effects = G[:, causal_indices_pd].dot(E_pd[causal_indices_pd])
# Calculate indirect genetic effects through biomarkers
indirect_genetic_effects = biomarkers.dot(A.T)
# Simulate environmental effects for each sample with corrected variance
epsilon = np.random.multivariate_normal(np.zeros(k), np.diag(corrected_env_variance), N)
# Calculate the downstream phenotypes incorporating direct and indirect genetic effects and environmental effects
P_downstream = indirect_genetic_effects + direct_genetic_effects + epsilon
# Calculate variances for heritability estimates
var_direct_genetic = np.var(direct_genetic_effects, axis=0)
var_indirect_genetic = np.var(indirect_genetic_effects, axis=0)
var_total = np.var(P_downstream, axis=0)
# Calculate heritabilities
direct_h2 = var_direct_genetic / var_total
indirect_h2 = var_indirect_genetic / var_total
return P_downstream, direct_h2, indirect_h2, causal_indices_pd