-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimulation.py
More file actions
65 lines (54 loc) · 2.34 KB
/
simulation.py
File metadata and controls
65 lines (54 loc) · 2.34 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.preprocessing import StandardScaler
def generate_data(n_samples=5000, n_features=6, random_state=None):
    """
    Generates a synthetic dataset with a known causal structure for confounder analysis.

    Causal structure:
        * Z1 and Z2 are linear confounders: they influence both the treatment X
          and the outcome Y.
        * Z3 has a non-linear (quadratic) effect on Y only.
        * Every remaining Z column (Z4, Z5, ...) is pure noise.

    Args:
        n_samples (int): The number of samples to generate.
        n_features (int): The number of Z variables (potential confounders).
            Must be at least 3, because Z1, Z2 and Z3 are used by name.
        random_state (int | None): Optional seed for the data draws. When None
            (the default) values are drawn from NumPy's global random state,
            so callers relying on ``np.random.seed`` keep working unchanged.

    Returns:
        pandas.DataFrame: A DataFrame with the generated data, including columns for
        outcome (Y), treatment (X), confounders (Z1, Z2, ...), and cluster.

    Raises:
        ValueError: If ``n_features`` is smaller than 3.
    """
    # Fail early with a clear message instead of a KeyError on 'Z3' further down.
    if n_features < 3:
        raise ValueError(
            "n_features must be at least 3 (Z1, Z2 and Z3 are required), "
            f"got {n_features}"
        )

    # The np.random module and RandomState expose the same randn/binomial API,
    # so one code path serves both the global-state and the seeded case.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Base features drawn from a standard normal distribution.
    z_columns = [f'Z{i + 1}' for i in range(n_features)]
    Z = pd.DataFrame(rng.randn(n_samples, n_features), columns=z_columns)

    # Treatment assignment (X) depends on Z1 and Z2 plus Gaussian noise,
    # squashed through a logistic function into a Bernoulli probability.
    x_logits = 0.6 * Z['Z1'] + 0.9 * Z['Z2'] + rng.randn(n_samples)
    x_prob = 1 / (1 + np.exp(-x_logits))
    X = rng.binomial(1, x_prob, size=n_samples)

    # Outcome (Y) depends on X, Z1, Z2, and a non-linear term for Z3.
    Y = (
        2 * X
        + 1.5 * Z['Z1']
        - 1.2 * Z['Z2']
        + 0.5 * (Z['Z3'] ** 2)  # Non-linear effect
        + rng.randn(n_samples)
    )

    # Combine into a single DataFrame: Y, X, then all Z columns.
    df = pd.concat([pd.Series(Y, name='Y'), pd.Series(X, name='X'), Z], axis=1)

    # Cluster on the standardized Z variables only (not X or Y).
    scaler = StandardScaler()
    Z_scaled = scaler.fit_transform(Z)

    # Using a damping factor and preference can help with convergence and cluster size.
    # A lower (more negative) preference leads to fewer, larger clusters.
    # NOTE: Affinity Propagation works on an n_samples x n_samples similarity
    # matrix, so runtime and memory grow quadratically with n_samples.
    af = AffinityPropagation(damping=0.9, preference=-100, random_state=42)
    df['cluster'] = af.fit_predict(Z_scaled)

    return df
if __name__ == '__main__':
    # Demo run: build the dataset with default settings and summarise it on stdout.
    frame = generate_data()
    cluster_labels = frame['cluster']

    # First few rows.
    print("Generated Data Head:")
    print(frame.head())

    # Column dtypes and non-null counts (DataFrame.info prints directly).
    print("\nData Info:")
    frame.info()

    # Number of samples assigned to each cluster label.
    print("\nCluster Distribution:")
    print(cluster_labels.value_counts())

    n_clusters = len(cluster_labels.unique())
    print(f"\nGenerated {n_clusters} clusters.")