# superconfounder/simulation.py (branch: feature/confounder-analysis-module, repo: zhenchenwang/superconfounder)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
import numpy as np
import pandas as pd
from sklearn.cluster import AffinityPropagation
from sklearn.preprocessing import StandardScaler

def generate_data(n_samples=5000, n_features=6, random_state=None) -> pd.DataFrame:
    """
    Generates a synthetic dataset with a known causal structure for confounder analysis.

    Causal structure of the simulated data:
        * Z1 and Z2 are linear confounders: they drive both the treatment X and the outcome Y.
        * Z3 affects only the outcome Y, through a quadratic (non-linear) term.
        * Every remaining column (Z4 ... Z{n_features}) is pure noise.
        * The true additive effect of the binary treatment X on Y is 2.

    Args:
        n_samples (int): The number of samples to generate. Must be at least 1.
        n_features (int): The number of Z variables (potential confounders). Must be at
                          least 3, because Z1, Z2 and Z3 are used by the causal structure.
        random_state (int or None): Optional seed for the data-generating draws. When None
                          (the default) the global ``np.random`` state is used, so callers
                          that seed NumPy globally keep getting the same draws as before.

    Returns:
        pandas.DataFrame: A DataFrame with the generated data, including columns for
                          outcome (Y), treatment (X), confounders (Z1, Z2, ...), and cluster.

    Raises:
        ValueError: If ``n_samples`` is less than 1 or ``n_features`` is less than 3.
    """
    # Fail early with a clear message instead of an opaque KeyError on Z['Z3'] further down.
    if n_samples < 1:
        raise ValueError(f"n_samples must be at least 1, got {n_samples}")
    if n_features < 3:
        raise ValueError(
            f"n_features must be at least 3 (Z1, Z2 and Z3 are required), got {n_features}"
        )

    # The np.random module and a RandomState instance expose the same randn/binomial API,
    # so the unseeded path stays identical to drawing from the global stream.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    # Generate base features from a standard normal distribution
    z_columns = [f'Z{i+1}' for i in range(n_features)]
    Z = pd.DataFrame(rng.randn(n_samples, n_features), columns=z_columns)

    # Treatment assignment (X): a noisy logit in Z1 and Z2, squashed to a probability
    # and then sampled as a Bernoulli draw.
    x_logits = 0.6 * Z['Z1'] + 0.9 * Z['Z2'] + rng.randn(n_samples)
    x_prob = 1 / (1 + np.exp(-x_logits))
    X = rng.binomial(1, x_prob, size=n_samples)

    # Outcome (Y) depends on X, Z1, Z2, and a non-linear term for Z3, plus unit-variance noise
    Y = (
        2 * X +
        1.5 * Z['Z1'] -
        1.2 * Z['Z2'] +
        0.5 * (Z['Z3'] ** 2) +  # Non-linear effect
        rng.randn(n_samples)
    )

    # Combine into a single DataFrame with column order Y, X, Z1..Zk
    df = pd.concat([pd.Series(Y, name='Y'), pd.Series(X, name='X'), Z], axis=1)

    # Generate clusters using Affinity Propagation on the standardized Z variables.
    # NOTE(review): Affinity Propagation works on a full pairwise similarity matrix, so
    # time and memory grow quadratically with n_samples - confirm this is acceptable
    # for large inputs.
    Z_scaled = StandardScaler().fit_transform(Z)

    # Using a damping factor and preference can help with convergence and cluster size.
    # A lower (more negative) preference leads to fewer, larger clusters.
    # The clustering seed is fixed so cluster assignment is repeatable for a given Z.
    af = AffinityPropagation(damping=0.9, preference=-100, random_state=42)
    df['cluster'] = af.fit_predict(Z_scaled)

    return df

if __name__ == '__main__':
    # Demo run: build the default synthetic dataset and summarise it on stdout.
    simulated = generate_data()

    # First few rows, to eyeball the Y / X / Z / cluster columns.
    print("Generated Data Head:")
    print(simulated.head())

    # Column dtypes and non-null counts (DataFrame.info prints directly).
    print("\nData Info:")
    simulated.info()

    # How many samples landed in each cluster, and how many clusters there are.
    cluster_labels = simulated['cluster']
    print("\nCluster Distribution:")
    print(cluster_labels.value_counts())
    n_clusters = cluster_labels.unique().size
    print(f"\nGenerated {n_clusters} clusters.")