# Var_Annot_Eval/annotation_evaluation.py (main branch, GitHub: nmtrang00/Var_Annot_Eval)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
from optparse import OptionParser

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ppscore as pps
from dython.nominal import compute_associations

# Command-line interface. The arguments are parsed at import time, so
# `options` is a module-level global that the functions in this file read.
parser = OptionParser()
# -i/--input: path of the tab-delimited table that main() loads.
parser.add_option("-i", "--input", type=str,
                    default='./GRCh37_20210315_v1.4.0.KSE.clvrvAdded.tab',
                    help="Input tab-formated file")
# -o/--output: folder that the CSV and PNG results are written into.
parser.add_option("-o", "--output", type=str,
                    default='./output',
                    help="Output folder path")
# `args` receives any leftover positional arguments.
(options, args) = parser.parse_args()

# Rendering settings shared by every heatmap this script saves.
dpi, fontsize = 300, 20
# Figure size; the plotting functions pass these through cm_to_inch().
fig_height = fig_width = 40

# Columns (and their order) kept on both axes of each saved matrix.
features = (
    'gerp_rs,phastCon46,phyloP46,SIFT_score,SIFT_median,SIFT_prediction,Polyphen-2.HumDiv,Polyphen-2.HumVar,MT_treevote,fathmm-mkl_C.score,fathmm-mkl_NC.score,fathmm-xf_C.score,fathmm-xf_NC.score,CADD,CADD_phred,PrimateAI,mcap_sensitivityv1.4,SpliceAI_DS_AG,SpliceAI_DS_AL,SpliceAI_DS_DG,SpliceAI_DS_DL,Ada_score,AF_HC'.split(',')
    + ['Clinsig_model']
)

def cm_to_inch(value):
    """Convert a length given in centimetres to inches (1 inch = 2.54 cm)."""
    centimetres_per_inch = 2.54
    inches = value / centimetres_per_inch
    return inches

def process(df):
    """Clean ``df`` in place: strip spaces from column names, then zero-fill.

    The column names are normalised *before* filling so that a fill column
    whose raw header carries stray spaces (e.g. ``' Ada_score'``) is still
    matched by name and filled. Missing values in every other column are left
    untouched.

    Args:
        df: pandas DataFrame to clean; it is modified in place.

    Returns:
        None.
    """
    # Remove every space character from the header names first.
    df.columns = df.columns.str.replace(' ', '')

    # Columns in which a missing value is replaced by 0.
    fill_columns = 'SpliceAI_DS_AG,SpliceAI_DS_AL,SpliceAI_DS_DG,SpliceAI_DS_DL,Ada_score,AF_HC'.split(',')
    values = {column: 0 for column in fill_columns}
    df.fillna(values, inplace=True)

def calculate_ppscore_matrix(df,file_name):
    """Compute the ppscore matrix of ``df`` and save it as a CSV and a heatmap.

    Nothing is done when ``<output>/<file_name>_pps.png`` already exists.
    Otherwise the matrix is restricted to ``features`` on both axes and
    written to ``<file_name>_pps.csv`` and ``<file_name>_pps.png`` inside
    ``options.output``, which is created if it does not exist yet.

    Args:
        df: DataFrame holding at least the columns listed in ``features``.
        file_name: Prefix used for the two output file names.

    Returns:
        None.
    """
    png_path = os.path.join(options.output, file_name + '_pps.png')
    if os.path.isfile(png_path):
        return

    # Make sure the destination exists before anything is written to it.
    if options.output:
        os.makedirs(options.output, exist_ok=True)

    scores = pps.matrix(df, sample=None)[['x', 'y', 'ppscore']]
    matrix_df = scores.pivot(columns='x', index='y', values='ppscore')
    matrix_df = matrix_df[features].reindex(features)
    # The CSV is written without the index; its rows follow the order of `features`.
    matrix_df.to_csv(os.path.join(options.output, file_name + '_pps.csv'), index=False)

    # matplotlib expects figsize as (width, height).
    figure = plt.figure(figsize=(cm_to_inch(fig_width), cm_to_inch(fig_height)))
    figure.set_facecolor("w")
    try:
        sns.set(font_scale=fontsize-18)
        sns.heatmap(matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5)
        plt.xticks(fontsize=fontsize)
        plt.xlabel("Feature", fontsize=fontsize+2)
        plt.yticks(fontsize=fontsize)
        plt.ylabel("Target", fontsize=fontsize+2)
        plt.savefig(png_path, bbox_inches='tight', dpi=dpi)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close('all')


def _save_association_heatmap(df, file_name, measure, theil_u):
    """Compute one association matrix of ``df`` and save it as a CSV and a heatmap.

    Nothing is done when ``<file_name>_associations_<measure>.png`` already
    exists in ``options.output``.

    Args:
        df: DataFrame holding at least the columns listed in ``features``.
        file_name: Prefix used for the output file names.
        measure: Tag inserted in the output file names.
        theil_u: Forwarded as ``theil_u`` to ``compute_associations``.
    """
    png_path = os.path.join(options.output, file_name + '_associations_' + measure + '.png')
    if os.path.exists(png_path):
        return

    # Make sure the destination exists before anything is written to it.
    if options.output:
        os.makedirs(options.output, exist_ok=True)

    corr = compute_associations(df, clustering=True, theil_u=theil_u, nan_strategy="drop_samples")
    corr = corr[features].reindex(features)
    # The CSV is written without the index; its rows follow the order of `features`.
    corr.to_csv(
        os.path.join(options.output, file_name + '_associations_replacenan0_' + measure + '.csv'),
        index=False,
    )

    # matplotlib expects figsize as (width, height).
    figure = plt.figure(figsize=(cm_to_inch(fig_width), cm_to_inch(fig_height)))
    figure.set_facecolor("w")
    try:
        sns.set(font_scale=fontsize-18)
        sns.heatmap(corr, vmin=-1, vmax=1, cmap="Blues", linewidths=0.5)
        plt.xticks(fontsize=fontsize)
        plt.yticks(fontsize=fontsize)
        plt.savefig(png_path, bbox_inches='tight', dpi=dpi)
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close('all')


def calculate_associations(df,file_name):
    """Save association matrices of ``df`` computed with and without ``theil_u``.

    Two result sets are produced in ``options.output``: one tagged ``theilu``
    (``theil_u=True``) and one tagged ``cramers_v`` (``theil_u=False``). Each
    consists of a CSV and a heatmap PNG, and each is skipped when its PNG
    already exists.

    Args:
        df: DataFrame holding at least the columns listed in ``features``.
        file_name: Prefix used for the output file names.

    Returns:
        None.

    Raises:
        Exception: Whatever the ``cramers_v`` computation or plotting raises.
            Failures of the ``theilu`` run are printed and do not propagate.
    """
    print("Calculate association")

    # The theil_u run is best effort: a failure is reported and the second
    # computation still runs.
    try:
        _save_association_heatmap(df, file_name, 'theilu', theil_u=True)
    except Exception as e:
        print(e)

    _save_association_heatmap(df, file_name, 'cramers_v', theil_u=False)

def main():
    """Load the input table, split it on ``is_snp`` and run both analyses.

    Rows with ``is_snp == 0`` form the "indel" subset and rows with
    ``is_snp == 1`` the "snp" subset. For each subset the ppscore matrix and
    the association matrices are computed after the bookkeeping columns
    ``var``, ``id``, ``is_snp`` and ``clvrv.stars`` have been removed.

    Output files are prefixed with the input file's base name (without its
    extension), so results from different inputs do not collide in the output
    folder. For the default input this gives the same names as before.
    """
    df = pd.read_csv(options.input, delimiter='\t')
    process(df)

    # e.g. './GRCh37_20210315_v1.4.0.KSE.clvrvAdded.tab' -> 'GRCh37_20210315_v1.4.0.KSE.clvrvAdded'
    prefix = os.path.splitext(os.path.basename(options.input))[0]

    # Non-mutating drops: the filtered frames are slices of `df`, and dropping
    # in place on a slice triggers pandas' SettingWithCopy behaviour.
    bookkeeping = ['var', 'id', 'is_snp']
    subsets = [
        ('indel', df[df['is_snp'] == 0].drop(bookkeeping, axis=1)),
        ('snp', df[df['is_snp'] == 1].drop(bookkeeping, axis=1)),
    ]

    for analysis in (calculate_ppscore_matrix, calculate_associations):
        for label, subset in subsets:
            # Each call receives its own fresh frame without 'clvrv.stars'.
            analysis(subset.drop(['clvrv.stars'], axis=1), prefix + '_' + label + '_authorfill')

# Run the analysis only when this file is executed as a script, not on import.
if __name__ =='__main__':
    main()