-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathannotation_evaluation.py
More file actions
102 lines (89 loc) · 4.68 KB
/
annotation_evaluation.py
File metadata and controls
102 lines (89 loc) · 4.68 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import os
from optparse import OptionParser
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import ppscore as pps
from dython.nominal import compute_associations
# ---------------------------------------------------------------------------
# Command-line options and plot settings shared by the functions below.
# ---------------------------------------------------------------------------
parser = OptionParser()
parser.add_option("-i", "--input", type=str,
                  default='./GRCh37_20210315_v1.4.0.KSE.clvrvAdded.tab',
                  help="Input tab-formated file")
parser.add_option("-o", "--output", type=str,
                  default='./output',
                  help="Output folder path")
# Only consume sys.argv when executed as a script.  When this module is
# imported (tests, notebooks, other tools) the command line belongs to the
# host process; an unknown flag there would make optparse call sys.exit().
# In that case fall back to the defaults declared above.
(options, args) = parser.parse_args(None if __name__ == '__main__' else [])

dpi = 300         # passed to plt.savefig() for every saved PNG
fontsize = 20     # tick-label size; axis labels use fontsize + 2
fig_height = 40   # figure height, converted with cm_to_inch() before use
fig_width = 40    # figure width, converted with cm_to_inch() before use

# Columns (and their order) kept in every matrix that is written or plotted.
features = 'gerp_rs,phastCon46,phyloP46,SIFT_score,SIFT_median,SIFT_prediction,Polyphen-2.HumDiv,Polyphen-2.HumVar,MT_treevote,fathmm-mkl_C.score,fathmm-mkl_NC.score,fathmm-xf_C.score,fathmm-xf_NC.score,CADD,CADD_phred,PrimateAI,mcap_sensitivityv1.4,SpliceAI_DS_AG,SpliceAI_DS_AL,SpliceAI_DS_DG,SpliceAI_DS_DL,Ada_score,AF_HC'.split(',')
features.append('Clinsig_model')
def cm_to_inch(value):
    """Convert a length in centimetres to inches (1 inch = 2.54 cm)."""
    centimetres_per_inch = 2.54
    return value / centimetres_per_inch
def process(df):
    """Clean ``df`` in place: normalise column names, then fill gaps with 0.

    All spaces are removed from the column names, and missing values in the
    SpliceAI_DS_*, Ada_score and AF_HC columns are replaced by 0.  Other
    columns keep their missing values.

    Args:
        df: pandas DataFrame to modify in place.

    Returns:
        None.
    """
    fill_columns = 'SpliceAI_DS_AG,SpliceAI_DS_AL,SpliceAI_DS_DG,SpliceAI_DS_DL,Ada_score,AF_HC'.split(',')
    # Normalise the names *before* filling: fillna() silently skips keys that
    # are not column names, so a header such as ' Ada_score' would otherwise
    # be left unfilled.
    df.columns = df.columns.str.replace(' ', '', regex=False)
    values = {column: 0 for column in fill_columns}
    df.fillna(values, inplace=True)
def calculate_ppscore_matrix(df,file_name):
    """Compute the ppscore matrix of ``df`` and save it as CSV and heat map.

    Writes ``<file_name>_pps.csv`` and ``<file_name>_pps.png`` into
    ``options.output``.  If the PNG already exists nothing is recomputed.

    Args:
        df: DataFrame handed to ``pps.matrix``; the resulting matrix must
            contain every name in the module-level ``features`` list.
        file_name: Prefix used for both output file names.

    Returns:
        None.
    """
    filename = os.path.join(options.output, file_name+'_'+'pps.png')
    if os.path.isfile(filename):
        # The PNG doubles as a "done" marker for this input.
        return

    if options.output:
        # The default './output' folder may not exist yet.
        os.makedirs(options.output, exist_ok=True)

    scores = pps.matrix(df, sample=None)[['x', 'y', 'ppscore']]
    matrix_df = scores.pivot(columns='x', index='y', values='ppscore')
    # Restrict to the selected features, same order on both axes.
    matrix_df = matrix_df[features].reindex(features)
    # Row labels are not written; rows follow the order of ``features``.
    matrix_df.to_csv(os.path.join(options.output, file_name+'_'+'pps.csv'),index=False)

    # matplotlib expects figsize as (width, height).
    figure = plt.figure(figsize=(cm_to_inch(fig_width),cm_to_inch(fig_height)))
    figure.set_facecolor("w")
    try:
        sns.set(font_scale=fontsize-18)
        sns.heatmap(matrix_df, vmin=0, vmax=1, cmap="Blues", linewidths=0.5)
        plt.xticks(fontsize=fontsize)
        plt.xlabel("Feature",fontsize=fontsize+2)
        plt.yticks(fontsize=fontsize)
        plt.ylabel("Target",fontsize=fontsize+2)
        plt.savefig(filename,bbox_inches='tight',dpi=dpi)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.clf()
        plt.close('all')
def _save_association_heatmap(df, file_name, theil_u, label):
    """Compute one association matrix and save it as CSV and PNG heat map.

    Writes ``<file_name>_associations_replacenan0_<label>.csv`` and
    ``<file_name>_associations_<label>.png`` into ``options.output``.  If the
    PNG already exists nothing is recomputed.

    Args:
        df: DataFrame handed to ``compute_associations``.
        file_name: Prefix used for both output file names.
        theil_u: Forwarded to ``compute_associations(theil_u=...)``.
        label: Text inserted in the output file names.
    """
    filename = os.path.join(options.output, file_name+'_'+'associations_'+label+'.png')
    if os.path.exists(filename):
        return

    if options.output:
        # The default './output' folder may not exist yet.
        os.makedirs(options.output, exist_ok=True)

    # Rows containing missing values are dropped before computing.
    corr = compute_associations(df,clustering=True,theil_u=theil_u,nan_strategy="drop_samples")
    # Restrict to the selected features, same order on both axes.
    corr = corr[features].reindex(features)
    # Row labels are not written; rows follow the order of ``features``.
    corr.to_csv(os.path.join(options.output, file_name+'_'+'associations_replacenan0_'+label+'.csv'),index=False)

    # matplotlib expects figsize as (width, height).
    figure = plt.figure(figsize=(cm_to_inch(fig_width),cm_to_inch(fig_height)))
    figure.set_facecolor("w")
    try:
        sns.set(font_scale=fontsize-18)
        sns.heatmap(corr, vmin=-1, vmax=1, cmap="Blues", linewidths=0.5)
        plt.xticks(fontsize=fontsize)
        plt.yticks(fontsize=fontsize)
        plt.savefig(filename,bbox_inches='tight',dpi=dpi)
    finally:
        # Release the figure even if plotting or saving fails.
        plt.clf()
        plt.close('all')


def calculate_associations(df,file_name):
    """Save association heat maps of ``df`` (Theil's U and Cramér's V).

    Two passes are made over ``df``: one with ``theil_u=True`` and one with
    ``theil_u=False``.  Each pass writes a CSV and a PNG into
    ``options.output`` and is skipped when its PNG already exists.

    Args:
        df: DataFrame handed to ``compute_associations``.
        file_name: Prefix used for all output file names.

    Returns:
        None.
    """
    print("Calculate association")
    # The Theil's U pass is best-effort: a failure is reported and the
    # Cramér's V pass still runs.
    # NOTE(review): assumes the Cramér's V pass is independent of the
    # Theil's U pass (not a fallback that only runs on failure) - confirm.
    try:
        _save_association_heatmap(df, file_name, True, 'theilu')
    except Exception as e:
        print(e)
    _save_association_heatmap(df, file_name, False, 'cramers_v')
def main():
    """Run the ppscore and association analyses for indels and SNPs.

    Reads the tab-separated file ``options.input``, cleans it with
    ``process``, splits the rows on the ``is_snp`` column (0 = indel subset,
    1 = SNP subset) and runs ``calculate_ppscore_matrix`` followed by
    ``calculate_associations`` on each subset.
    """
    df = pd.read_csv(options.input,delimiter='\t')
    process(df)

    # NOTE(review): the output prefix is fixed and does not follow
    # options.input, so results from different inputs written to the same
    # output folder share file names - confirm this is intended.
    prefix = 'GRCh37_20210315_v1.4.0.KSE.clvrvAdded'
    id_columns = ['var','id','is_snp']

    # Non-inplace drop: dropping in place on a filtered slice is the pattern
    # that makes pandas emit SettingWithCopyWarning.
    subsets = (
        (df[df['is_snp']==0].drop(columns=id_columns), prefix+'_indel_authorfill'),
        (df[df['is_snp']==1].drop(columns=id_columns), prefix+'_snp_authorfill'),
    )

    # Same call order as before: both ppscore runs, then both association
    # runs.  Every call receives its own frame without 'clvrv.stars'.
    for analyse in (calculate_ppscore_matrix, calculate_associations):
        for subset, name in subsets:
            analyse(subset.drop(columns=['clvrv.stars']), name)


if __name__ == '__main__':
    main()