-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathLifeArc_application.py
More file actions
78 lines (58 loc) · 2.9 KB
/
LifeArc_application.py
File metadata and controls
78 lines (58 loc) · 2.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 6 15:51:20 2019
@author: ibarlow
"""
""" script to generate screen stats and Z-factor numbers for LifeArc
Use Antipsychotics data to find the average and standard deviation for the High Control
(drug with strong effect, e.g. Chlorpromazine HCl) and the background (DMSO).
Use these data to calculate the Z-factor as well, as defined in
Zhang et al. 1999 (J. Biomol. Screen. 4, 67-73)."""
import pandas as pd
import numpy as np
import os
from scipy import stats
# --- input locations --------------------------------------------------------
# The two Tierpsy summary files share a directory and a timestamp suffix, and
# the metadata table sits in the same directory.
data_dir = '/Volumes/behavgenom$/Ida/Data/Antipsychotics'
summary_stamp = 'tierpsy_plate_20190531_162311'

feat_file = '{}/features_summary_{}.csv'.format(data_dir, summary_stamp)
filename_file = '{}/filenames_summary_{}.csv'.format(data_dir, summary_stamp)
metadata_file = '{}/metadata_all.csv'.format(data_dir)

# --- screen parameters ------------------------------------------------------
# Labels looked up in the 'drug type' column further down the script.
background = 'DMSO'
# NOTE(review): this spelling differs from "Chlorpromazine hydrochloride";
# presumably it matches the label stored in the metadata - confirm before
# correcting it.
HighControl = 'Chlopromazine hydrocholoride'

# Feature columns for which the control statistics are reported.
testFeature1 = 'relative_to_body_speed_midbody_IQR'
testFeature2 = 'eigen_projection_2_abs_IQR'
def _drop_last_token(path, sep):
    """Return the file name of *path* without its last *sep*-delimited token.

    The directory part is removed first; the remaining tokens are re-joined
    with an underscore regardless of which separator was used to split.
    """
    tokens = os.path.basename(path).split(sep)
    return '_'.join(tokens[:-1])


# Read the three input tables. The two summary tables are keyed on file_id;
# the metadata table keeps a default integer index.
featMat = pd.read_csv(feat_file, index_col='file_id')
filenameMat = pd.read_csv(filename_file, index_col='file_id')
metadata = pd.read_csv(metadata_file, index_col=False)

# Put features and file names side by side, keeping only the file_ids that
# occur in both tables.
featMat_all = pd.concat([featMat, filenameMat], axis=1, join='inner')

# Derive a common 'basename' key in both tables: the summary file names lose
# their final '_' token, the metadata file names lose their final '.' token.
featMat_all['basename'] = featMat_all['file_name'].apply(_drop_last_token,
                                                         sep='_')
metadata['basename'] = metadata['filename'].apply(_drop_last_token, sep='.')

# Attach the metadata columns to the feature rows that share a basename, then
# turn the basename index back into an ordinary column.
features_by_basename = featMat_all.set_index('basename')
metadata_by_basename = metadata.set_index('basename')
featMat_metadata = pd.concat([features_by_basename, metadata_by_basename],
                             axis=1,
                             join='inner')
featMatFinal = featMat_metadata.reset_index(drop=False)
#drop bad files
# Keep only the rows flagged as good. The element-wise "== True" comparison is
# deliberate: it builds a boolean mask whatever dtype the column was read as.
# Non-inplace calls are used from here on so that the filtered frame is never
# modified through a slice of the merged table (SettingWithCopyWarning).
featMatFinal = featMatFinal[featMatFinal['is_good'] == True]
featMatFinal = featMatFinal.drop(columns=['is_good', 'file_name'])

#zscore data before calculating z-factor
# Replace missing values by the mean of their column. numeric_only=True limits
# the mean to numeric columns; without it pandas >= 2.0 raises a TypeError on
# the string columns (basename, drug type, ...), which are left untouched.
featMatFinal = featMatFinal.fillna(featMatFinal.mean(axis=0,
                                                     numeric_only=True))

#stats.zscore
# Z-score every feature column (everything that did not come from metadata),
# using the sample standard deviation (ddof=1).
featOnly = featMatFinal.drop(columns=metadata.columns)
FeatMatZ = pd.DataFrame(stats.zscore(featOnly, ddof=1, axis=0),
                        columns=featOnly.columns,
                        index=featOnly.index)

# Re-attach the metadata of the SAME rows. The z-scores keep the row index of
# featMatFinal and the metadata columns are taken from featMatFinal itself;
# concatenating with the raw `metadata` table would pair rows by position with
# unrelated (unfiltered, differently ordered) recordings.
FeatMatZ = pd.concat([FeatMatZ, featMatFinal[metadata.columns]], axis=1)
#now extract out feature values
featMatFinal_grouped = featMatFinal.groupby('drug type')


def z_factor(high, low):
    """Return the Z-factor separating two groups of measurements.

    Implements Z = 1 - 3 * (sd_high + sd_low) / |mean_high - mean_low|
    (Zhang et al. 1999, J. Biomol. Screen. 4, 67-73). Standard deviations are
    the pandas default (sample standard deviation, ddof=1).

    Parameters
    ----------
    high, low : pandas.Series
        Values of one feature for the high control and for the background.

    Returns
    -------
    float
        The Z-factor. ``-inf`` is returned when both means are identical,
        i.e. when there is no separation between the groups.
    """
    separation = abs(high.mean() - low.mean())
    if separation == 0:
        return float('-inf')
    return 1 - 3 * (high.std() + low.std()) / separation


# Mean and standard deviation of the test features for both control groups.
# The results are stored (and printed) instead of being evaluated as bare
# expressions, which are silently discarded when the file runs as a script.
testFeatures = [testFeature1, testFeature2]
backgroundFeats = featMatFinal_grouped.get_group(background)[testFeatures]
highControlFeats = featMatFinal_grouped.get_group(HighControl)[testFeatures]

screenStats = pd.concat({background: backgroundFeats.agg(['mean', 'std']),
                         HighControl: highControlFeats.agg(['mean', 'std'])},
                        names=['drug type', 'statistic'])
print(screenStats)

#calculate Z-factor
# One value per test feature, high control versus background.
zFactors = pd.Series({feature: z_factor(highControlFeats[feature],
                                        backgroundFeats[feature])
                      for feature in testFeatures},
                     name='Z-factor')
print(zFactors)