import re
from pearson_coeff import *
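# pearson_coeff is a sibling module of this repo; for reference, here is a
# minimal sketch of what pearson_coefficient presumably computes (assumed
# semantics: n = number of samples, x and y = equal-length feature vectors;
# returns the absolute Pearson correlation, the usual redundancy measure in
# mRMR). The real module may differ; the sketch is kept commented out so it
# does not shadow the imported function.
#
# def pearson_coefficient(n, x, y):
#     xbar = sum(x) / float(n)
#     ybar = sum(y) / float(n)
#     num = sum((x[i] - xbar) * (y[i] - ybar) for i in xrange(n))
#     den = (sum((x[i] - xbar) ** 2 for i in xrange(n)) *
#            sum((y[i] - ybar) ** 2 for i in xrange(n))) ** 0.5
#     return abs(num / den) if den else 0.0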
# Open the data file
fid = open("colon", "r")
# fid = open("file.txt", "r")
# Read the data line wise and store in 2-D list
data = []
while True:
    line = fid.readline().strip()
    if line == '':
        break
    else:
        data.append(re.findall(r"[-]?\d*\.\d+|[-]?\d+", line))
# Close opened file
fid.close()
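# Expected input format, inferred from the parsing above and the indexing
# below (the exact layout of the real "colon" file is an assumption): the
# first line holds two integers, <samples> <features>; every following line
# holds <features> numeric values terminated by a class label, e.g.
#   62 2000
#   0.75 -1.2 ... 3.0 1
#   0.21 0.8 ... -0.5 2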
# h = classification variable
# gi = gene variable
# klist = list of available classes (k); ksize = size of the kth class (nk, the number of rows with class k)
# n = sum of all nk; ksum = sum of all gi values belonging to a particular class k
# kvar = variance of the kth class; var = overall (pooled) variance
# gbar = mean of gi over all samples (per column); gkbar = mean of gi within each class k
# ftestval = value of the F-test function for each gi
klist = []; ksize = {}; ksum = {}; kvar = {}
gbar = []; gkbar = []; ftestval = {}
# first row of data list has meta-data, not actual data
# row = no of samples, col = no of features
row = int(data[0][0])
col = int(data[0][1])
for si in data[1:]:
    # si[-1] is the class; if it is not yet in klist, add it and start its count,
    # otherwise just increment its count
    if si[-1] not in klist:
        klist.append(si[-1])
        ksize[si[-1]] = 1
    else:
        if ksize.has_key(si[-1]):
            ksize[si[-1]] += 1
        else:
            print "Error : class in klist, not found in dict(ksize)"
            exit()
    # ksum stores the sum of all features of each class
    if not ksum.has_key(si[-1]):
        ksum[si[-1]] = 0
    for j in si[:-1]:
        ksum[si[-1]] += float(j)
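# As a hypothetical illustration (values invented, not read from the real
# file): for a two-class input with 40 samples of class '1' and 22 of class
# '2', the loop above would leave klist = ['1', '2'] and
# ksize = {'1': 40, '2': 22}, with ksum holding each class's total over all
# of its feature values.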
# find kvar for each class
for ki in klist:
    # tmean is the class mean and tsum is the sum of squared differences
    tmean = float(ksum[ki]/(ksize[ki]*col))
    tsum = 0
    # Here we go down row-wise and check whether the row belongs to class ki
    for di in data[1:]:
        if di[-1] == ki:
            for num in di[:-1]:
                tsum += (tmean - float(num))**2
    if not kvar.has_key(ki):
        kvar[ki] = float(tsum/(ksize[ki]*col))
        # will not do kvar[i] = kvar[i]**2, as variance is already denoted by sigma^2
    else:
        print "some error occurred in calculating var(k).."
# find the pooled variance = var (over all classes), weighting each class
# variance by (nk - 1) and dividing by (n - number of classes)
var = 0
for i in klist:
    var += float((ksize[i]-1)*kvar[i])
var /= float(row - len(klist))
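# For reference, the quantity computed above is the standard pooled-variance
# combination
#   sigma^2 = sum_k (n_k - 1) * sigma_k^2 / (n - K)
# where K = number of classes and n = total number of samples (row).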
# Now after calculating the pooled variance, we move on to find the F-test values
# of all the gene variables (gi), but first find gbar and gkbar
for gi in xrange(col):
    gbar.append(0)
    for si in xrange(1, row + 1, 1):
        gbar[gi] += float(data[si][gi])
        if len(gkbar) <= gi:
            # No entry exists in gkbar for this value of gi yet; append one to
            # grow the list, otherwise indexing it would raise an exception
            gkbar.append({data[si][-1]: float(data[si][gi])})
        else:
            # An entry exists; if the class of this row is already a key,
            # simply add the value, else initialize the class key with this value
            if gkbar[gi].has_key(data[si][-1]):
                gkbar[gi][data[si][-1]] += float(data[si][gi])
            else:
                gkbar[gi][data[si][-1]] = float(data[si][gi])
    # Divide the sum by the number of samples (rows) to actually get the mean
    gbar[gi] = float(gbar[gi]/row)
    for ki in klist:
        if gkbar[gi].has_key(ki):
            # Sum of gi for each class / size of that class
            gkbar[gi][ki] = float(gkbar[gi][ki]/ksize[ki])
# F-Test function calculation
for gi in xrange(col):
    tsum = float(0.0)
    # print '\n\ngi : ', gi
    for ki in klist:
        tsum += (((ksize[ki])*(gkbar[gi][ki]-gbar[gi])**2)/(row-1))
        # print (((ksize[ki])*(gkbar[gi][ki]-gbar[gi])**2)/(row-1))
        # print 'ki : ', ki
        # print 'ksize[ki] : ', ksize[ki]
        # print 'gkbar[gi][ki] : ', gkbar[gi][ki]
        # print 'gbar[gi] : ', gbar[gi]
        # print 'tsum : %f' % tsum
        # print 'var : ', var
    ftestval[gi] = float(tsum/var)
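# For reference, the statistic computed above for each feature gi is
#   F(gi) = [ sum_k n_k * (gkbar[gi][k] - gbar[gi])^2 / (row - 1) ] / sigma^2
# with sigma^2 the pooled variance. As an aside (not a change to the code),
# the classical one-way ANOVA F-statistic divides the between-class term by
# (K - 1), the number of classes minus one, rather than by (row - 1).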
# print 'gbar : ', gbar
# print 'gkbar :', gkbar
# print 'klist : ', klist
# print 'ksize : ', ksize
# print 'ksum : ', ksum
# print 'kvar : ', kvar
# print 'var : ', var
print 'ftestval : ', ftestval
# maxftestval stores the feature with the maximum F-test value, which seeds the
# algorithm as the first selected feature
# selectedfeatures stores the features selected so far (the reduced feature set)
# nonselectedfeatures stores the features not yet selected
# corr_coef caches the correlation coefficient of each (gi, gj) pair in a dictionary
maxftestval = max(ftestval, key=lambda k: ftestval[k])
print 'maxftestval : ', maxftestval
selectedfeatures = []
selectedfeatures.append(maxftestval)
nonselectedfeatures = list(set(ftestval.keys()) - {maxftestval})
print 'nonselectedfeaturelist : ', nonselectedfeatures
# data matrix is transposed to align it feature wise not sample wise
transdata = [list(float(n) for n in x) for x in zip(*data[1:])]
corr_coef = {}
reqFeatures = 500
counter = 1
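# Greedy selection: at every step, pick the candidate gi that maximizes
#   F(gi) - (1/|S|) * sum_{gj in S} corr(gi, gj)
# i.e. maximum relevance (F-test value) minus mean redundancy (correlation
# with the already-selected set S). This appears to follow the FCD (F-test
# correlation difference) scheme of Ding & Peng's mRMR.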
# The straightforward version below recomputes the redundancy sum from scratch
# for every candidate; it is kept for reference and superseded by the
# incremental version further down.
# for fcount in range(1, reqFeatures):
#     # funcval and gi track the best candidate of every cycle
#     funcval = -10.0; gi = -2
#     for giNs in nonselectedfeatures:
#         sigmacorval = 0.0
#         for gjS in selectedfeatures:
#             counter += 1
#             if corr_coef.has_key('%d %d' % (giNs, gjS)):
#                 sigmacorval += corr_coef['%d %d' % (giNs, gjS)]
#             elif corr_coef.has_key('%d %d' % (gjS, giNs)):
#                 sigmacorval += corr_coef['%d %d' % (gjS, giNs)]
#             else:
#                 corr_coef['%d %d' % (giNs, gjS)] = pearson_coefficient(row, transdata[giNs], transdata[gjS])
#                 sigmacorval += corr_coef['%d %d' % (giNs, gjS)]
#             # print 'giNs:', giNs, ' gjS:', gjS,
#         sigmacorval /= len(selectedfeatures)
#         # print ' sigmaCorr:', sigmacorval,
#         # print 'decide:', ftestval[giNs] - sigmacorval
#         if (ftestval[giNs] - sigmacorval) > funcval:
#             funcval = ftestval[giNs] - sigmacorval
#             gi = giNs
#     if gi != -2:
#         selectedfeatures.append(gi)
#         nonselectedfeatures.remove(gi)
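# Incremental variant actually used below: because only one feature joins the
# selected set per iteration, each candidate's redundancy sum changes by a
# single new term, its correlation with the most recently selected feature
# (prevFeature). Accumulating that term in sigmacorval[giNs] avoids
# re-walking the whole selected set every cycle.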
sigmacorval = {}
for fcount in range(1, reqFeatures):
    funcval = -10.0; gi = -2
    prevFeature = selectedfeatures[-1]
    for giNs in nonselectedfeatures:
        if sigmacorval.has_key(giNs):
            sigmacorval[giNs] += pearson_coefficient(row, transdata[giNs], transdata[prevFeature])
        else:
            sigmacorval[giNs] = pearson_coefficient(row, transdata[giNs], transdata[prevFeature])
        if (ftestval[giNs] - (sigmacorval[giNs]/len(selectedfeatures))) > funcval:
            funcval = ftestval[giNs] - (sigmacorval[giNs]/len(selectedfeatures))
            gi = giNs
        counter += 1
    if gi != -2:
        selectedfeatures.append(gi)
        nonselectedfeatures.remove(gi)
        del sigmacorval[gi]
print 'selectedfeatures : ', selectedfeatures, len(selectedfeatures)
print 'nonselectedfeaturelist : ', nonselectedfeatures, len(nonselectedfeatures)
print counter
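# To run (assuming a Python 2 interpreter, since the script uses print
# statements, has_key and xrange, and assuming the "colon" data file sits in
# the working directory next to pearson_coeff.py):
#   python mrmr1.py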