NCAA 2013 Sleeper Report

I know surprisingly little about NCAA Men’s basketball, which is why my best bracket picks are randomly chosen (after weighting by seed).  But I’ve been wanting to do a more “sophisticated” analysis to see the relative strengths of the teams before filling out a bracket.  Also, it seems like you need to find upsets to differentiate yourself from every other bracket.  In the past my upsets were usually chosen by mascot strength (e.g. Badger > Horny Toad).

But this year is going to be different.  Fortunately, there are two published metrics for comparing the relative strengths of each team:

  • SRS – Simple Rating System
  • SOS – Strength of Schedule

They are explained here.  Also, I posted Python code here to pull each region and dump them all into a csv file.

Right off the bat I want to plot these using R’s ggplot library.  Here the labels are Region and Seed:


The team with the highest SRS is E-1, the number one seed in the East, or Indiana.  Strangely, the team with the toughest schedule, MW-3, Michigan State, is only a three seed.  Actually, MW-1 (Louisville), MW-2 (Duke), and MW-3 make up a fairly close cluster from the Midwest Region.

Speaking of clusters, looking at the data like this, as if the points were distances, is exactly what k-means clustering does.  Using the prediction.strength() function from R’s fpc package, I find that two is the only value of k which results in a prediction strength > 0.80.  So if we create a two-cluster solution and use each team’s cluster for labels instead of Region and Seed we get:
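A minimal sketch of that clustering step, assuming a data frame with SRS and SOS columns like the scraped csv above (synthetic values stand in for the real data):

```r
# Sketch only: synthetic SRS/SOS values stand in for the scraped csv
library(fpc)

set.seed(42)
teams <- data.frame(SRS = c(rnorm(34, 18, 3), rnorm(34, 8, 3)),
                    SOS = c(rnorm(34, 8, 2),  rnorm(34, 2, 2)))

# prediction.strength() resamples the data and scores how reproducible
# each k-cluster solution is; values above ~0.8 count as "strong"
ps <- prediction.strength(teams, Gmin = 2, Gmax = 5)
ps$optimalk

# Two-cluster solution whose assignments replace the Region-Seed labels
km <- kmeans(scale(teams), centers = 2)
teams$cluster <- km$cluster
```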


But back to our Region & Seeds.

The West seems to be weaker overall with W-2 (Ohio St) and W-5 (Wisconsin) looking like sleepers coming out of that Region.  I say they’re sleepers because they’re almost as high as W-1 (Gonzaga), but look like they’ve endured a tougher schedule.

In the South there are a number of teams in the upper right – S-3 (Florida), S-1 (Kansas), S-4 (Michigan), and S-11 (Minnesota).  It seems like when lower seeds like Minnesota rank respectably in SRS and SOS that might be a situation where you can differentiate your bracket by picking them as upsets.

So there you have a nice final four – Wisconsin, Florida, Michigan State, and Indiana.


The Impact of Arming Teachers

Since the tragedy in Newtown, there’s been talk of arming teachers.

Reading things like this got me wondering whether we could estimate the impact, because surely it can’t be a free lunch.  Then I saw this great graphic about gun deaths vs gun ownership.  And he made the data available, so I built a simple linear model using R:

guns = read.table("guns.csv", sep="\t", header=T)   # filename assumed; this read was missing
deaths = read.table("deaths.csv", sep="\t", header=T)
oecd = read.table("oecd.csv", sep="\t", header=T)
data = merge(guns, deaths, by="Country")
data$OECD = data$Country %in% oecd$Country
data.oecd = subset(data, data$OECD==T)

library(ggplot2)
p <- ggplot(data = data.oecd, aes(x = Guns, y = Deaths)) +
  geom_smooth(method="lm", se=FALSE, color="blue", formula= y ~ x) +
  geom_point()

mylm = lm(Deaths ~ Guns, data=data.oecd)   # Deaths is the response, per the equation below


Deaths (per 100k people) = 0.599 + 0.089 * Guns (per 100 people).

Here again is a graph of the data with a plot of the linear model added:


This model has an R² = 0.384 and p = 0.00015.  The residuals of this model are:


Not terrible – something’s going on with Mexico (row #42).  And it’s overestimating slightly for larger values of x (Guns), but that’s probably due to Mexico.
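For reference, the quoted R² and p-value come straight out of the fitted lm object; here is the pattern with stand-in data (the values below are simulated, not the original dataset):

```r
# Stand-in data; the real model was fit on the OECD gun/death figures
set.seed(7)
guns   <- runif(30, 5, 90)
deaths <- 0.6 + 0.09 * guns + rnorm(30, sd = 1)
m <- lm(deaths ~ guns)

s <- summary(m)
s$r.squared                          # the R^2 statistic (0.384 on the real data)
s$coefficients["guns", "Pr(>|t|)"]   # p-value for the slope
plot(m, which = 1)                   # residuals vs fitted (where Mexico stood out)
```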

It was then a simple matter to look up the number of teachers according to the U.S. Census:  7.2 million.

So, if we arm each teacher in America, that results in 7.2 mil / 250 mil * 100 = 2.88 additional guns per 100 people.  Plug that into the above linear model and we get:

2.88 * 0.089 = 0.2563 additional deaths per 100k people.  So in the U.S. that translates to:

250 million / 100k * 0.2563 ≈ 641 additional deaths
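The back-of-the-envelope above, as code (the constants are the post’s round numbers, not precise figures):

```r
slope      <- 0.089   # fitted deaths per 100k, per +1 gun per 100 people
teachers   <- 7.2e6   # U.S. Census teacher count used above
population <- 250e6   # round U.S. population figure used above

extra_guns_per_100    <- teachers / population * 100          # 2.88
extra_deaths_per_100k <- slope * extra_guns_per_100           # ~0.256
extra_deaths          <- population / 1e5 * extra_deaths_per_100k
round(extra_deaths)                                           # 641
```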

Look, I would consider this to be a toy model.  The underlying data is from different points in time, there’s the Mexico thing, and 641 is only an average – it could be more; it could be less.

My point is that we should use data and assess the net impact of any actions we take to prevent school shootings.

Factcheck and Obama vs Romney

I was curious to see how often various news sites make use of Factcheck, and specifically how often “factcheck” was used in the same article with either party’s leading candidate.  Since Factcheck seems to hammer both sides equally on their loose use of facts and what they mean, I’m assuming it is truly bipartisan.

Before I continue, you should know I’m firmly entrenched in the middle of the political spectrum.  And I think most of America is probably within one standard deviation of the middle – this isn’t some bold political statement – it’s just the bell curve.

With that in mind, I fully expected to find that the news sites were using Factcheck equally, with each side using it to support their representation of “the facts”.  So I expected Fox to use Factcheck to support Romney and MSNBC to use it to support Obama.  I wasn’t sure about CNN, but I was interested in the result because it seemed to me that they were more fair in their coverage.

I did four Google searches against Fox, MSNBC, and CNN.  The searches were:

Factcheck Only:  [ “factcheck” -obama -romney]

Obama:  [ “factcheck” obama -romney]

Romney:  [ “factcheck” -obama romney]

Both:  [ “factcheck” obama romney]

For MSNBC and CNN, I ran the same searches against those sites.  It’s important to exclude terms (e.g., “-romney”) and also to put “factcheck” in quotes, otherwise Google thinks you really meant “fact check”.

The results were:

Source   R    O    B    F
fox      2   47   29   22
msnbc    2   94  101    7
cnn      0    2   17    2


R = Factcheck and Romney and Not Obama

O = Factcheck and Obama and Not Romney

B = Factcheck and Obama and Romney

F = Factcheck and Not Obama and Not Romney

To visualize, I did a quick barplot in R (the counts are the table above):

counts = matrix(c(2, 47, 29, 22,
                  2, 94, 101, 7,
                  0,  2, 17,  2), nrow=4)
barplot(counts, beside=TRUE,
        legend=c("Factcheck + Romney","Factcheck + Obama", "Both", "Factcheck Only"),
        col=c("red","blue", "purple","brown"), names.arg=c("fox", "msnbc", "cnn"))


I see a few things.  Both Fox and MSNBC are about equally likely to mention Factcheck with both candidates OR Factcheck with Obama alone.  CNN makes fewer references to Factcheck, but when it does, its articles mention both candidates.  I like that – it supports my perception of CNN.

But most stunning from this graphic is that “Romney” alone is rarely mentioned with “factcheck” at all.  In fact, if you search the entire web:

[“factcheck” romney -obama] returns 27,300 results
[“factcheck” -romney obama] returns 356,000 results

Now, the easy answer is that Obama has been president for the past four years – there are more articles about him out there.  Still, Fox News has two articles mentioning only Romney and Factcheck.  In case you missed it: TWO.

And it’s indicative of a major problem I have with Romney.

He’s not really saying anything.  No facts.  No plans.  Just “Elect me because Obama sucks!”.

Impact of weather at Fenway

These plots come from my thesis dataset, which combines 40 years of games at Fenway Park with the weather at the start of each game.  They were made using R’s radial.plot() from the plotrix package.

First is the number of games held “per Direction”. So we can see that the most common direction for winds at Fenway is from the Southwest at 200 degrees:
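A sketch of how such a wind rose can be drawn with radial.plot(); the direction data here is simulated, skewed toward the southwest like the real counts:

```r
library(plotrix)

# Simulated game-time wind directions, skewed toward the southwest (~200 deg)
set.seed(1)
bins <- seq(0, 350, by = 10)
counts <- table(factor(sample(bins, 3000, replace = TRUE,
                              prob = dnorm(bins, mean = 200, sd = 60)),
                       levels = bins))

radial.plot(as.numeric(counts),
            radial.pos = bins * pi / 180,    # degrees -> radians
            clockwise = TRUE, start = pi/2,  # compass-style orientation
            rp.type = "p", line.col = "blue",
            main = "Games per wind direction (simulated)")
```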

This actually has me a little worried.  Though it’s hard to tell from the graphic, the lowest count is 23 games at due north (360 degrees), but you can generally see that relatively few games are played with winds from the north-northeast.

If we then take some baseball statistics, like hits, and divide them by the per-direction game counts above, we get this:

This seems to make sense:

  • More HR’s get hit when the wind is blowing out – well, duh
  • Interestingly, I’d expected the number of singles to be roughly equal for all directions, but there are those two spikes at 20 and 40 degrees.  When the wind is blowing in, do they take a break from swinging for the fences?
  • Could the builders of Fenway have oriented the park to take advantage of prevailing winds? I’ll have to do some research.

Growing up in New England, I’ve always equated “nor’easter” with a winter storm – and they don’t play many games in the winter or during nor’easters.  Still, the dearth of games which actually have winds from the north bothers me.

UPDATE (3/27/2012):  In Glenn Stout’s book, “Fenway 1912”, he says the park was oriented to match the previously existing field at the site – Huntington Avenue Grounds.  Also, rule 1.04 of MLB’s official rules states “it is desirable that the line from home base through the pitcher’s plate to second base shall run East-Northeast.”  This is supposedly so the setting sun shines in right field, where fewer balls are hit.  So it makes sense that an older park like Fenway would be built to follow this rule.

UPDATE (3/30/2012):  Dr. Steve at CCSU Weather tells me that my wind rose looks exactly as it should, since the weather data I used came from the Logan Airport station, which, due to its position on the coast, tends to have only land or sea breezes.  Also, as I stated above, they don’t play many Sox games during nor’easters.  The weather at Fenway and at Logan is probably not exactly the same, but I think the two will be close enough AND consistently different.  Still, there is no weather station at Fenway, and the game-time reports I’ve seen are all over the place (literally, when compared to the weather at Logan), so if the conditions are not consistent between the two locations, I’ll probably find no effects on runs scored.


Analytics in the work place

The other day I had a demo from our research group on Attensity.  After some successful text analytics projects using Perl and R (and one that just didn’t work because of the large volume of claim notes), we’re looking for something that can handle higher volumes.

We were told that due to its complexity and cost of licenses, we would not be allowed to use Attensity ourselves but rather would have to work thru the research unit.  Obviously, this pained me, but if that is the most cost effective approach then I’m all for it.

But I don’t think this is cost effective at all and here’s why.

Analytics today is still a youngster in the corporate world and the way we do it now is comparable to the way IT was done years ago.  It used to be that if you wanted any kind of programming done you had to go to the IT group, your work was prioritized (i.e. you made and presented some kind of cost benefit analysis), and if there were enough resources your project slogged forward.  By the time it was done, the resulting software may or may not have met your needs (which might have changed since you started).  Hence the Agile craze, but I digress.

Compare IT of yesterday to today.  I can do 99% of my job with the tools on my desktop – as much as possible on the back-end with Oracle and SQL Server, and then whatever else needs to be done with Excel/VBA, SAS (technically not on my desktop, but whatevs), Perl, and R.

Imagine if I had to go to IT every time I needed a new Excel macro.  The time (cost) to get work done would be outrageous.  Sure, there end up being a lot of customer-created “applications” out there, some of which get turned over to IT (much to their horror).  But what’s often forgotten is that only the top, most useful processes ever make it to “the show”.  IT may have to figure out how to turn Access SQL into Oracle SQL, but so what – think of all the savings – the business analyst’s job is practically done.  And IT never had to spend one second on the 80-90% of processes that either serve their purpose for a short time or are perhaps more experimental in nature.

So that brings us back to analytics today.  At some point some kind of analytics will be a tool included for free with Windows, just like VB Script is today.  And every reason you can give that this won’t happen has had a parallel reason in IT:

  • It’s too complicated (writing Windows apps used to be complicated; now it’s easy)
  • They’ll make mistakes – yup, they will.  But some of it will make it to The Show.
  • The software is too expensive – there are already free tools out there.

I’m not saying Enterprise Miner is going to be included in Windows 10.  But how about a simple decision tree?  While not the most powerful technique, I like to do a quick CART tree just to visualize some of the relationships (linear or not) in a dataset.  Really, you could say that Microsoft is already including data mining for very little additional cost in SQL Server.
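The kind of quick CART tree I mean is already free in R via rpart (one of the recommended packages that ships with R); iris stands in here for a real business dataset:

```r
library(rpart)

# A quick CART tree to eyeball relationships (linear or not) in a dataset
fit <- rpart(Species ~ ., data = iris, method = "class")
printcp(fit)             # complexity table: which splits earn their keep
plot(fit, margin = 0.1)  # draw the tree
text(fit, use.n = TRUE)  # label splits and leaf counts
```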

The reason I know this to be true is innovation.  There’s no way you can innovate with analytics by having to go thru some research unit.  The nature of innovation is such that 90% of what you try is going to suck.  As you get better, maybe you can bring that down to 85% (ok, maybe 88%).  Nobody is going to fund a project that has a 90% chance of sucking, thus the whole arrangement of having to go to a research unit to innovate will never last – either the practice or the company will end.

Luckily, our company is also carefully opening up to the use of Free and Open Source Software (FOSS), which is why we’re looking at using GATE for our large project.

R implementation of Stepwise Hybrid Feature Selection

From Dziuda’s Data Mining for Genomics and Proteomics

I had mentioned doing this while taking the Genomics class, and several people expressed interest in seeing it.  I was going to do it in some other language, but since taking Linear Models I’m much more comfortable in R.  Also, I found someone’s implementation of the T2 function, so I modified it (the website is in the notes).

The code contains comments and call examples.  Also, there’s another function at the bottom, WriteT2Values(), which writes a file of the T2 value for each column.  Per the note on the bottom of p. 147, instead of starting with any random variable, it’s probably a better idea to choose from, say, the top 1000.

I’ve been playing around with it for text mining – sometimes solve() errors out because the matrix is singular (I assume because the matrices in text mining are much sparser than in genomics).
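That solve() failure is easy to reproduce: in a sparse term matrix a column can be constant, which makes the pooled covariance singular.  One workaround (my assumption, not part of the algorithm in the book) is a Moore-Penrose pseudoinverse:

```r
# A term that never varies makes the covariance matrix singular
X <- matrix(c(1, 0,
              2, 0,
              3, 0), ncol = 2, byrow = TRUE)
C <- cov(X)              # second row/column is all zeros
# solve(C) stops with a "system is exactly singular" error:
failed <- inherits(try(solve(C), silent = TRUE), "try-error")

# Workaround (an assumption, not what Dziuda's algorithm specifies):
library(MASS)
Cinv <- ginv(C)          # pseudoinverse always exists
```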

I would appreciate any feedback or suggestions for improvement:

The code:

#Stepwise Hybrid Feature Selection
SHFS = function(TrainingData, stop_p, stop_T2, excludeColumns, classCondition, randomFlag=TRUE, J=2) {
#-- Input
#-- TrainingData
#-- J:  number of classes, default=2
#-- stop_p:  Stop when the number of parameters exceeds stop_p
#-- stop_T2: Stop when T2 exceeds stop_T2
#-- randomFlag:  TRUE or FALSE - starting point flag; default=TRUE
#-- excludeColumns:  columns to exclude from consideration, like id or class indicator
#-- classCondition:  a condition which differentiates the classes, e.g. quote(Hyp_Ind=='true')
#-- Call:
#--  SHFS(TrainingData = poe, stop_p = 5, stop_T2 = 10000, excludeColumns=c(1,2), 
#--         classCondition=quote(author=='Doyle'))

#-- Possible speed gains (?):
#-- Make tsquare use global TrainingData
#-- get rid of printing

N = length(TrainingData[,1])

pool <<- seq(1, ncol(TrainingData))
pool <<- pool[-excludeColumns]
poolSize = length(pool)
p = length(pool)

currentSet <<- NULL

markerSize = 0
currentT2 = 0.0

step = 1

cat("Starting : N=", N, ", J=", J, ", p=", p, ", stop_T2 =", stop_T2, ", stop_p =", stop_p,  ", markerSize =", markerSize, "\n")

while ( markerSize < stop_p  && currentT2 < stop_T2 && (markerSize < (N-J-2) || (N-J-2) == 0 ) && markerSize < p ) {
	maxGain = 0.0
	markerSize = markerSize + 1
	if (markerSize == 1 && randomFlag == TRUE) {
		# first time thru loop and caller wants to start with a random variable
		#-- Comment/Uncomment these lines depending on
		#-- which method of selecting a random var you want to use
		#selectedVar = as.integer(runif(1, 1, poolSize))
		selectedVar = selectRandomVar(1000)
		cat("selectedVar = ", selectedVar, "\n")
		# the T2 of the random starting variable is this step's gain
		maxGain = tsquare(c(currentSet, selectedVar), TrainingData, classCondition)
	} else {
		# Forward selection:  add the variable that maximizes T^2 of this step
		for (i in pool) {
			#-- call tsquare with currentSet plus i appended onto the end
			deltaT2 = tsquare(c(currentSet, i), TrainingData, classCondition) - currentT2
			if (deltaT2 >= maxGain) {
				maxGain = deltaT2
				selectedVar = i
			}
		} #for (i in pool)
	} #if (markerSize...)
	poolToMarker(selectedVar)	# move the winner from the pool into the marker set
	currentT2 = currentT2 + maxGain
	poolSize = poolSize - 1

	minLoss = currentT2
	#-- Backward optimization:  if elimination of any variable results in T2 greater
	#--  than that of previous step, eliminate one that minimally decreases T2
	if (markerSize > 2) {
		for (i in currentSet) {
			#-- call tsquare with currentSet without i
			deltaT2 = currentT2 - tsquare( currentSet[currentSet != i], TrainingData, classCondition )
			if (deltaT2 <= minLoss) {
				minLoss = deltaT2
				removedVar = i
			}
		}
		if (minLoss < maxGain) {
			markerToPool(removedVar)	# return the weakest variable to the pool
			poolSize = poolSize + 1
			markerSize = markerSize - 1
			currentT2 = currentT2 - minLoss
			#? saveSet(currentSet)
		}
	} # if (markerSize > 2)

	cat("Step=", step, "poolSize=", poolSize, ", currentT2=", currentT2, ", markerSize =", markerSize, "\n")
	step = step + 1
} #while

cat("currentT2=", currentT2, "\n")
return (currentSet)

} #function

selectRandomVar = function (n) {
# Select a random column from the top n (file written by WriteT2Values)

cols = read.csv("C:\\out-T2.csv")

cols = head(cols, n)

rowIdx = sample(nrow(cols), 1)

return(cols[rowIdx, 2])
} #function

poolToMarker = function(theVar) {
	#-- Add to currentSet
	currentSet <<- c(currentSet, theVar)
	#-- Remove from pool
	pool <<- pool[!pool==theVar]
} #function

markerToPool = function(theVar) {
	#-- Add back to the pool
	pool <<- c(pool, theVar)
	#-- Remove from currentSet
	currentSet <<- currentSet[!currentSet==theVar]
} #function

tsquare<-function(list1, data, classCondition){
#-- Function calculates Hotelling's T-square
#-- Adapted from:
#-- Changed from original to accept a list of columns - this way it can
#-- be called iteratively to try different sets of columns.  Also added
#-- the classCondition

#-- Inputs:
#-- list1 - a list of columns which should be used - this is so we don't have to use all of the dataframe, data
#-- data: a dataframe
#-- classCondition:  the "where" clause which differentiates the two classes (e.g. "Control_Ind == 'Y' ")

#-- Call:
#-- > tsquare(c(1,2,3,4), test, quote(Hyp_Ind=='true'))
#-- [1] 249.4002

#1) Make sure they are both matrices

group1 <- as.matrix(subset(data, select=list1, subset=(eval(classCondition))))
group2 <- as.matrix(subset(data, select=list1, subset=(!eval(classCondition))))

#-- Add matrix of all 1's to each group
#group1 <<- group1 + matrix(1, nrow=length(group1[,1]), ncol=length(group1[1,]))
#group2 <<- group2 + matrix(1, nrow=length(group2[,1]), ncol=length(group2[1,]))

#2) Find the mean vector and covariance matrix for the first group#
n1 <- length(group1[,1])
xbar1 <- apply(group1,2,mean)
C1 <- cov(group1)
#3) find the mean vector and covariance matrix for the second group#
n2 <- length(group2[,1])
xbar2 <- apply(group2,2,mean)
C2 <- cov(group2)
#4) find the pooled covariance matrix and its inverse#
C <- ((n1-1)*C1+(n2-1)*C2)/(n1+n2-2)
Cinv <- solve(C)  

#5) multiply to find the T-square stat#
Tsquare <- (n1*n2)*t(xbar1-xbar2)%*%Cinv%*%(xbar1-xbar2)/(n1+n2)
#-- Optionally convert T2 to an F statistic and p-value:
#p <- length(group1[1,])
#F <- (n1+n2-p-1)*Tsquare/((n1+n2-2)*p)
#p.value <- 1 - pf(F,df1=p,df2=n1+n2-p-1)
return(as.numeric(Tsquare))
} #function

WriteT2Values = function(data, excludeColumns, classCondition, outFile) {
#-- Write the top n T2 values to a file
#-- Inputs:
#--		excludeColumns - any columns like index and class Condition we don't want to include
#--		classCondition - the condition which separates the classes
#--		outFile - name of output file
#-- Call:
# WriteT2Values(someData, c(1,2), classCondition=quote(author=='Doyle'), "C:\\data\\out-T2.csv")

pool <<- seq(1, ncol(data))
pool <<- pool[-excludeColumns]

DF = data.frame()

for (i in pool) {
		T2 = tsquare(i, data, classCondition)
		newRow = data.frame(col=i, T2Value = T2)
		DF <- rbind(DF, newRow)
} #for

write.csv(DF[order(DF$T2Value, decreasing=TRUE),], outFile)
} #function