R implementation of Stepwise Hybrid Feature Selection

From Dziuda’s Data Mining for Genomics and Proteomics

I had mentioned doing this while taking the Genomics class and several people expressed interest in seeing it.  I was going to do it in some other language, but since taking Linear Models I’m much more comfortable in R.  Also, I found someone’s implementation of the T2 function so I modified it (website is in the notes).

The code contains comments and call examples.  Also, there’s another function at the bottom, WriteT2Values(), which writes a file of the T2 for each column.  Per the note on the bottom of p. 147 instead of selecting any random variable to start with it’s probably a better idea to choose from the top 1000, for example.

I’ve been playing around with it for text mining – sometimes the solve() errors out because the matrix is singular (which I’m assuming is because the matrices in text mining are much more sparse than in genomics).

I would appreciate any feedback or suggestions for improvement:

The code:


#Stepwise Hybrid Feature Selection
SHFS = function(TrainingData, stop_p, stop_T2, excludeColumns, classCondition, randomFlag=TRUE, J=2) {
#-- Input
#-- TrainingData
#-- J:  number of classes, default=2
#-- stop_p:  Stop when number of parameters excedes stop_p
#-- stop_T2: Stop when T2 excedes  stop_T2
#-- randomFlag:  TRUE or FALSE  - Starting point flag; default=TRUE
#-- excludeColumns:  columns to exclude from consideration like id or class indictor
#-- classCondition:  a condition which differentiates the classes, e.g. quote(Hyp_Ind=='true')
#-- Call:
#--  SHFS(TrainingData = poe, stop_p = 5, stop_T2 = 10000, excludeColumns=c(1,2), 
#--         classCondition=quote(author=='Doyle'))

#-- Possible speed gains (?):
#-- Make tsquare use global TrainingData
#-- get rid of printing



N = length(TrainingData[,1])

pool <<- seq(1, ncol(TrainingData))
pool <<- pool[-excludeColumns]
poolSize = length(pool)
p = length(pool)

currentSet <<- NULL


markerSize = 0
currentT2 = 0.0

step = 1

cat("Starting : N=", N, ", J=", J, ", p=", p, ", stop_T2 =", stop_T2, ", stop_p =", stop_p,  ", markerSize =", markerSize, "\n")

while ( markerSize < stop_p  && currentT2 < stop_T2 && (markerSize < (N-J-2) || (N-J-2 ==0) ) && markerSize < p ) {
maxGain = 0.0
markerSize = markerSize + 1
if (markerSize == 1 && randomFlag == TRUE) {
	# first time thru loop and caller wants to start with a random variable 
		#-- Comment/Uncomment these lines depending on 
		#-- which method of selecting a random var you want to use
		#selectedVar = as.integer(runif(1, 1, poolSize))
		selectedVar = selectRandomVar(1000)
		cat("selectedVar = ", selectedVar, "\n")
		
	} else {
	# Forward selection:  add the variable that maximizes T^2 of this step
	for (i in pool) {
		#-- call tsquare with currentSet plus i appended onto the end
		deltaT2 = tsquare(c(currentSet, i), TrainingData, classCondition) - currentT2
		if (deltaT2 >= maxGain) {
			maxGain = deltaT2
			selectedVar = i 
		}
	
	} #for (i in pool)
} #if (markerSize...)
currentT2 = currentT2 + maxGain
poolToMarker(selectedVar)
poolSize = poolSize - 1


minLoss = currentT2
#-- Backward optimization:  if elimination of any variable results in T2 greater
#--  than that of previous step, eliminate one that minimally decreases T2
if (markerSize > 2) {
	for (i in currentSet) {
		#-- call tsquare with currentSet without i
		deltaT2 = currentT2 - tsquare( currentSet[currentSet != i], TrainingData, classCondition )
		if (deltaT2 <= minLoss ) {
			minLoss = deltaT2
			removedVar = i
		}
	
	}
	
	if (minLoss < maxGain) {
		markerToPool(i)
		poolSize = poolSize + 1
		markerSize = markerSize - 1
		currentT2 = currentT2 - minLoss
	}
	
	#? saveSet(currentSet)

} # if (markerSize > 2)


cat("Step=", step, "poolSize=", poolSize, ", currentT2=", currentT2, ", markerSize =", markerSize, "\n")
step = step + 1
flush.console()
} #while

cat("currentT2=", currentT2, "\n")
return (currentSet)

} #function

selectRandomVar = function (n) {
# Select random column from top n

cols = read.csv("C:\\out-T2.csv")

cols = head(cols, n)

rowIdx = sample(nrow(cols), 1)

return(cols[rowIdx, 2])


}

poolToMarker = function(theVar) {
	#-- Add to currentSet
	currentSet <<- c(currentSet, theVar)
	#append(currentSet, theVar)
	#-- Remove from pool
	pool <<- pool[!pool==theVar]
	
}

markerToPool = function(theVar) {
	#-- Add to currentSet
	#append(pool, theVar)
	pool <<- c(pool, theVar)
	#-- Remove from pool
	currentSet <<- currentSet[!currentSet==theVar]
	
}


tsquare<-function(list1, data, classCondition){
#-- Function calculates Hotelling's T-square
#-- Adapted from:  http://www.stat.sc.edu/~habing/courses/530rF03.html
#-- Changed from original to accept a list of columns - this way it can
#-- be iteratively to try different sets of columns.  Also added 
#-- the classCondition

#-- Inputs:
#-- list1 - a list of columns which should be used - this is so we don't have to use all of the dataframe, data
#-- data: a dataframe
#-- classCondition:  the "where" clause which differentiates the two classes (e.g. "Control_Ind == 'Y' ")

#-- Call:
#-- > tsquare(c(1,2,3,4), test, quote(Hyp_Ind=='true'))
#-- [1] 249.4002


##
#1) Make sure they are both matrices
##

group1 <- as.matrix(subset(data, select=list1, subset=(eval(classCondition))))
group2 <- as.matrix(subset(data, select=list1, subset=(!eval(classCondition))))

#-- Add matrix of all 1's to each group
#group1 <<- group1 + matrix(1, nrow=length(group1[,1]), ncol=length(group1[1,]))
#group2 <<- group2 + matrix(1, nrow=length(group2[,1]), ncol=length(group2[1,]))


##
#2) Find the mean vector and covariance matrix for the first group#
##
n1 <- length(group1[,1])
xbar1 <- apply(group1,2,mean)
C1 <- cov(group1)
##
#3) find the mean vector and covariance matrix for the second group#
##
n2 <- length(group2[,1])
xbar2 <- apply(group2,2,mean)
C2 <- cov(group2)
##
#4) find the pooled covariance matrix and its inverse#
##
C <- ((n1-1)*C1+(n2-1)*C2)/(n1+n2-2)
Cinv <- solve(C)  


##
#5) multiply to find the T-square stat and convert it to F and p-value#
##
Tsquare <- (n1*n2)*t(xbar1-xbar2)%*%Cinv%*%(xbar1-xbar2)/(n1+n2)
#p <- length(group1[1,])
#F <- (n1+n2-p-1)*Tsquare/((n1+n2-2)*p)
#p.value <- 1 - pf(F,df1=p,df2=n1+n2-p-1)
##
#Finally, output all of the results#
##
#return(n1,xbar1,C1,xbar2,C2,n2,C,p,Tsquare,F,p.value)
return(as.numeric(Tsquare))

}


WriteT2Values = function(data, excludeColumns, classCondition, outFile) {
#-- Write the top n T2 values to a file
#-- Inputs:
#--		excludeColumns - any columns like index and class Condition we don't want to include
#--		classCondition - the condition which separates the classes
#--		outFile - name of output file
#-- Call:
# WriteT2Values(someData, c(1,2), classCondition=quote(author=='Doyle'), "C:\\data\\out-T2.csv")

pool <<- seq(1, ncol(data))
pool <<- pool[-excludeColumns]

DF = data.frame()

for (i in pool) {

		T2 = tsquare(i, data , classCondition)
		newRow = data.frame(col=i, T2Value = T2)
		DF <- rbind(DF, newRow)
		
} 

write.csv(DF[order(DF$T2Value, decreasing=TRUE),], outFile)

}
Advertisements

Leave a Reply

Fill in your details below or click an icon to log in:

WordPress.com Logo

You are commenting using your WordPress.com account. Log Out / Change )

Twitter picture

You are commenting using your Twitter account. Log Out / Change )

Facebook photo

You are commenting using your Facebook account. Log Out / Change )

Google+ photo

You are commenting using your Google+ account. Log Out / Change )

Connecting to %s

%d bloggers like this: