animals=read.table("http://www.unt.edu/rss/class/mike/data/animals.txt", header=T) install.packages(c("MASS","gbm","rpart","RWeka"),contriburl = "http://cran.stat.ucla.edu/bin/windows/contrib/2.6/") library(MASS) library(gbm) library(rpart) library(RWeka) #Logistic Regression GLM.0= glm(decision ~ 1, family=binomial(logit), data=animals) summary(GLM.0) AIC(GLM.0) # BIC GLM.1 <- glm(decision ~ idealism + relativism + gender + research, family=binomial(logit), data=animals) summary(GLM.1) confint(GLM.1, level=.95, type="LR") AIC(GLM.1) # BIC indicates a significantly better model #First compare the two model BIC (lower better). Here is a statistical test. anova(GLM.0,GLM.1, test="Chisq") #predict(GLM.1, type="response") #predicted probabilities if you want to see them table(fitted(GLM.1)>.50,GLM.1$y) #Except for rounding, this will reproduce the table from SPSS #226/315 = 71.7% correct classification #DFA on the same data dfa.mod <- lda(decision ~ idealism + relativism + gender + research, data=animals) dfa.mod predclass=predict(dfa.mod)$class table(predclass,animals$decision) #228/315 = 72.4% correct classification, slightly better than logreg #Basic tree treemod=rpart(decision ~ idealism + relativism + gender + research, method="class",data=animals) plot(treemod, uniform = T, branch = 1, compress = F, margin = 0.1) text(treemod, all = T, use.n=T, fancy = T, fwidth=0,fheight=0) #Maximize the plot. For reference: Gender a=male, b=female; Research: a through e. cosmetic Human Medical Meat theory veterinary table(predict(treemod, animals, type="class"), animals$decision) #77.5% classification, notably better. #Pruned tree prunetree<- prune(treemod, cp= .02) #cp is a complexity parameter. It might be useful to think of this from regression terms, where a cp=.05 would mean we'd need an Rsq increase of .05 before bothering to split again. However here it is conceived in terms of misclassification rate. #Alternatively, one can create trees based on a desirable number of splits, the minimum number of cases that should be in a terminal node etc. plot(prunetree, uniform = T, branch = 1, compress = F, margin = 0.1) text(prunetree, all = T, use.n=T, fancy = T, fwidth=0,fheight=0) table(predict(prunetree, animals, type="class"), animals$decision) #Weka is a meta function in which one can perform a variety of functions using it. These are just a couple examples. #Kmeans clustering wekaclust=IBk(decision ~ idealism + relativism + gender + research, data=animals,control = Weka_control(K=17)) #k = 17 neighbors (I just used sqrt of N). If this isn't specified it will use k=1 and classification is 99% but obviously overfitted and does much worse w/ new data. summary(wekaclust,class=T,numFolds=10) #Built in k-fold validation. May delete numFolds part if desired to just see the first solution. It does very well with new data. #Tree approach similar to before. Uses the J48. wekatree=J48(decision ~ idealism + relativism + gender + research, data=animals,control = Weka_control(B=T)) #J48 is the algorithm used for determining trees. B = t forces binary splits only. #wekatree #If you want to see the cut points summary(wekatree,class=T) #This approach is about the best for this data. 83% correct classification and about as good with new data as a regular logreg was with the original. #plot(wekatree) #Requires the party package and several dependencies #Bagging wekabag=Bagging(decision ~ idealism + relativism + gender + research, data=animals,control = Weka_control(N=10)) #N=10 refers to data splits to be used in the pruning process) summary(wekabag,class=T) #also very good classification #Boosting #Boosted logistic using Weka. wekalogit=LogitBoost(decision ~ idealism + relativism + gender + research, data=animals,control = Weka_control(Q=T)) #Q tells it to use bootstrapped samples instead of boosting to the original. Yes, a bootstrapped boost. summary(wekalogit,class=T) #Better than the regular logreg but not much #gbm decision2=as.numeric(animals$decision)-1 #make outcome a 0,1 numeric variable that gbm requires. 1 will actually refer to 'stop' the research. gbm1=gbm(decision2 ~ idealism + relativism + gender + research, data=animals, distribution="bernoulli",n.trees=10000, cv.folds=10, verbose=F) best.iter=gbm.perf(gbm1,method="cv") #Finding the best model. The training data error rate will continue to go down but we eventually begin to overfit and not generalize, hence test error goes up. summary(gbm1,n.trees=best.iter) #summary of the best model. Plots the relative importance of the variables. plot.gbm(gbm1,1:3,best.iter) #Fun plots. The y-axis is log odds. As an example of interpretation f.predict <- predict.gbm(gbm1, animals,n.trees=best.iter, type="response") table(decision2,f.predict>.50) #classification table. 72.4% overall, for test data (i.e. this is how well it's doing with 'new' data). par(mfrow=c(2,2)) plot.gbm(gbm1,1,best.iter) #Same as the above pretty graph but one variable at a time plot.gbm(gbm1,2,best.iter) plot.gbm(gbm1,3,best.iter) plot.gbm(gbm1,4,best.iter) par(mfrow=c(1,1))