lloc<-function(x,est=tmean,...){ if(!is.list(x))val<-est(x,...) if(is.list(x)){ val<-NA for(i in 1:length(x))val[i]<-est(x[[i]],...) } if(is.matrix(x))val<-apply(x,2,est,...) val } ghdist<-function(n,g=0,h=0){ # # generate n observations from a g and h dist. # x<-rnorm(n) if (g>0){ ghdist<-(exp(g*x)-1)*exp(h*x^2/2)/g } if(g==0)ghdist<-x*exp(h*x^2/2) ghdist } wincor<-function(x,y,tr=.2){ # Compute the Winsorized correlation between x and y. # # tr is the amount of Winsorization # This function also returns the Winsorized covariance # # Pairwise deletion of missing values is performed. # sig<-NA if(length(x)!=length(y))stop("Lengths of vectors are not equal") if(sum(is.na(c(x,y)))!=0){ m1<-matrix(c(x,y),nrow=length(x),ncol=2) m1<-elimna(m1) x<-m1[,1] y<-m1[,2] } g<-floor(tr*length(x)) xvec<-winval(x,tr) yvec<-winval(y,tr) wcor<-cor(xvec,yvec) wcov<-var(xvec,yvec) if(sum(x==y)!=length(x)){ test<-wcor*sqrt((length(x)-2)/(1.-wcor^2)) sig<-2*(1-pt(abs(test),length(x)-2*g-2)) } list(cor=wcor,cov=wcov,siglevel=sig) } bivar<-function(x){ # compute biweight midvariance of x m<-median(x) u<-abs((x-m)/(9*qnorm(.75)*mad(x))) av<-ifelse(u<1,1,0) top<-length(x)*sum(av*(x-m)^2*(1-u^2)^4) bot<-sum(av*(1-u^2)*(1-5*u^2)) bi<-top/bot^2 bi } mjse<-function(x,q=.5){ # # Compute the Maritz-Jarrett estimate of the standard error of # X sub m, m=[qn+.5] # The default value for q is .5 # n<-length(x) m<-floor(q*n+.5) vec<-seq(along=x) w<-pbeta(vec/n,m-1,n-m)-pbeta((vec-1)/n,m-1,n-m) # W sub i values y<-sort(x) c1<-sum(w*y) c2<-sum(w*y*y) mjse<-sqrt(c2-c1^2) mjse } pbvar<-function(x,beta=.2){ # Compute the percentage bend midvariance # # beta is the bending constant for omega sub N. # pbvar=0 w<-abs(x-median(x)) w<-sort(w) m<-floor((1-beta)*length(x)+.5) omega<-w[m] if(omega>0){ y<-(x-median(x))/omega z<-ifelse(y>1,1,y) z<-ifelse(z<(-1),-1,z) pbvar<-length(x)*omega^2*sum(z^2)/(length(x[abs(y)<1]))^2 } pbvar } win<-function(x,tr=.2){ # # Compute the gamma Winsorized mean for the data in the vector x. # # tr is the amount of Winsorization # y<-sort(x) n<-length(x) ibot<-floor(tr*n)+1 itop<-length(x)-ibot+1 xbot<-y[ibot] xtop<-y[itop] y<-ifelse(y<=xbot,xbot,y) y<-ifelse(y>=xtop,xtop,y) win<-mean(y) win } hd<-function(x,q=.5){ # # Compute the Harrell-Davis estimate of the qth quantile # # The vector x contains the data, # and the desired quantile is q # The default value for q is .5. # if(length(x)!=length(x[!is.na(x)]))stop("Remove missing values from x") n<-length(x) m1<-(n+1)*q m2<-(n+1)*(1-q) vec<-seq(along=x) w<-pbeta(vec/n,m1,m2)-pbeta((vec-1)/n,m1,m2) # W sub i values y<-sort(x) hd<-sum(w*y) hd } mestse<-function(x,bend=1.28,op=2){ # # Estimate the standard error of M-estimator using Huber's Psi # using estimate of influence function # n<-length(x) mestse<-sqrt(sum((ifmest(x,bend,op=2)^2))/(n*(n-1))) mestse } omega<-function(x,beta=.1){ # Compute the estimate of the measure omega as described in # chapter 3. # The default value is beta=.1 because this function is used to # compute the percentage bend midvariance. 
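# The estimate is the [(1-beta)n+.5]th smallest of the values |X-M|, M being
# the sample median, divided by qnorm(1-beta/2) so that omega estimates sigma
# when sampling from a normal distribution (with beta=.1 the divisor is
# qnorm(.95), approximately 1.645).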
# y<-abs(x-median(x)) y<-sort(y) m<-floor((1-beta)*length(x)+.5) omega<-y[m]/qnorm(1-beta/2) # omega is rescaled to equal sigma # under normality omega } qse<-function(x,q=.5,op=3){ # # Compute the standard error of qth sample quantile estimator # based on the single order statistic, x sub ([qn+.5]) (See Ch 3) # # Store the data in vector # x, and the desired quantile in q # The default value for q is .5 # # op=1 Use Rosenblatt's shifted histogram # op=2 Use expected frequency curve # op=3 Use adaptive kernel density estimator # y <- sort(x) n <- length(x) iq <- floor(q * n + 0.5) qest <- y[iq] fhat<-NA if(op==1)fhat<-kerden(x,q) if(op==2)fhat<-rdplot(x,pts=qest,pyhat=T,plotit=F) if(op==3)fhat<-akerd(x,pts=qest,pyhat=T,plotit=F) if(is.na(fhat[1]))stop("Something wrong, op should be 1 or 2 or 3") qse<-1/(2*sqrt(length(x))*fhat) qse } winval<-function(x,tr=.2){ # # Winsorize the data in the vector x. # tr is the amount of Winsorization which defaults to .2. # # This function is used by several other functions that come with this book. # y<-sort(x) n<-length(x) ibot<-floor(tr*n)+1 itop<-length(x)-ibot+1 xbot<-y[ibot] xtop<-y[itop] winval<-ifelse(x<=xbot,xbot,x) winval<-ifelse(winval>=xtop,xtop,winval) winval } hdseb<-function(x,q=.5,nboot=100,SEED=T){ # # Compute bootstrap estimate of the standard error of the # Harrell-Davis estimator of the qth quantile. # The default quantile is the median, q=.5 # The default number of bootstrap samples is nboot=100 # if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. data<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,hd,q) hdseb<-sqrt(var(bvec)) hdseb } mestseb<-function(x,nboot=1000,bend=1.28){ # # Compute bootstrap estimate of the standard error of the # M-estimators with Huber's Psi. # The default percentage bend is bend=1.28 # The default number of bootstrap samples is nboot=100 # set.seed(1) # set seed of random number generator so that # results can be duplicated. data<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,mest) mestseb<-sqrt(var(bvec)) mestseb } onestep<-function(x,bend=1.28,na.rm=F){ # # Compute one-step M-estimator of location using Huber's Psi. # The default bending constant is 1.28 # if(na.rm)x<-x[!is.na(x)] y<-(x-median(x))/mad(x) #mad in splus is madn in the book. A<-sum(hpsi(y,bend)) B<-length(x[abs(y)<=bend]) onestep<-median(x)+mad(x)*A/B onestep } trimse<-function(x,tr=.2,na.rm=F){ # # Estimate the standard error of the gamma trimmed mean # The default amount of trimming is tr=.2. # if(na.rm)x<-x[!is.na(x)] trimse<-sqrt(winvar(x,tr))/((1-2*tr)*sqrt(length(x))) trimse } winvar<-function(x,tr=.2,na.rm=F){ # # Compute the gamma Winsorized variance for the data in the vector x. # tr is the amount of Winsorization which defaults to .2. # if(na.rm)x<-x[!is.na(x)] y<-sort(x) n<-length(x) ibot<-floor(tr*n)+1 itop<-length(x)-ibot+1 xbot<-y[ibot] xtop<-y[itop] y<-ifelse(y<=xbot,xbot,y) y<-ifelse(y>=xtop,xtop,y) winvar<-var(y) winvar } mest<-function(x,bend=1.28,na.rm=F){ # # Compute M-estimator of location using Huber's Psi. # The default bending constant is 1.28 # if(na.rm)x<-x[!is.na(x)] if(mad(x)==0)stop("MAD=0. The M-estimator cannot be computed.") y<-(x-median(x))/mad(x) #mad in splus is madn in the book. 
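# The M-estimate of location is the value muhat satisfying
# sum(hpsi((x-muhat)/mad(x)))=0. Starting from the sample median, each
# iteration below adds mad(x)*A/B to the current estimate, where A is the sum
# of the Psi values and B is the number of observations with |y|<=bend;
# iteration stops when successive estimates differ by less than .0001.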
A<-sum(hpsi(y,bend)) B<-length(x[abs(y)<=bend]) mest<-median(x)+mad(x)*A/B repeat{ y<-(x-mest)/mad(x) A<-sum(hpsi(y,bend)) B<-length(x[abs(y)<=bend]) newmest<-mest+mad(x)*A/B if(abs(newmest-mest) <.0001)break mest<-newmest } mest } hpsi<-function(x,bend=1.28){ # # Evaluate Huber`s Psi function for each value in the vector x # The bending constant defaults to 1.28. # hpsi<-ifelse(abs(x)<=bend,x,bend*sign(x)) hpsi } hdci<-function(x,q=.5,alpha=.05,nboot=100,SEED=T){ # # Compute a 1-alpha confidence for qth quantile using the # Harrell-Davis estimator in conjunction with the # bootstrap estimate of the standard error. # # The default quantile is .5. # The default value for alpha is .05. # se<-hdseb(x,q,nboot,SEED=SEED) crit<-.5064/(length(x)^(.25))+1.96 if(q<=.2 || q>=.8){ if(length(x) <=20)crit<-(-6.23)/length(x)+5.01 } if(q<=.1 || q>=.9){ if(length(x) <=40)crit<-36.2/length(x)+1.31 } if(length(x)<=10){ print("The number of observations is less than 11.") print("Accurate critical values have not been determined for this case.") } low<-hd(x,q)-crit*se hi<-hd(x,q)+crit*se list(ci=c(low,hi),crit=crit,se=se) } mestci<-function(x,alpha=.05,nboot=399,bend=1.28,os=F){ # # Compute a bootstrap, .95 confidence interval for the # M-estimator of location based on Huber's Psi. # The default percentage bend is bend=1.28 # The default number of bootstrap samples is nboot=399 # # By default, the fully iterated M-estimator is used. To use the # one-step M-estimator instead, set os=T # os<-as.logical(os) if(length(x) <=19) print("The number of observations is less than 20.") print("This function might fail due to division by zero,") print("which in turn causes an error in function hpsi") print("having to do with a missing value.") set.seed(1) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) if(!os)bvec<-apply(data,1,mest,bend) if(os)bvec<-apply(data,1,onestep,bend) bvec<-sort(bvec) low<-round((alpha/2)*nboot) up<-nboot-low low<-low+1 list(ci=c(bvec[low],bvec[up])) } sint<-function(x,alpha=.05){ # # Compute a 1-alpha confidence interval for the median using # the Hettmansperger-Sheather interpolation method. # # The default value for alpha is .05. # k<-qbinom(alpha/2,length(x),.5) gk<-pbinom(length(x)-k,length(x),.5)-pbinom(k-1,length(x),.5) if(gk >= 1-alpha){ gkp1<-pbinom(length(x)-k-1,length(x),.5)-pbinom(k,length(x),.5) kp<-k+1 } if(gk < 1-alpha){ k<-k-1 gk<-pbinom(length(x)-k,length(x),.5)-pbinom(k-1,length(x),.5) gkp1<-pbinom(length(x)-k-1,length(x),.5)-pbinom(k,length(x),.5) kp<-k+1 } xsort<-sort(x) nmk<-length(x)-k nmkp<-nmk+1 ival<-(gk-1+alpha)/(gk-gkp1) lam<-((length(x)-k)*ival)/(k+(length(x)-2*k)*ival) low<-lam*xsort[kp]+(1-lam)*xsort[k] hi<-lam*xsort[nmk]+(1-lam)*xsort[nmkp] sint<-c(low,hi) sint } b2ci<-function(x,y,alpha=.05,nboot=2000,est=bivar,...){ # # Compute a bootstrap confidence interval for the # the difference between any two parameters corresponding to # independent groups. # By default, biweight midvariances are compared. # Setting est=mean, for example, will result in a percentile # bootstrap confidence interval for the difference between means. # The default number of bootstrap samples is nboot=399 # x<-x[!is.na(x)] # Remove any missing values in x y<-y[!is.na(y)] # Remove any missing values in y set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. 
Please wait.") datax<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) datay<-matrix(sample(y,size=length(y)*nboot,replace=T),nrow=nboot) bvecx<-apply(datax,1,est,...) bvecy<-apply(datay,1,est,...) bvec<-sort(bvecx-bvecy) low <- round((alpha/2) * nboot) + 1 up <- nboot - low temp <- sum(bvec < 0)/nboot + sum(bvec == 0)/(2 * nboot) sig.level <- 2 * (min(temp, 1 - temp)) list(ci = c(bvec[low], bvec[up]), p.value = sig.level) } ecdf<-function(x,val){ # compute empirical cdf for data in x evaluated at val # That is, estimate P(X <= val) # ecdf<-length(x[x<=val])/length(x) ecdf } kswsig<-function(m,n,val){ # # Compute significance level of the weighted # Kolmogorov-Smirnov test statistic # # m=sample size of first group # n=sample size of second group # val=observed value of test statistic # mpn<-m+n cmat<-matrix(0,m+1,n+1) umat<-matrix(0,m+1,n+1) for (i in 1:m-1){ for (j in 1:n-1)cmat[i+1,j+1]<-abs(i/m-j/n)*sqrt(m*n/((i+j)*(1-(i+j)/mpn))) } cmat<-ifelse(cmat<=val,1,0) for (i in 0:m){ for (j in 0:n)if(i*j==0)umat[i+1,j+1]<-cmat[i+1,j+1] else umat[i+1,j+1]<-cmat[i+1,j+1]*(umat[i+1,j]+umat[i,j+1]) } term<-lgamma(m+n+1)-lgamma(m+1)-lgamma(n+1) kswsig<-1.-umat[m+1,n+1]/exp(term) kswsig } kswsig<-function(m,n,val){ # # Compute significance level of the weighted # Kolmogorov-Smirnov test statistic # # m=sample size of first group # n=sample size of second group # val=observed value of test statistic # mpn<-m+n cmat<-matrix(0,m+1,n+1) umat<-matrix(0,m+1,n+1) for (i in 1:m-1){ for (j in 1:n-1)cmat[i+1,j+1]<-abs(i/m-j/n)*sqrt(m*n/((i+j)*(1-(i+j)/mpn))) } cmat<-ifelse(cmat<=val,1,0) for (i in 0:m){ for (j in 0:n)if(i*j==0)umat[i+1,j+1]<-cmat[i+1,j+1] else umat[i+1,j+1]<-cmat[i+1,j+1]*(umat[i+1,j]+umat[i,j+1]) } term<-lgamma(m+n+1)-lgamma(m+1)-lgamma(n+1) kswsig<-1.-umat[m+1,n+1]/exp(term) kswsig } binomci<-function(x=sum(y),nn=length(y),y=NA,n=NA,alpha=.05){ # Compute a 1-alpha confidence interval for p, the probability of # success for a binomial distribution, using Pratt's method # # y is a vector of 1s and 0s. # x is the number of successes observed among n trials # if(nn==1)stop("Something is wrong: number of observations is only 1") n<-nn if(x!=n && x!=0){ z<-qnorm(1-alpha/2) A<-((x+1)/(n-x))^2 B<-81*(x+1)*(n-x)-9*n-8 C<-(0-3)*z*sqrt(9*(x+1)*(n-x)*(9*n+5-z^2)+n+1) D<-81*(x+1)^2-9*(x+1)*(2+z^2)+1 E<-1+A*((B+C)/D)^3 upper<-1/E A<-(x/(n-x-1))^2 B<-81*x*(n-x-1)-9*n-8 C<-3*z*sqrt(9*x*(n-x-1)*(9*n+5-z^2)+n+1) D<-81*x^2-9*x*(2+z^2)+1 E<-1+A*((B+C)/D)^3 lower<-1/E } if(x==0){ lower<-0 upper<-1-alpha^(1/n) } if(x==1){ upper<-1-(alpha/2)^(1/n) lower<-1-(1-alpha/2)^(1/n) } if(x==n-1){ lower<-(alpha/2)^(1/n) upper<-(1-alpha/2)^(1/n) } if(x==n){ lower<-alpha^(1/n) upper<-1 } phat<-x/n list(phat=phat,ci=c(lower,upper)) } kssig<-function(m,n,val){ # # Compute significance level of the Kolmogorov-Smirnov test statistic # m=sample size of first group # n=sample size of second group # val=observed value of test statistic # cmat<-matrix(0,m+1,n+1) umat<-matrix(0,m+1,n+1) for (i in 0:m){ for (j in 0:n)cmat[i+1,j+1]<-abs(i/m-j/n) } cmat<-ifelse(cmat<=val,1e0,0e0) for (i in 0:m){ for (j in 0:n)if(i*j==0)umat[i+1,j+1]<-cmat[i+1,j+1] else umat[i+1,j+1]<-cmat[i+1,j+1]*(umat[i+1,j]+umat[i,j+1]) } term<-lgamma(m+n+1)-lgamma(m+1)-lgamma(n+1) kssig<-1.-umat[m+1,n+1]/exp(term) kssig } meemul<-function(x,alpha=.05){ # # Perform Mee's method for all pairs of J independent groups. # The familywise type I error probability is controlled by using # a critical value from the Studentized maximum modulus distribution. 
# # The data are assumed to be stored in $x$ in list mode. # Length(x) is assumed to correspond to the total number of groups, J # It is assumed all groups are independent. # # Missing values are automatically removed. # # The default value for alpha is .05. Any other value results in using # alpha=.01. # if(!is.list(x))stop("Data must be stored in list mode.") J<-length(x) CC<-(J^2-J)/2 test<-matrix(NA,CC,5) for(j in 1:J){ xx<-!is.na(x[[j]]) val<-x[[j]] x[[j]]<-val[xx] # Remove missing values } dimnames(test)<-list(NULL,c("Group","Group","phat","ci.lower","ci.upper")) jcom<-0 crit<-smmcrit(200,CC) if(alpha!=.05)crit<-smmcrit01(200,CC) alpha<-1-pnorm(crit) for (j in 1:J){ for (k in 1:J){ if (j < k){ temp<-mee(x[[j]],x[[k]],alpha) jcom<-jcom+1 test[jcom,1]<-j test[jcom,2]<-k test[jcom,3]<-temp$phat test[jcom,4]<-temp$ci[1] test[jcom,5]<-temp$ci[2] }}} list(test=test) } tsub<-function(isub,x,y,tr){ # # Compute test statistic for trimmed means # when comparing dependent groups. # By default, 20% trimmed means are used. # isub is a vector of length n, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # # This function is used by ydbt # tsub<-yuend(x[isub],y[isub],tr=tr)$teststat tsub } deciles<-function(x){ # # Estimate the deciles for the data in vector x # using the Harrell-Davis estimate of the qth quantile # xs<-sort(x) n<-length(x) vecx<-seq(along=x) xq<-0 for (i in 1:9){ q<-i/10 m1<-(n+1)*q m2<-(n+1)*(1-q) wx<-pbeta(vecx/n,m1,m2)-pbeta((vecx-1)/n,m1,m2) # W sub i values xq[i]<-sum(wx*xs) } xq } kstiesig<-function(x,y,val){ # # Compute significance level of the Kolmogorov-Smirnov test statistic # for the data in x and y. # This function allows ties among the values. # val=observed value of test statistic # m<-length(x) n<-length(y) z<-c(x,y) z<-sort(z) cmat<-matrix(0,m+1,n+1) umat<-matrix(0,m+1,n+1) for (i in 0:m){ for (j in 0:n){ if(abs(i/m-j/n)<=val)cmat[i+1,j+1]<-1e0 k<-i+j if(k > 0 && k < m+n && z[k]==z[k+1])cmat[i+1,j+1]<-1e0 }} for (i in 0:m){ for (j in 0:n)if(i*j==0)umat[i+1,j+1]<-cmat[i+1,j+1] else umat[i+1,j+1]<-cmat[i+1,j+1]*(umat[i+1,j]+umat[i,j+1]) } term<-lgamma(m+n+1)-lgamma(m+1)-lgamma(n+1) kstiesig<-1.-umat[m+1,n+1]/exp(term) kstiesig } yuen<-function(x,y,tr=.2,alpha=.05){ # # Perform Yuen's test for trimmed means on the data in x and y. # The default amount of trimming is tr=.2 # Missing values (values stored as NA) are automatically removed. # # A confidence interval for the trimmed mean of x minus the # trimmed mean of y is computed, and a p-value is returned. # if(tr>.25)print("Warning: with tr>.25 type I error control might be poor") x<-x[!is.na(x)] # Remove any missing values in x y<-y[!is.na(y)] # Remove any missing values in y h1<-length(x)-2*floor(tr*length(x)) h2<-length(y)-2*floor(tr*length(y)) q1<-(length(x)-1)*winvar(x,tr)/(h1*(h1-1)) q2<-(length(y)-1)*winvar(y,tr)/(h2*(h2-1)) df<-(q1+q2)^2/((q1^2/(h1-1))+(q2^2/(h2-1))) crit<-qt(1-alpha/2,df) dif<-mean(x,tr)-mean(y,tr) low<-dif-crit*sqrt(q1+q2) up<-dif+crit*sqrt(q1+q2) test<-abs(dif/sqrt(q1+q2)) yuen<-2*(1-pt(test,df)) list(ci=c(low,up),p.value=yuen,dif=dif,se=sqrt(q1+q2),teststat=test,crit=crit,df=df) } shifthd<-function(x,y,nboot=200,plotit=T,plotop=F){ # # Compute confidence intervals for the difference between deciles # of two independent groups. The simultaneous probability coverage is .95. # The Harrell-Davis estimate of the qth quantile is used. # The default number of bootstrap samples is nboot=200 # # The results are stored and returned in a 9 by 3 matrix, # the ith row corresponding to the i/10 quantile. # The first column is the lower end of the confidence interval. # The second column is the upper end. # The third column is the estimated difference between the deciles # (second group minus first). # plotit<-as.logical(plotit) x<-x[!is.na(x)] y<-y[!is.na(y)] set.seed(2) # set seed of random number generator so that # results can be duplicated.
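# The constant computed next approximates the critical value needed so that
# the nine confidence intervals have simultaneous probability coverage of
# approximately .95; it depends only on the smaller of the two sample sizes.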
crit<-80.1/(min(length(x),length(y)))^2+2.73 m<-matrix(0,9,3) for (i in 1:9){ q<-i/10 print("Working on quantile") print(q) data<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,hd,q) sex<-var(bvec) data<-matrix(sample(y,size=length(y)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,hd,q) sey<-var(bvec) dif<-hd(y,q)-hd(x,q) m[i,3]<-dif m[i,1]<-dif-crit*sqrt(sex+sey) m[i,2]<-dif+crit*sqrt(sex+sey) } dimnames(m)<-list(NULL,c("ci.lower","ci.upper","Delta.hat")) if(plotit){ if(plotop){ xaxis<-c(1:9)/10 xaxis<-c(xaxis,xaxis) } if(!plotop)xaxis<-c(deciles(x),deciles(x)) par(pch="+") yaxis<-c(m[,1],m[,2]) if(!plotop)plot(xaxis,yaxis,ylab="delta",xlab="x (first group)") if(plotop)plot(xaxis,yaxis,ylab="delta",xlab="Deciles") par(pch="*") if(!plotop)points(deciles(x),m[,3]) if(plotop)points(c(1:9)/10,m[,3]) } m } shiftdhd<-function(x,y,nboot=200,plotit=T,plotop=F,SEED=T,pr=T){ # # Compute confidence intervals for the difference between deciles # of two dependent groups. The simultaneous probability coverage is .95. # The Harrell-Davis estimate of the qth quantile is used. # The default number of bootstrap samples is nboot=100 # # The results are stored and returned in a 9 by 4 matrix, # the ith row corresponding to the i/10 quantile. # The first column is the lower end of the confidence interval. # The second column is the upper end. # The third column is the estimated difference between the deciles # (second group minus first). # The fourth column contains the estimated standard error. # # No missing values are allowed. # if(pr){ print("NOTE: for higher power when sampling from a heavy-tailed dist.") print("or if the goal is to use an alpha value different from .05") print("use the function qdec2ci") } plotit<-as.logical(plotit) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. crit<-37/length(x)^(1.4)+2.75 if(pr)print("The approximate .05 critical value is") if(pr)print(crit) m<-matrix(0,9,4) if(pr)print("Taking Bootstrap Samples. Please wait.") data<-matrix(sample(length(x),size=length(x)*nboot,replace=T),nrow=nboot) xmat<-matrix(x[data],nrow=nboot,ncol=length(x)) ymat<-matrix(y[data],nrow=nboot,ncol=length(x)) for (i in 1:9){ q<-i/10 bvec<-apply(xmat,1,hd,q)-apply(ymat,1,hd,q) se<-sqrt(var(bvec)) dif<-hd(y,q)-hd(x,q) m[i,3]<-dif m[i,1]<-dif-crit*se m[i,2]<-dif+crit*se m[i,4]<-se } dimnames(m)<-list(NULL,c("lower","upper","Delta.hat","se")) if(plotit){ if(plotop){ xaxis<-c(1:9)/10 xaxis<-c(xaxis,xaxis) } if(!plotop)xaxis<-c(deciles(x),deciles(x)) par(pch="+") yaxis<-c(m[,1],m[,2]) if(!plotop)plot(xaxis,yaxis,ylab="delta",xlab="x (first group)") if(plotop)plot(xaxis,yaxis,ylab="delta",xlab="Deciles") par(pch="*") if(!plotop)points(deciles(x),m[,3]) if(plotop)points(c(1:9)/10,m[,3]) } m } ks<-function(x,y,w=F,sig=T){ # Compute the Kolmogorov-Smirnov test statistic # # w=T computes the weighted version instead. (See chapter 5.) # sig=T indicates that the exact significance level is to be computed. # If there are ties, the reported significance level is exact when # using the unweighted test, but for the weighted test the reported # level is too high. # # This function uses the functions ecdf, kstiesig, kssig and kswsig # # This function returns the value of the test statistic, the approximate .05 # critical value, and the exact p-value if sig=T. 
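# Example (illustrative; g1 and g2 are any two numeric vectors): ks(g1,g2)
# performs the unweighted test, and ks(g1,g2,w=T) performs the weighted
# version, in which each difference between the empirical cdfs is divided by
# an estimate of its standard error before the maximum is taken.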
# # Missing values are automatically removed # x<-x[!is.na(x)] y<-y[!is.na(y)] w<-as.logical(w) sig<-as.logical(sig) tie<-logical(1) tie<-F siglevel<-NA z<-sort(c(x,y)) # Pool and sort the observations for (i in 2:length(z))if(z[i-1]==z[i])tie<-T #check for ties v<-1 # Initializes v for (i in 1:length(z))v[i]<-abs(ecdf(x,z[i])-ecdf(y,z[i])) ks<-max(v) crit<-1.36*sqrt((length(x)+length(y))/(length(x)*length(y))) # Approximate # .05 critical value if(!w && sig && !tie)siglevel<-kssig(length(x),length(y),ks) if(!w && sig && tie)siglevel<-kstiesig(x,y,ks) if(w){ crit<-(max(length(x),length(y))-5)*.48/95+2.58+abs(length(x)-length(y))*.44/95 if(length(x)>100 || length(y)>100){ print("When either sample size is greater than 100,") print("the approximate critical value can be inaccurate.") print("It is recommended that the exact significance level be computed.") } for (i in 1:length(z)){ temp<-(length(x)*ecdf(x,z[i])+length(y)*ecdf(y,z[i]))/length(z) temp<-temp*(1.-temp) v[i]<-v[i]/sqrt(temp) } v<-v[!is.na(v)] ks<-max(v)*sqrt(length(x)*length(y)/length(z)) if(sig)siglevel<-kswsig(length(x),length(y),ks) if(tie && sig){ print("Ties were detected. The reported significance level") print("of the weighted Kolmogorov-Smirnov test statistic is not exact.") }} list(test=ks,critval=crit,siglevel=siglevel) } m2ci<-function(x,y,alpha=.05,nboot=1000,bend=1.28,os=F){ # # Compute a bootstrap, .95 confidence interval for the # the difference between two independent # M-estimator of location based on Huber's Psi. # The default percentage bend is bend=1.28 # The default number of bootstrap samples is nboot=399 # # By default, the fully iterated M-estimator is used. To use the # one-step M-estimator instead, set os=T # os<-as.logical(os) x<-x[!is.na(x)] # Remove any missing values in x y<-y[!is.na(y)] # Remove any missing values in y if(length(x)<=19 || length(y)<=19){ print("The number of observations in at least one group") print("is less than 20. This function might fail due to division by zero,") print("which in turn causes an error in function hpsi having to do with") print("a missing value.")} set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") datax<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) datay<-matrix(sample(y,size=length(y)*nboot,replace=T),nrow=nboot) if(!os){ bvecx<-apply(datax,1,mest,bend) bvecy<-apply(datay,1,mest,bend) } if(os){ bvecx<-apply(datax,1,onestep,bend) bvecy<-apply(datay,1,onestep,bend) } bvec<-sort(bvecx-bvecy) low<-round((alpha/2)*nboot) up<-round((1-alpha/2)*nboot) se<-sqrt(var(bvec)) list(ci=c(bvec[low],bvec[up]),se=se) } smmcrit<-function(nuhat,C){ # # Determine the .95 quantile of the C-variate Studentized maximum # modulus distribution using linear interpolation on inverse # degrees of freedom # If C=1, this function returns the .975 quantile of Student's t # distribution. 
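# Example: smmcrit(24,3) returns the tabled .95 quantile for C=3 contrasts
# with 24 degrees of freedom, 2.56; with C=1 the function simply returns
# qt(.975,nuhat).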
# if(C-round(C)!=0)stop("The number of contrasts, C, must be an integer") if(C>=29)stop("C must be less than or equal to 28") if(C<=0)stop("C must be greater than or equal to 1") if(nuhat<2)stop("The degrees of freedom must be greater than or equal to 2") if(C==1)smmcrit<-qt(.975,nuhat) if(C>=2){ C<-C-1 m1<-matrix(0,20,27) m1[1,]<-c(5.57,6.34,6.89,7.31,7.65,7.93,8.17,8.83,8.57, 8.74,8.89,9.03,9.16,9.28,9.39,9.49,9.59, 9.68, 9.77,9.85,9.92,10.00,10.07,10.13,10.20,10.26,10.32) m1[2,]<-c(3.96,4.43,4.76,5.02,5.23,5.41,5.56,5.69,5.81, 5.92,6.01,6.10,6.18,6.26,6.33,6.39,6.45,6.51, 6.57,6.62,6.67,6.71,6.76,6.80,6.84,6.88, 6.92) m1[3,]<-c(3.38,3.74,4.01,4.20,4.37,4.50,4.62,4.72,4.82, 4.89,4.97,5.04,5.11,5.17,5.22,5.27,5.32, 5.37, 5.41,5.45,5.49,5.52,5.56,5.59,5.63,5.66,5.69) m1[4,]<-c(3.09,3.39,3.62,3.79,3.93,4.04,4.14,4.23,4.31, 4.38,4.45,4.51,4.56,4.61,4.66,4.70,4.74,4.78, 4.82,4.85,4.89,4.92,4.95,4.98,5.00,5.03,5.06) m1[5,]<-c(2.92,3.19,3.39,3.54,3.66,3.77,3.86,3.94,4.01, 4.07,4.13,4.18,4.23,4.28,4.32,4.36,4.39,4.43, 4.46,4.49,4.52,4.55,4.58,4.60,4.63,4.65,4.68) m1[6,]<-c(2.80,3.06,3.24,3.38,3.49,3.59,3.67,3.74,3.80, 3.86,3.92,3.96,4.01,4.05,4.09,4.13,4.16,4.19, 4.22,4.25,4.28,4.31,4.33,4.35,4.38,4.39,4.42) m1[7,]<-c(2.72,2.96,3.13,3.26,3.36,3.45,3.53,3.60,3.66, 3.71,3.76,3.81,3.85,3.89,3.93,3.96,3.99, 4.02, 4.05,4.08,4.10,4.13,4.15,4.18,4.19,4.22,4.24) m1[8,]<-c(2.66,2.89,3.05,3.17,3.27,3.36,3.43,3.49,3.55, 3.60,3.65,3.69,3.73,3.77,3.80,3.84,3.87,3.89, 3.92,3.95,3.97,3.99,4.02,4.04,4.06,4.08,4.09) m1[9,]<-c(2.61,2.83,2.98,3.10,3.19,3.28,3.35,3.41,3.47, 3.52,3.56,3.60,3.64,3.68,3.71,3.74,3.77,3.79, 3.82,3.85,3.87,3.89,3.91,3.94,3.95, 3.97,3.99) m1[10,]<-c(2.57,2.78,2.93,3.05,3.14,3.22,3.29,3.35,3.40, 3.45,3.49,3.53,3.57,3.60,3.63,3.66,3.69,3.72, 3.74,3.77,3.79,3.81,3.83,3.85,3.87,3.89,3.91) m1[11,]<-c(2.54,2.75,2.89,3.01,3.09,3.17,3.24,3.29,3.35, 3.39,3.43,3.47,3.51,3.54,3.57,3.60,3.63,3.65, 3.68,3.70,3.72,3.74,3.76,3.78,3.80,3.82,3.83) m1[12,]<-c(2.49,2.69,2.83,2.94,3.02,3.09,3.16,3.21,3.26, 3.30,3.34,3.38,3.41,3.45,3.48,3.50,3.53,3.55, 3.58,3.59,3.62,3.64,3.66,3.68,3.69,3.71,3.73) m1[13,]<-c(2.46,2.65,2.78,2.89,2.97,3.04,3.09,3.15,3.19, 3.24,3.28,3.31,3.35,3.38,3.40,3.43,3.46,3.48, 3.50,3.52,3.54,3.56,3.58,3.59,3.61,3.63,3.64) m1[14,]<-c(2.43,2.62,2.75,2.85,2.93,2.99,3.05,3.11,3.15, 3.19,3.23,3.26,3.29,3.32,3.35,3.38,3.40,3.42, 3.44,3.46,3.48,3.50,3.52,3.54,3.55,3.57,3.58) m1[15,]<-c(2.41,2.59,2.72,2.82,2.89,2.96,3.02,3.07,3.11, 3.15,3.19,3.22,3.25,3.28,3.31,3.33,3.36,3.38, 3.39,3.42,3.44,3.46,3.47,3.49,3.50,3.52,3.53) m1[16,]<-c(2.38,2.56,2.68,2.77,2.85,2.91,2.97,3.02,3.06, 3.09,3.13,3.16,3.19,3.22,3.25,3.27,3.29,3.31, 3.33,3.35,3.37,3.39,3.40,3.42,3.43,3.45,3.46) m1[17,]<-c(2.35,2.52,2.64,2.73,2.80,2.87,2.92,2.96,3.01, 3.04,3.07,3.11,3.13,3.16,3.18,3.21,3.23,3.25, 3.27,3.29,3.30,3.32,3.33,3.35,3.36,3.37,3.39) m1[18,]<-c(2.32,2.49,2.60,2.69,2.76,2.82,2.87,2.91,2.95, 2.99,3.02,3.05,3.08,3.09,3.12,3.14,3.17, 3.18, 3.20,3.22,3.24,3.25,3.27,3.28,3.29,3.31,3.32) m1[19,]<-c(2.29,2.45,2.56,2.65,2.72,2.77,2.82,2.86,2.90, 2.93,2.96,2.99,3.02,3.04,3.06,3.08,3.10, 3.12, 3.14,3.16,3.17,3.19,3.20,3.21,3.23,3.24,3.25) m1[20,]<-c(2.24,2.39,2.49,2.57,2.63,2.68,2.73,2.77,2.79, 2.83,2.86,2.88,2.91,2.93,2.95,2.97,2.98, 3.01, 3.02,3.03,3.04,3.06,3.07,3.08,3.09,3.11,3.12) if(nuhat>=200)smmcrit<-m1[20,C] if(nuhat<200){ nu<-c(2,3,4,5,6,7,8,9,10,11,12,14,16,18,20,24,30,40,60,200) temp<-abs(nu-nuhat) find<-order(temp) if(temp[find[1]]==0)smmcrit<-m1[find[1],C] if(temp[find[1]]!=0){ if(nuhat>nu[find[1]]){ 
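# Interpolate between the two nearest tabled degrees of freedom, using the
# reciprocal of the degrees of freedom as the interpolation variable: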
smmcrit<-m1[find[1],C]- (1/nu[find[1]]-1/nuhat)*(m1[find[1],C]-m1[find[1]+1,C])/ (1/nu[find[1]]-1/nu[find[1]+1]) } if(nuhat0)J<-length(grp) nval<-1 nrat<-1 nmax<-0 rbar<-1 mrbar<-0 for (j in grp){ temp<-x[[j]] temp<-temp[!is.na(temp)] #Missing values are removed. nrat[j]<-(length(temp)-1)/length(temp) nval[j]<-length(temp) if(j==grp[1])xall<-temp if(j!=grp[1])xall<-c(xall,temp) if(length(temp)>nmax)nmax<-length(temp) } pv<-array(NA,c(J,nmax,J)) tv<-matrix(NA,J,nmax) rv<-matrix(0,J,nmax) for (i in 1:J){ data<-x[[i]] data<-data[!is.na(data)] for (j in 1:length(data)){ tempr<-data[j]-xall rv[i,j]<-length(tempr[tempr>=0]) for (l in 1:J){ templ<-x[[l]] templ<-templ[!is.na(templ)] temp<-data[j]-templ pv[i,j,l]<-length(temp[temp>=0]) } tv[i,j]<-sum(pv[i,j,])-pv[i,j,i] } rbar[i]<-sum(rv[i,])/nval[i] mrbar<-mrbar+sum(rv[i,]) } amat<-matrix(0,J,J) for(i in 1:J){ temptv<-tv[i,] temptv<-temptv[!is.na(temptv)] amat[i,i]<-(length(temptv)-1)*var(temptv) for (l in 1:J){ tempp<-pv[l,,i] tempp<-tempp[!is.na(tempp)] if(l!=i){ amat[i,i]<-amat[i,i]+(length(tempp)-1)*var(tempp) }} for (j in 1:J){ if(j>i){ for (l in 1:J){ temp1<-pv[l,,i] temp2<-pv[l,,j] temp1<-temp1[!is.na(temp1)] temp2<-temp2[!is.na(temp2)] #if(i!=l && l!=j)amat[i,j]<-(length(temp1)-1)*var(temp1,temp2) if(i!=l && l!=j)amat[i,j]<-amat[i,j]+(length(temp1)-1)*var(temp1,temp2) } temp1<-pv[i,,j] temp2<-tv[i,] temp1<-temp1[!is.na(temp1)] temp2<-temp2[!is.na(temp2)] amat[i,j]<-amat[i,j]-(length(temp1)-1)*var(temp1,temp2) temp1<-pv[j,,i] temp2<-tv[j,] temp1<-temp1[!is.na(temp1)] temp2<-temp2[!is.na(temp2)] amat[i,j]<-amat[i,j]-(length(temp1)-1)*var(temp1,temp2) } amat[j,i]<-amat[i,j] }} N<-sum(nval) amat<-amat/N^3 amati<-ginv(amat) uvec<-1 mrbar<-mrbar/N for (i in 1:J)uvec[i]<-nval[i]*(rbar[i]-mrbar)/(N*(N+1)) testv<-N*prod(nrat)*uvec%*%amati%*%uvec test<-testv[1,1] df<-J-1 siglevel<-1-pchisq(test,df) list(test=test,siglevel=siglevel,df=df) } apanova<-function(data,grp=0){ # # Perform Agresti-Pendergast rank test for J dependent groups # The data are assumed to be stored in an n by J matrix or # in list mode. In the latter case, length(data)=J. # if(is.list(data)){ x<-matrix(0,length(data[[1]]),length(data)) for (j in 1:length(data))x[,j]<-data[[j]] } if(is.matrix(data))x<-data if(sum(grp==0))grp<-c(1:ncol(x)) x<-x[,grp] J<-ncol(x) n<-nrow(x) if(n<=20)print("With n<=20, suggest using bprm") rm<-matrix(rank(x),n,J) rv<-apply(rm,2,mean) sm<-(n-1)*winall(rm,tr=0)$cov/(n-J+1) jm1<-J-1 cv<-diag(1,jm1,J) for (i in 2:J){ k<-i-1 cv[k,i]<--1 } cr<-cv%*%rv ftest<-n*t(cr)%*%solve(cv%*%sm%*%t(cv))%*%cr/(J-1) df1<-J-1 df2<-(J-1)*(n-1) siglevel<-1-pf(ftest,df1,df2) list(FTEST=ftest,df1=df1,df2=df2,siglevel=siglevel) } box1way<-function(x,tr=.2,grp=c(1:length(x))){ # # A heteroscedastic one-way ANOVA for trimmed means # using a generalization of Box's method. # # The data are assumed to be stored in $x$ in list mode. # Length(x) is assumed to correspond to the total number of groups. # By default, the null hypothesis is that all groups have a common mean. # To compare a subset of the groups, use grp to indicate which # groups are to be compared. For example, if you type the # command grp<-c(1,3,4), and then execute this function, groups # 1, 3, and 4 will be compared with the remaining groups ignored. # # Missing values are automatically removed. 
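# Example (illustrative; w is any list of numeric vectors, one vector per
# group): box1way(w) returns the test statistic, the estimated degrees of
# freedom nu1 and nu2, and the significance level based on the F distribution
# with nu1 and nu2 degrees of freedom.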
# J<-length(grp) # The number of groups to be compared print("The number of groups to be compared is") print(J) h<-vector("numeric",J) w<-vector("numeric",J) xbar<-vector("numeric",J) svec<-vector("numeric",J) for(j in 1:J){ xx<-!is.na(x[[j]]) val<-x[[j]] x[[j]]<-val[xx] # Remove missing values h[j]<-length(x[[grp[j]]])-2*floor(tr*length(x[[grp[j]]])) # h is the number of observations in the jth group after trimming. svec[j]<-((length(x[[grp[j]]])-1)*winvar(x[[grp[j]]],tr))/(h[j]-1) xbar[j]<-mean(x[[grp[j]]],tr) } xtil<-sum(h*xbar)/sum(h) fval<-h/sum(h) TEST<-sum(h*(xbar-xtil)^2)/sum((1-fval)*svec) nu1<-sum((1-fval)*svec) nu1<-nu1^2/((sum(svec*fval))^2+sum(svec^2*(1-2*fval))) nu2<-(sum((1-fval)*svec))^2/sum(svec^2*(1-fval)^2/(h-1)) sig<-1-pf(TEST,nu1,nu2) list(TEST=TEST,nu1=nu1,nu2=nu2,siglevel=sig) } pairdepb<-function(x,tr=.2,alpha=.05,grp=0,nboot=599){ # # Using the percentile t bootstrap method, # compute a .95 confidence interval for all pairwise differences between # the trimmed means of dependent groups. # By default, 20% trimming is used with B=599 bootstrap samples. # # x can be an n by J matrix or it can have list mode # if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ if(sum(grp)==0)grp<-c(1:length(x)) # put the data in an n by J matrix mat<-matrix(0,length(x[[1]]),length(grp)) for (j in 1:length(grp))mat[,j]<-x[[grp[j]]] } if(is.matrix(x)){ if(sum(grp)==0)grp<-c(1:ncol(x)) mat<-x[,grp] } if(sum(is.na(mat)>=1))stop("Missing values are not allowed.") J<-ncol(mat) connum<-(J^2-J)/2 bvec<-matrix(0,connum,nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data<-matrix(sample(nrow(mat),size=nrow(mat)*nboot,replace=T),nrow=nboot) xcen<-matrix(0,nrow(mat),ncol(mat)) for (j in 1:J)xcen[,j]<-mat[,j]-mean(mat[,j],tr) #Center data it<-0 for (j in 1:J){ for (k in 1:J){ if(j=2)kron<-rbind(kron,m3) } kron } rmanova<-function(x,tr=.2,grp=c(1:length(x))){ # # A heteroscedastic one-way repeated measures ANOVA for trimmed means. # # The data are assumed to be stored in $x$ which can # be either an n by J matrix, or an S-PLUS variable having list mode. # If the data are stored in list mode, # length(x) is assumed to correspond to the total number of groups. # By default, the null hypothesis is that all group have a common mean. # To compare a subset of the groups, use grp to indicate which # groups are to be compared. For example, if you type the # command grp<-c(1,3,4), and then execute this function, groups # 1, 3, and 4 will be compared with the remaining groups ignored. # if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ J<-length(grp) # The number of groups to be compared print("The number of groups to be compared is") print(J) m1<-matrix(x[[grp[1]]],length(x[[grp[1]]]),1) for(i in 2:J){ # Put the data into an n by J matrix m2<-matrix(x[[grp[i]]],length(x[[i]]),1) m1<-cbind(m1,m2) } } if(is.matrix(x)){ if(length(grp)=ncol(x))m1<-as.matrix(x) J<-ncol(x) print("The number of groups to be compared is") print(J) } # # Raw data are now in the matrix m1 # m2<-matrix(0,nrow(m1),ncol(m1)) xvec<-1 g<-floor(tr*nrow(m1)) #2g is the number of observations trimmed. 
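# The code below Winsorizes each column of the data matrix, computes the
# trimmed mean of each column, and forms the test statistic as the
# between-group sum of squares of the trimmed means divided by an error term
# based on the doubly centered Winsorized values; the degrees of freedom are
# then adjusted using the estimate etil.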
for(j in 1:ncol(m1)){ # Putting Winsorized values in m2 m2[,j]<-winval(m1[,j],tr) xvec[j]<-mean(m1[,j],tr) } xbar<-mean(xvec) qc<-(nrow(m1)-2*g)*sum((xvec-xbar)^2) m3<-matrix(0,nrow(m1),ncol(m1)) m3<-sweep(m2,1,apply(m2,1,mean)) # Sweep out rows m3<-sweep(m3,2,apply(m2,2,mean)) # Sweep out columns m3<-m3+mean(m2) # Grand Winsorized mean swept in qe<-sum(m3^2) test<-(qc/(qe/(nrow(m1)-2*g-1))) # # Next, estimate the adjusted degrees of freedom # v<-winall(m1,tr=tr)$cov vbar<-mean(v) vbard<-mean(diag(v)) vbarj<-1 for(j in 1:J){ vbarj[j]<-mean(v[j,]) } A<-J*J*(vbard-vbar)^2/(J-1) B<-sum(v*v)-2*J*sum(vbarj^2)+J*J*vbar^2 ehat<-A/B etil<-(nrow(m2)*(J-1)*ehat-2)/((J-1)*(nrow(m2)-1-(J-1)*ehat)) etil<-min(1.,etil) df1<-(J-1)*etil df2<-(J-1)*etil*(nrow(m2)-2*g-1) siglevel<-1-pf(test,df1,df2) list(test=test,df=c(df1,df2),siglevel=siglevel,tmeans=xvec,ehat=ehat,etil=etil) } trimpartt<-function(x,con){ # # This function is used by other functions described in chapter 6. # trimpartt<-sum(con*x) trimpartt } bptdmean<-function(isub,x,tr){ # # Compute trimmed means # when comparing dependent groups. # By default, 20% trimmed means are used. # isub is a vector of length n, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # # This function is used by bptd. # bptdmean<-mean(x[isub],tr) bptdmean } bptdpsi<-function(x,con){ # Used by bptd to compute bootstrap psihat values # bptdpsi<-sum(con*x) bptdpsi } bptdsub<-function(isub,x,tr,con){ # # Compute test statistic for trimmed means # when comparing dependent groups. # By default, 20% trimmed means are used. # isub is a vector of length n, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # con is a J by c matrix. The cth column contains # a vector of contrast coefficients. # # This function is used by bptd. # h1 <- nrow(x) - 2 * floor(tr * nrow(x)) se<-0 for(j in 1:ncol(x)){ for(k in 1:ncol(x)){ djk<-(nrow(x) - 1) * wincor(x[isub,j],x[isub,k], tr)$cov se<-se+con[j]*con[k]*djk } } se/(h1*(h1-1)) } selby2<-function(m,grpc,coln=NA){ # Create categories according to the grpc[1] and grpc[2] columns # of the matrix m. The function puts the values in column coln into # a vector having list mode. 
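# Example (illustrative): selby2(m,c(2,3),5) forms one group for each observed
# combination of the values in columns 2 and 3 of m, each group containing the
# corresponding values from column 5; the groups are returned in list mode in
# $x and the combinations of values in $grpn.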
# if(is.na(coln))stop("The argument coln is not specified") if(length(grpc)>4)stop("The argument grpc must have length less than or equal to 4") x<-vector("list") ic<-0 if(length(grpc)==2){ cat1<-selby(m,grpc[1],coln)$grpn cat2<-selby(m,grpc[2],coln)$grpn for (i1 in 1:length(cat1)){ for (i2 in 1:length(cat2)){ temp<-NA it<-0 for (i in 1:nrow(m)){ if(sum(m[i,c(grpc[1],grpc[2])]==c(cat1[i1],cat2[i2]))==2){ it<-it+1 temp[it]<-m[i,coln] } } if(!is.na(temp[1])){ ic<-ic+1 x[[ic]]<-temp if(ic==1)grpn<-matrix(c(cat1[i1],cat2[i2]),1,2) if(ic>1)grpn<-rbind(grpn,c(cat1[i1],cat2[i2])) } }} } if(length(grpc)==3){ cat1<-selby(m,grpc[1],coln)$grpn cat2<-selby(m,grpc[2],coln)$grpn cat3<-selby(m,grpc[3],coln)$grpn x<-vector("list") ic<-0 for (i1 in 1:length(cat1)){ for (i2 in 1:length(cat2)){ for (i3 in 1:length(cat3)){ temp<-NA it<-0 for (i in 1:nrow(m)){ if(sum(m[i,c(grpc[1],grpc[2],grpc[3])]==c(cat1[i1],cat2[i2],cat3[i3]))==3){ it<-it+1 temp[it]<-m[i,coln] }} if(!is.na(temp[1])){ ic<-ic+1 x[[ic]]<-temp if(ic==1)grpn<-matrix(c(cat1[i1],cat2[i2],cat3[i3]),1,3) if(ic>1)grpn<-rbind(grpn,c(cat1[i1],cat2[i2],cat3[i3])) }}}} } if(length(grpc)==4){ cat1<-selby(m,grpc[1],coln)$grpn cat2<-selby(m,grpc[2],coln)$grpn cat3<-selby(m,grpc[3],coln)$grpn cat4<-selby(m,grpc[4],coln)$grpn x<-vector("list") ic<-0 for (i1 in 1:length(cat1)){ for (i2 in 1:length(cat2)){ for (i3 in 1:length(cat3)){ for (i4 in 1:length(cat4)){ temp<-NA it<-0 for (i in 1:nrow(m)){ if(sum(m[i,c(grpc[1],grpc[2],grpc[3],grpc[4])]==c(cat1[i1],cat2[i2],cat3[i3],cat4[i4]))==4){ it<-it+1 temp[it]<-m[i,coln] }} if(!is.na(temp[1])){ ic<-ic+1 x[[ic]]<-temp if(ic==1)grpn<-matrix(c(cat1[i1],cat2[i2],cat3[i3],cat4[i4]),1,4) if(ic>1)grpn<-rbind(grpn,c(cat1[i1],cat2[i2],cat3[i3],cat4[i4])) }}}}} } list(x=x,grpn=grpn) } lindmsub<-function(isub,x,est,...){ # # isub is a vector of length n containing integers between # randomly sampled with replacement from 1,...,n. # # Used by lindm to convert an n by B matrix of bootstrap values, # randomly sampled from 1, ..., n, with replacement, to a # J by B matrix of measures of location. # # lindmsub<-est(x[isub],...) lindmsub } lindm<-function(x,con=0,est=onestep,grp=0,alpha=.05,nboot=399,...){ # # Compute a 1-alpha confidence interval for a set of d linear contrasts # involving M-estimators associated with the marginal distributions # using a bootstrap method. # Dependent groups are assumed. # # The data are assumed to be stored in x in list mode. Thus, # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J, say. # # con is a J by d matrix containing the contrast coefficents of interest. # If unspecified, all pairwise comparisons are performed. # For example, con[,1]=c(1,1,-1,-1,0,0) and con[,2]=c(,1,-1,0,0,1,-1) # will test two contrasts: (1) the sum of the first two trimmed means is # equal to the sum of the second two, and (2) the difference between # the first two is equal to the difference between the trimmed means of # groups 5 and 6. # # The default number of bootstrap samples is nboot=399 # # This function uses the function trimpartt written for this # book. 
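# Example (illustrative): with J=3 dependent groups,
# con<-matrix(c(1,-1,0,0,1,-1),nrow=3,ncol=2)
# specifies the two contrasts mu1-mu2 and mu2-mu3, and lindm(x,con=con)
# computes a confidence interval for each of them using the M-estimators of
# the marginal distributions.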
# # # if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ if(sum(grp)==0)grp<-c(1:length(x)) # put the data in an n by J matrix mat<-matrix(0,length(x[[1]]),length(grp)) for (j in 1:length(grp))mat[,j]<-x[[grp[j]]] } if(is.matrix(x)){ if(sum(grp)==0)grp<-c(1:ncol(x)) mat<-x[,grp] } mat<-elimna(mat) J<-ncol(mat) Jm<-J-1 d<-(J^2-J)/2 if(sum(con^2)==0){ con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} if(nrow(con)!=ncol(mat))stop("The number of groups does not match the number of contrast coefficients.") m1<-matrix(0,J,nboot) m2<-1 # Initialize m2 mval<-1 set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data<-matrix(sample(nrow(mat),size=nrow(mat)*nboot,replace=T),nrow=nboot) # data is B by n matrix xcen<-matrix(0,nrow(mat),ncol(mat)) #An n by J matrix for (j in 1:J){xcen[,j]<-mat[,j]-est(mat[,j],...) #Center data mval[j]<-est(mat[,j],...) } for (j in 1:J)m1[j,]<-apply(data,1,lindmsub,xcen[,j],est,...) # A J by nboot matrix. m2<-var(t(m1)) # A J by J covariance matrix corresponding to the nboot values. boot<-matrix(0,ncol(con),nboot) bot<-1 for (d in 1:ncol(con)){ top<-apply(m1,2,trimpartt,con[,d]) # A vector of length nboot containing psi hat values consq<-con[,d]^2 bot[d]<-trimpartt(diag(m2),consq) for (j1 in 1:J){ for (j2 in 1:J){ if(j1=29)stop("C must be less than or equal to 28") if(C<=0)stop("C must be greater than or equal to 1") if(nuhat<2)stop("The degrees of freedom must be greater than or equal to 2") if(C==1)smmcrit01<-qt(.995,nuhat) if(C>=2){ C<-C-1 m1<-matrix(0,20,27) m1[1,]<-c(12.73,14.44,15.65,16.59,17.35,17.99,18.53,19.01,19.43, 19.81,20.15,20.46,20.75,20.99,20.99,20.99,20.99,20.99, 22.11,22.29,22.46,22.63,22.78,22.93,23.08,23.21,23.35) m1[2,]<-c(7.13,7.91,8.48,8.92,9.28,9.58,9.84,10.06,10.27, 10.45,10.61,10.76,10.90,11.03,11.15,11.26,11.37,11.47, 11.56,11.65,11.74,11.82,11.89,11.97,12.07,12.11,12.17) m1[3,]<-c(5.46,5.99,6.36,6.66,6.89,7.09,7.27,7.43,7.57, 7.69,7.80,7.91,8.01,8.09,8.17,8.25,8.32,8.39, 8.45,8.51,8.57,8.63,8.68,8.73,8.78,8.83,8.87) m1[4,]<-c(4.70,5.11,5.39,5.63,5.81,5.97,6.11,6.23,6.33, 6.43,6.52,6.59,6.67,6.74,6.81,6.87,6.93,6.98, 7.03,7.08,7.13,7.17,7.21,7.25,7.29,7.33,7.36) m1[5,]<-c(4.27,4.61,4.85,5.05,5.20,5.33,5.45,5.55,5.64, 5.72,5.79,5.86,5.93,5.99,6.04,6.09,6.14,6.18, 6.23,6.27,6.31,6.34,6.38,6.41,6.45,6.48,6.51) m1[6,]<-c(3.99,4.29,4.51,4.68,4.81,4.93,5.03,5.12,5.19, 5.27,5.33,5.39,5.45,5.50,5.55,5.59,5.64,5.68, 5.72,5.75,5.79,5.82,5.85,5.88,5.91,5.94,5.96) m1[7,]<-c(3.81,4.08,4.27,4.42,4.55,4.65,4.74,4.82,4.89, 4.96,5.02,5.07,5.12,5.17,5.21,5.25,5.29, 5.33, 5.36,5.39,5.43,5.45,5.48,5.51,5.54,5.56,5.59) m1[8,]<-c(3.67,3.92,4.10,4.24,4.35,4.45,4.53,4.61,4.67, 4.73,4.79,4.84,4.88,4.92,4.96,5.01,5.04,5.07, 5.10,5.13,5.16,5.19,5.21,5.24,5.26,5.29,5.31) m1[9,]<-c(3.57,3.80,3.97,4.09,4.20,4.29,4.37,4.44,4.50, 4.56,4.61,4.66,4.69,4.74,4.78,4.81,4.84,4.88, 4.91,4.93,4.96,4.99,5.01,5.03,5.06,5.08,5.09) m1[10,]<-c(3.48,3.71,3.87,3.99,4.09,4.17,4.25,4.31,4.37, 4.42,4.47,4.51,4.55,4.59,4.63,4.66,4.69,4.72, 4.75,4.78,4.80,4.83,4.85,4.87,4.89,4.91,4.93) m1[11,]<-c(3.42,3.63,3.78,3.89,.99,4.08,4.15,4.21,4.26, 4.31,4.36,4.40,4.44,4.48,4.51,4.54,4.57,4.59, 4.62,4.65,4.67,4.69,4.72,4.74,4.76,4.78,4.79) m1[12,]<-c(3.32,3.52,3.66,3.77,3.85,3.93,3.99,.05,4.10, 4.15,4.19,4.23,4.26,4.29,4.33,4.36,4.39,4.41, 4.44,4.46,4.48,4.50,4.52,4.54,4.56,4.58,4.59) 
m1[13,]<-c(3.25,3.43,3.57,3.67,3.75,3.82,3.88,3.94,3.99, 4.03,4.07,4.11,4.14,4.17,4.19,4.23,4.25,4.28, 4.29,4.32,4.34,4.36,4.38,4.39,4.42,4.43,4.45) m1[14,]<-c(3.19,3.37,3.49,3.59,3.68,3.74,3.80,3.85,3.89, 3.94,3.98,4.01,4.04,4.07,4.10,4.13,4.15,4.18, 4.19,4.22,4.24,4.26,4.28,4.29,4.31,4.33,4.34) m1[15,]<-c(3.15,3.32,3.45,3.54,3.62,3.68,3.74,3.79,3.83, 3.87,3.91,3.94,3.97,3.99,4.03,4.05,4.07,4.09, 4.12,4.14,4.16,4.17,4.19,4.21,4.22,4.24,4.25) m1[16,]<-c(3.09,3.25,3.37,3.46,3.53,3.59,3.64,3.69,3.73, 3.77,3.80,3.83,3.86,3.89,3.91,3.94,3.96,3.98, 4.00,4.02,4.04,4.05,4.07,4.09,4.10,4.12,4.13) m1[17,]<-c(3.03,3.18,3.29,3.38,3.45,3.50,3.55,3.59,3.64, 3.67,3.70,3.73,3.76,3.78,3.81,3.83,3.85,3.87, 3.89,3.91,3.92,3.94,3.95,3.97,3.98,4.00,4.01) m1[18,]<-c(2.97,3.12,3.22,3.30,3.37,3.42,3.47,3.51,3.55, 3.58,3.61,3.64,3.66,3.68,3.71,3.73,3.75,3.76, 3.78,3.80,3.81,3.83,3.84,3.85,3.87,3.88,3.89) m1[19,]<-c(2.91,3.06,3.15,3.23,3.29,3.34,3.38,3.42,3.46, 3.49,3.51,3.54,3.56,3.59,3.61,3.63,3.64,3.66, 3.68,3.69,3.71,3.72,3.73,3.75,3.76,3.77,3.78) m1[20,]<-c(2.81,2.93,3.02,3.09,3.14,3.19,3.23,3.26,3.29, 3.32,3.34,3.36,3.38,3.40,.42,.44,3.45,3.47, 3.48,3.49,3.50,3.52,3.53,3.54,3.55,3.56,3.57) if(nuhat>=200)smmcrit01<-m1[20,C] if(nuhat<200){ nu<-c(2,3,4,5,6,7,8,9,10,11,12,14,16,18,20,24,30,40,60,200) temp<-abs(nu-nuhat) find<-order(temp) if(temp[find[1]]==0)smmcrit01<-m1[find[1],C] if(temp[find[1]]!=0){ if(nuhat>nu[find[1]]){ smmcrit01<-m1[find[1],C]- (1/nu[find[1]]-1/nuhat)*(m1[find[1],C]-m1[find[1]+1,C])/ (1/nu[find[1]]-1/nu[find[1]+1]) } if(nuhat=1))ikeep[i]<-0 elimna<-m[ikeep[ikeep>=1],] elimna } pball<-function(m,beta=.2){ # # Compute the percentage bend correlation matrix for the # data in the n by p matrix m. # # This function also returns the two-sided significance level # for all pairs of variables, plus a test of zero correlations # among all pairs. (See chapter 6 for details.) # if(!is.matrix(m))stop("Data must be stored in an n by p matrix") pbcorm<-matrix(0,ncol(m),ncol(m)) temp<-matrix(1,ncol(m),ncol(m)) siglevel<-matrix(NA,ncol(m),ncol(m)) cmat<-matrix(0,ncol(m),ncol(m)) for (i in 1:ncol(m)){ ip1<-i for (j in ip1:ncol(m)){ if(i1]) sx<-ifelse(psi<(-1),0,x) sx<-ifelse(psi>1,0,sx) pbos<-(sum(sx)+omhatx*(i2-i1))/(length(x)-i1-i2) pbos } tauall<-function(m){ # # Compute Kendall's tau for the # data in the n by p matrix m. # # This function also returns the two-sided significance level # for all pairs of variables, plus a test of zero correlations # among all pairs. (See chapter 6 for details.) # if(!is.matrix(m))stop("Data must be stored in an n by p matrix") taum<-matrix(0,ncol(m),ncol(m)) siglevel<-matrix(NA,ncol(m),ncol(m)) for (i in 1:ncol(m)){ ip1<-i for (j in ip1:ncol(m)){ if(i=length(xv)/2)warning("More than half of the w values equal zero") sumw<-sum(w[ee=.0001) paste("failed to converge in",iter,"iterations") list(coef=c(b0,slope),residuals=res) } chreg<-function(x,y,bend=1.345,SEED=T){ # # Compute Coakley Hettmansperger robust regression estimators # JASA, 1993, 88, 872-880 # # x is a n by p matrix containing the predictor values. # # No missing values are allowed # # Comments in this function follow the notation used # by Coakley and Hettmansperger # library(MASS) # with old version of R, need library(lqs) when using ltsreg # as the initial estimate. # if(SEED)set.seed(12) # Set seed so that results are always duplicated. 
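# Outline of the computations below: an initial fit is obtained with least
# trimmed squares (ltsreg), Mallows-type weights are formed from the robust
# (minimum volume ellipsoid) distances of the predictor values, the residuals
# are standardized using a rescaled median absolute residual, and a single
# Newton-type step based on Huber's Psi yields the final coefficients.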
x<-as.matrix(x) p<-ncol(x) m<-elimna(cbind(x,y)) x<-m[,1:p] p1<-p+1 y<-m[,p1] x<-as.matrix(x) cutoff<-bend mve<-vector("list") if(ncol(x)==1){ mve$center<-median(x) mve$cov<-mad(x)^2 } if(ncol(x)>=2)mve<-cov.mve(x) # compute minimum volume ellipsoid measures of # location and scale and store in mve. reg0<-ltsreg(x,y) # compute initial regression est using least trimmed # squares. # Next, compute the rob-md2(i) values and store in rob rob<-1 # Initialize vector rob mx<-mve$center rob<-mahalanobis(x,mx,mve$cov) k21<-qchisq(.95,p) c62<-k21/rob vecone<-c(rep(1,length(y))) # Initialize vector vecone to 1 c30<-pmin(vecone,c62) # mallows weights put in c30 k81<-median(abs(reg0$residuals)) # median of absolute residuals k72<-1.4826*(1+(5/(length(y)-p-1)))*k81 # lms scale c60<-reg0$residuals/(k72*c30) # standardized residuals # compute psi and store in c27 cvec<-c(rep(cutoff,length(y))) # Initialize vector cvec to cutoff c27<-pmin(cvec,c60) c27<-pmax(-1*cutoff,c27) #c27 contains psi values # # compute B matrix and put in c66. # Also, transform B so that i th diag elem = 0 if c27[i] is # between -cutoff and cutoff, 1 otherwise. # c66<-ifelse(abs(c27)<=bend,1,0) # Have derivative of psi in c66 m1<-cbind(1,x) # X matrix with col of 1's added m2<-t(m1) #X transpose m5<-diag(c30) # matrix W, diagonal contains weights m4<-diag(c66) # B matrix m6<-m4%*%m1 # BX m7<-m2%*%m6 # X'BX (nD=X'BX) m8<-solve(m7) #m8 = (X'-B-X)inverse m9<-m8%*%m2 #m9=X prime-B-X inverse X' m9<-m9%*%m5 # m9=X prime-B-X inverse X'W m10<-m9%*%c27 c20<-m10*k72 c21<-reg0$coef+c20 #update initial estimate of parameters. res<-y-m1%*%c21 list(coef=t(c21),residuals=res) } regboot<-function(isub,x,y,regfun,...){ # # Perform regression using x[isub] to predict y[isub] # isub is a vector of length n, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # # This function is used by other functions when computing # bootstrap estimates. # # regfun is some regression method already stored in S-PLUS # It is assumed that regfun$coef contains the intercept and slope # estimates produced by regfun. The regression methods written for # this book, plus regression functions in S-PLUS, have this property. # # x is assumed to be a matrix containing values of the predictors. # xmat<-matrix(x[isub,],nrow(x),ncol(x)) vals<-regfun(xmat,y[isub],...)$coef vals } bmreg<-function(x,y,iter=20,bend=2*sqrt((ncol(x)+1)/nrow(x))){ # compute a bounded M regression using Huber Psi and Schweppe weights. # The predictors are assumed to be stored in the n by p matrix x. # x<-as.matrix(x) init<-lsfit(x,y) resid<-init$residuals x1<-cbind(1,x) nu<-sqrt(1-hat(x1)) low<-ncol(x)+1 for(it in 1:iter){ ev<-sort(abs(resid)) scale<-median(ev[c(low:length(y))])/qnorm(.75) rov<-(resid/scale)/nu psi<-ifelse(abs(rov)<=bend,rov,bend*sign(rov)) # Huber Psi wt<-nu*psi/(resid/scale) new<-lsfit(x,y,wt) if(max(abs(new$coef-init$coef))<.0001)break init$coef<-new$coef resid<-new$residuals } resid<-y-x1%*%new$coef if(max(abs(new$coef-init$coef))>=.0001) paste("failed to converge in",iter,"steps") list(coef=new$coef,residuals=resid,w=wt) } reglev<-function(x,y,plotit=T){ # # Search for good and bad leverage points using the # Rousseuw and van Zomeren method. # # x is an n by p matrix # # The function returns the number of the rows in x that are identified # as outliers. (The row numbers are stored in outliers.) # It also returns the distance of the points identified as outliers # in the variable dis. 
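# A point is flagged as a leverage point when its robust (MVE) distance
# exceeds sqrt(qchisq(.975,p)), p being the number of predictors, and as a
# regression outlier when the absolute value of its standardized LMS residual
# exceeds 2.5.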
# library(MASS) plotit<-as.logical(plotit) set.seed(12) x<-as.matrix(x) res<-lmsreg(x,y)$resid sighat<-sqrt(median(res^2)) sighat<-1.4826*(1+(5/(length(y)-ncol(x)-1)))*sighat stanres<-res/sighat set.seed(12) if(ncol(x)>=2)mve<-cov.mve(x) if(ncol(x)==1){ mve<-vector("list") mve$center<-median(x) mve$cov<-mad(x)^2 } dis<-mahalanobis(x,mve$center,mve$cov) dis<-sqrt(dis) crit<-sqrt(qchisq(.975,ncol(x))) chk<-ifelse(dis>crit,1,0) vec<-c(1:nrow(x)) id<-vec[chk==1] chkreg<-ifelse(abs(stanres)>2.5,1,0) idreg<-vec[chkreg==1] if(plotit){ plot(dis,stanres,xlab="Robust distances",ylab="standardized residuals") abline(-2.5,0) abline(2.5,0) abline(v=crit) } list(levpoints=id,regout=idreg,dis=dis,stanres=stanres,crit=crit) } winreg<-function(x,y,iter=20,tr=.2){ # # Compute a biweight midregression equation # The predictors are assumed to be stored in the n by p matrix x. # x<-as.matrix(x) ma<-matrix(0,ncol(x),1) m<-matrix(0,ncol(x),ncol(x)) mvals<-apply(x,2,win,tr) for (i in 1:ncol(x)){ ma[i,1]<-wincor(x[,i],y,tr=tr)$cov for (j in 1:ncol(x))m[i,j]<-wincor(x[,i],x[,j],tr=tr)$cov } slope<-solve(m,ma) b0<-win(y,tr)-sum(slope%*%mvals) for(it in 1:iter){ res<-y-x%*%slope-b0 for (i in 1:ncol(x))ma[i,1]<-wincor(x[,i],res,tr=tr)$cov slopeadd<-solve(m,ma) b0add<-win(res,tr)-sum(slopeadd%*%mvals) if(max(abs(slopeadd),abs(b0add)) <.0001)break slope<-slope+slopeadd b0<-b0+b0add } if(max(abs(slopeadd),abs(b0add)) >=.0001) paste("failed to converge in",iter,"iterations") list(coef=c(b0,slope),resid=res) } anctgen<-function(x1,y1,x2,y2,pts,fr1=1,fr2=1,tr=.2){ # # Compare two independent groups using the ancova method # in chapter 9. No assumption is made about the form of the regression # lines--a running interval smoother is used. # # Assume data are in x1 y1 x2 and y2 # Comparisons are made at the design points contained in the vector # pts # # Comparisons can be made using at most 28 design points, otherwise # a critical value for controlling the experimentwise type I error cannot # be computed. # if(length(pts)>=29)stop("At most 28 points can be compared") n1<-1 n2<-1 vecn<-1 for(i in 1:length(pts)){ n1[i]<-length(y1[near(x1,pts[i],fr1)]) n2[i]<-length(y2[near(x2,pts[i],fr2)]) } mat<-matrix(NA,length(pts),8) dimnames(mat)<-list(NULL,c("X","n1","n2","DIF","TEST","se","ci.low","ci.hi")) for (i in 1:length(pts)){ g1<-y1[near(x1,pts[i],fr1)] g2<-y2[near(x2,pts[i],fr2)] g1<-g1[!is.na(g1)] g2<-g2[!is.na(g2)] test<-yuen(g1,g2,tr=tr) mat[i,1]<-pts[i] mat[i,2]<-length(g1) mat[i,3]<-length(g2) mat[i,4]<-test$dif mat[i,5]<-test$teststat mat[i,6]<-test$se if(length(pts)>=2)critv<-smmcrit(test$df,length(pts)) if(length(pts)==1)critv<-qt(.975,test$df) cilow<-test$dif-critv*test$se cihi<-test$dif+critv*test$se mat[i,7]<-cilow mat[i,8]<-cihi } list(output=mat,crit=critv) } near<-function(x,pt,fr=1){ # determine which values in x are near pt # based on fr * mad m<-mad(x) if(m==0){ temp<-idealf(x) m<-(temp$qu-temp$ql)/(qnorm(.75)-qnorm(.25)) } if(m==0)m<-sqrt(winvar(x)/.4129) if(m==0)stop("All measures of dispersion are equal to 0") dis<-abs(x-pt) dflag<-dis <= fr*m dflag } regpres1<-function(isub,x,y,regfun,mval){ # # Perform regression using x[isub] to predict y[isub] # isub is a vector of length n, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # # This function is used by other functions when computing # bootstrap estimates. # # regfun is some regression method already stored in S-PLUS # It is assumed that regfun$coef contains the intercept and slope # estimates produced by regfun. 
The regression methods written for # this book, plus regression functions in S-PLUS, have this property. # # x is assumed to be a matrix containing values of the predictors. # xmat<-matrix(x[isub,],mval,ncol(x)) regboot<-regfun(xmat,y[isub]) regboot<-regboot$coef regboot } runhat<-function(x,y,pts=x,est=mom,fr=1,...){ # # running interval smoother that can be used with any measure # of location or scale. By default, a modified one-step M-estimator is used. # This function computes an estimate of y for each x value stored in pts # # fr controls amount of smoothing rmd<-rep(NA,length(pts)) for(i in 1:length(pts)){ val<-y[near(x,pts[i],fr)] if(length(val)>0)rmd[i]<-est(val,...) } rmd } sqfun<-function(y){ # sqfun<-sum(y^2) sqfun } absfun<-function(y){ absfun<-sum(abs(y)) absfun } ancbootg<-function(x1,y1,x2,y2,pts,fr1=1,fr2=1,tr=.2,nboot=599){ # # Compare two independent groups using the ancova method # in chapter 9. No assumption is made about the form of the regression # lines--a running interval smoother is used. # # Assume data are in x1 y1 x2 and y2 # Comparisons are made at the design points contained in the vector # pts # n1<-1 n2<-1 vecn<-1 for(i in 1:length(pts)){ n1[i]<-length(y1[near(x1,pts[i],fr1)]) n2[i]<-length(y2[near(x2,pts[i],fr2)]) } mat<-matrix(NA,length(pts),8) dimnames(mat)<-list(NULL,c("X","n1","n2","DIF","TEST","se","ci.low","ci.hi")) gv<-vector("list",2*length(pts)) for (i in 1:length(pts)){ g1<-y1[near(x1,pts[i],fr1)] g2<-y2[near(x2,pts[i],fr2)] g1<-g1[!is.na(g1)] g2<-g2[!is.na(g2)] j<-i+length(pts) gv[[i]]<-g1 gv[[j]]<-g2 } I1<-diag(length(pts)) I2<-0-I1 con<-rbind(I1,I2) test<-linconb(gv,con=con,tr=tr,nboot=nboot) mat[,1]<-pts mat[,2]<-n1 mat[,3]<-n2 mat[,4]<-test$psihat[,2] mat[,5]<-test$test[,2] mat[,6]<-test$test[,3] mat[,7]<-test$psihat[,3] mat[,8]<-test$psihat[,4] list(output=mat,crit=test$crit) } errfun<-function(yhat,y,error=sqfun){ # # Compute error terms for regpre # # yhat is an n by nboot matrix # y is n by 1. # ymat<-matrix(y,nrow(yhat),ncol(yhat)) blob<-yhat-ymat errfun<-error(blob) errfun } near3d<-function(x,pt,fr=.8,m){ # determine which values in x are near pt # based on fr * cov.mve # # x is assumed to be an n by p matrix # pt is a vector of length p (a point in p-space). # m is cov.mve(x) computed by runm3d # library(MASS) if(!is.matrix(x))stop("Data are not stored in a matrix.") dis<-sqrt(mahalanobis(x,pt,m$cov)) dflag<-dis < fr dflag } run3hat<-function(x,y,pts,fr=.8,tr=.2){ # # Compute y hat for each row of data in the matrix pts # using a running interval method # # fr controls amount of smoothing # tr is the amount of trimming # x is an n by p matrix of predictors. # pts is an m by p matrix, m>=1. # library(MASS) set.seed(12) if(!is.matrix(x))stop("Predictors are not stored in a matrix.") if(!is.matrix(pts))stop("The third argument, pts, must be a matrix.") m<-cov.mcd(x) rmd<-1 # Initialize rmd nval<-1 for(i in 1:nrow(pts)){ rmd[i]<-mean(y[near3d(x,pts[i,],fr,m)],tr) nval[i]<-length(y[near3d(x,pts[i,],fr,m)]) } list(rmd=rmd,nval=nval) } idb<-function(x,n){ # # Determine whether a sequence of integers contains a 1, 2, ..., n. # Return idb[i]=1 if the value i is in x; 0 otherwise. 
# This function is used by regpre # m1<-matrix(0,n,n) m1<-outer(c(1:n),x,"-") m1<-ifelse(m1==0,1,0) idb<-apply(m1,1,sum) idb<-ifelse(idb>=1,0,1) idb } hratio<-function(x,y,regfun=bmreg){ # # Compute a p by p matrix of half-slope ratios # # regfun can be any s-plus function that returns the coefficients in # the vector regfun$coef, the first element of which contains the # estimated intercept, the second element contains the estimate of # the first predictor, etc. # x<-as.matrix(x) xmat<-matrix(0,nrow(x),ncol(x)) mval<-floor(length(y)/2) mr<-length(y)-mval xmatl<-matrix(0,mval,ncol(x)) xmatr<-matrix(0,mr,ncol(x)) hmat<-matrix(NA,ncol(x),ncol(x)) isub<-c(1:length(y)) ksub<-c(1:ncol(x))+1 for (k in 1:ncol(x)){ xord<-order(x[,k]) yord<-y[xord] yl<-yord[isub<=mval] yr<-yord[isub>mval] for (j in 1:ncol(x)){ xmat[,j]<-x[xord,j] xmatl[,j]<-xmat[isub<=mval,j] xmatr[,j]<-xmat[isub>mval,j] } coefl<-regfun(xmatl,yl)$coef coefr<-regfun(xmatr,yr)$coef hmat[k,]<-coefr[ksub[ksub>=2]]/coefl[ksub[ksub>=2]] hmat } } rung3d<-function(x,y,est=mom,fr=1,plotit=T,theta=50,phi=25,pyhat=F, expand=.5,scale=F,zscale=T,nmin=0,xout=F,outfun=out,SEED=T, xlab="X",ylab="Y",zlab="",pr=T,duplicate="error",...){ # # running mean using interval method # # fr (the span) controls amount of smoothing # est is the measure of location. # (Goal is to determine est(y) given x.) # x is an n by p matrix of predictors. # # pyhat=T, predicted values are returned. # library(MASS) library(akima) #library(lqs) if(SEED)set.seed(12) # set seed for cov.mve if(!is.matrix(x))stop("Data are not stored in a matrix.") if(nrow(x) != length(y))stop("Number of rows in x does not match length of y") temp<-cbind(x,y) p<-ncol(x) p1<-p+1 temp<-elimna(temp) # Eliminate any rows with missing values. if(xout){ keepit<-outfun(x,plotit=F)$keep x<-x[keepit,] y<-y[keepit] } if(zscale){ for(j in 1:p1){ temp[,j]<-(temp[,j]-median(temp[,j]))/mad(temp[,j]) }} x<-temp[,1:p] y<-temp[,p1] m<-cov.mve(x) iout<-c(1:nrow(x)) rmd<-1 # Initialize rmd nval<-1 for(i in 1:nrow(x))rmd[i]<-est(y[near3d(x,x[i,],fr,m)],...) for(i in 1:nrow(x))nval[i]<-length(y[near3d(x,x[i,],fr,m)]) if(ncol(x)==2){ if(plotit){ if(pr){ if(!scale)print("With dependence, suggest using scale=T") } fitr<-rmd[nval>nmin] y<-y[nval>nmin] x<-x[nval>nmin,] iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] # Eliminate duplicate points in the x-y plane # This is necessary when doing three dimensional plots # with the S-PLUS function interp mkeep<-x[iout>=1,] fit<-interp(mkeep[,1],mkeep[,2],fitr,duplicate=duplicate) persp(fit,theta=theta,phi=phi,expand=expand, scale=scale,xlab=xlab,ylab=ylab,zlab=zlab) }} if(pyhat)last<-rmd if(!pyhat)last <- "Done" last } mbmreg<-function(x,y,iter=20,bend=2*sqrt(ncol(x)+1)/nrow(x)){ # # Compute a bounded M regression estimator using # Huber Psi and Schweppe weights with # regression outliers getting a weight of zero. # # This is the modified M-regression estimator in Chapter 8 # # The predictors are assumed to be stored in the n by p matrix x. 
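# The difference from bmreg is that observations flagged as regression
# outliers, those whose standardized residuals from an initial LMS fit exceed
# 2.5 in absolute value, are given a weight of zero throughout the iterations.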
# x<-as.matrix(x) if(is.matrix(y)){ if(ncol(y)==1)y=as.vector(y) } x1<-cbind(1,x) library(MASS) reslms<-lmsreg(x,y)$resid sighat<-sqrt(median(reslms^2)) sighat<-1.4826*(1+(5/(length(y)-ncol(x)-1)))*sighat if(sighat==0)warning("The estimated measure of scale, based on the residuals using lms regression, is zero") temp<-ifelse(sighat*reslms>0,abs(reslms)/sighat,0*reslms) wt<-ifelse(temp<=2.5,1,0) init<-lsfit(x,y,wt) resid<-init$residuals nu<-sqrt(1-hat(x1)) low<-ncol(x)+1 for(it in 1:iter){ ev<-sort(abs(resid)) scale<-median(ev[c(low:length(y))])/qnorm(.75) rov<-(resid/scale)/nu psi<-ifelse(abs(rov)<=bend,rov,bend*sign(rov)) # Huber Psi wt<-nu*psi/(resid/scale) wt<-ifelse(temp<=2.5,wt,0) new<-lsfit(x,y,wt) if(abs(max(new$coef-init$coef)<.0001))break init$coef<-new$coef resid<-new$residuals } resid<-y-x1%*%new$coef if(abs(max(new$coef-init$coef)>=.0001)) paste("failed to converge in",iter,"steps") list(coef=new$coef,residuals=resid,w=wt) } rankisub<-function(x,y){ # # compute phat and an estimate of its variance # x<-x[!is.na(x)] # Remove missing values from x y<-y[!is.na(y)] # Remove missing values from y u<-outer(x,y,FUN="<") p1<-0 p2<-0 for (j in 1:length(y)){ temp<-outer(u[,j],u[,j]) p1<-p1+sum(temp)-sum(u[,j]*u[,j]) } for (i in 1: length(x)){ temp<-outer(u[i,],u[i,]) p2<-p2+sum(temp)-sum(u[i,]*u[i,]) } p<-sum(u)/(length(x)*length(y)) pad<-p if(p==0)pad<-.5/(length(x)*length(y)) if(p==1)pad<-(1-.5)/(length(x)*length(y)) p1<-p1/(length(x)*length(y)*(length(x)-1)) p2<-p2/(length(x)*length(y)*(length(y)-1)) var<-pad*(1.-pad)*(((length(x)-1)*(p1-p^2)/(pad*(1-pad))+1)/(1-1/length(y))+ ((length(y)-1)*(p2-p^2)/(pad*(1-pad))+1)/(1-1/length(x))) var<-var/(length(x)*length(y)) list(phat=p,sqse=var) } pbcor<-function(x,y,beta=.2){ # Compute the percentage bend correlation between x and y. # # beta is the bending constant for omega sub N. # if(length(x)!=length(y))stop("The vectors do not have equal lengths") if(sum(is.na(c(x,y)))!=0){ m1<-matrix(c(x,y),length(x),2) m1<-elimna(m1) x<-m1[,1] y<-m1[,2] # Have eliminated missing values } temp<-sort(abs(x-median(x))) omhatx<-temp[floor((1-beta)*length(x))] temp<-sort(abs(y-median(y))) omhaty<-temp[floor((1-beta)*length(y))] a<-(x-pbos(x,beta))/omhatx b<-(y-pbos(y,beta))/omhaty a<-ifelse(a<=-1,-1,a) a<-ifelse(a>=1,1,a) b<-ifelse(b<=-1,-1,b) b<-ifelse(b>=1,1,b) pbcor<-sum(a*b)/sqrt(sum(a^2)*sum(b^2)) test<-pbcor*sqrt((length(x) - 2)/(1 - pbcor^2)) sig<-2*(1 - pt(abs(test),length(x)-2)) list(cor=pbcor,test=test,siglevel=sig) } rmanovab<-function(x,tr=.2,alpha=.05,grp=0,nboot=599){ # # A bootstrap-t for comparing the trimmed means of dependent groups. # By default, 20% trimming is used with B=599 bootstrap samples. # # The optional argument grp is used to select a subset of the groups # and exclude the rest. # # x can be an n by J matrix or it can have list mode # if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ if(sum(grp)==0)grp<-c(1:length(x)) # put the data in an n by J matrix mat<-matrix(0,length(x[[1]]),length(grp)) for (j in 1:length(grp))mat[,j]<-x[[grp[j]]] } if(is.matrix(x)){ if(sum(grp)==0)grp<-c(1:ncol(x)) mat<-x[,grp] } if(sum(is.na(mat)>=1))stop("Missing values are not allowed.") J<-ncol(mat) connum<-(J^2-J)/2 bvec<-matrix(0,connum,nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. 
Please wait.") data<-matrix(sample(nrow(mat),size=nrow(mat)*nboot,replace=T),nrow=nboot) xcen<-matrix(0,nrow(mat),ncol(mat)) for (j in 1:J)xcen[,j]<-mat[,j]-mean(mat[,j],tr) #Center data bvec<-apply(data,1,tsubrmanovab,xcen,tr) # bvec is vector of nboot bootstrap test statistics. icrit<-round((1-alpha)*nboot) bvec<-sort(bvec) crit<-bvec[icrit] test<-rmanova(x,tr,grp)$test list(teststat=test,crit=crit) } tsubrmanovab<-function(isub,x,tr){ # # Compute test statistic for trimmed means # when comparing dependent groups. # By default, 20% trimmed means are used. # isub is a vector of length n, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # # This function is used by rmanovab # tsub<-rmanovab1(x[isub,],tr=tr)$test tsub } rmanovab1<-function(x,tr=.2,grp=c(1:length(x))){ # # A heteroscedastic one-way repeated measures ANOVA for trimmed means. # # The data are assumed to be stored in $x$ which can # be either an n by J matrix, or an S-PLUS variable having list mode. # If the data are stored in list mode, # length(x) is assumed to correspond to the total number of groups. # By default, the null hypothesis is that all group have a common mean. # To compare a subset of the groups, use grp to indicate which # groups are to be compared. For example, if you type the # command grp<-c(1,3,4), and then execute this function, groups # 1, 3, and 4 will be compared with the remaining groups ignored. # if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ J<-length(grp) # The number of groups to be compared m1<-matrix(x[[grp[1]]],length(x[[grp[1]]]),1) for(i in 2:J){ # Put the data into an n by J matrix m2<-matrix(x[[grp[i]]],length(x[[i]]),1) m1<-cbind(m1,m2) } } if(is.matrix(x)){ if(length(grp)=ncol(x))m1<-as.matrix(x) J<-ncol(x) } # # Raw data are now in the matrix m1 # m2<-matrix(0,nrow(m1),ncol(m1)) xvec<-1 g<-floor(tr*nrow(m1)) #2g is the number of observations trimmed. for(j in 1:ncol(m1)){ # Putting Winsorized values in m2 m2[,j]<-winval(m1[,j],tr) xvec[j]<-mean(m1[,j],tr) } xbar<-mean(xvec) qc<-(nrow(m1)-2*g)*sum((xvec-xbar)^2) m3<-matrix(0,nrow(m1),ncol(m1)) m3<-sweep(m2,1,apply(m2,1,mean)) # Sweep out rows m3<-sweep(m3,2,apply(m2,2,mean)) # Sweep out columns m3<-m3+mean(m2) # Grand Winsorized mean swept in qe<-sum(m3^2) test<-(qc/(qe/(nrow(m1)-2*g-1))) # # Next, estimate the adjusted degrees of freedom # v<-winall(m1)$cov vbar<-mean(v) vbard<-mean(diag(v)) vbarj<-1 for(j in 1:J){ vbarj[j]<-mean(v[j,]) } A<-J*J*(vbard-vbar)^2/(J-1) B<-sum(v*v)-2*J*sum(vbarj^2)+J*J*vbar^2 ehat<-A/B etil<-(nrow(m2)*(J-1)*ehat-2)/((J-1)*(nrow(m2)-1-(J-1)*ehat)) etil<-min(1.,etil) df1<-(J-1)*etil df2<-(J-1)*etil*(nrow(m2)-2*g-1) siglevel<-1-pf(test,df1,df2) list(test=test,df=c(df1,df2),siglevel=siglevel,tmeans=xvec,ehat=ehat,etil=etil) } mee<-function(x,y,alpha=.05){ # # For two independent groups, compute a 1-\alpha confidence interval # for p=P(X 0){print("Warning: Tied values detected") print("so even if distributions are identical,") print("P(X 0) print("Tied values detected. Interchanging columns might give different results. 
That is, comparing rows based on P(XY)") ck<-(K^2-K)/2 cj<-(J^2-J)/2 tc<-ck*cj if(tc>28){ print("Warning: The number of contrasts exceeds 28.") print("The critical value being used is based on 28 contrasts") tc<-28 } idmat<-matrix(NA,nrow=tc,ncol=8) dimnames(idmat)<-list(NULL,c("row","row","col","col","ci.lower","ci.upper","estimate","test.stat")) crit<-smmcrit(300,tc) if(alpha != .05){ crit<-smmcrit01(300,tc) if(alpha != .01){print("Warning: Only alpha = .05 and .01 are allowed,") print("alpha = .01 is being assumed.") } } phatsqse<-0 phat<-0 allit<-0 jcount<-0-K it<-0 for(j in 1:J){ for(jj in 1:J){ if(j < jj){ for(k in 1:K){ for(kk in 1:K){ if(k < kk){ it<-it+1 idmat[it,1:4]<-c(j,jj,k,kk) }}}}} jcount<-jcount+K for(k in 1:K){ for(kk in 1:K){ if(k < kk){ allit<-allit+1 xx<-x[[grp[k+jcount]]] yy<-x[[grp[kk+jcount]]] temp<-rankisub(xx,yy) phat[allit]<-temp$phat phatsqse[allit]<-temp$sqse }}}} # # Compute the contrast matrix. Each row contains a 1, -1 and the rest 0 # That is, all pairwise comparisons among K groups. # con<-matrix(0,cj,J) id<-0 Jm<-J-1 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[id,j]<-1 con[id,k]<-0-1 }} IK<-diag(ck) B<-kron(con,IK) ntest<-ck*(J^2-J)/2 test<-0 civecl<-0 civecu<-0 for (itest in 1:ntest){ temp1<-sum(B[itest,]*phat) idmat[itest,7]<-temp1 idmat[itest,8]<-temp1/sqrt(sum(B[itest,]^2*phatsqse)) idmat[itest,5]<-temp1-crit*sqrt(sum(B[itest,]^2*phatsqse)) idmat[itest,6]<-temp1+crit*sqrt(sum(B[itest,]^2*phatsqse)) } nsig<-sum((abs(idmat[,8])>crit)) list(phat=phat,ci=idmat,crit=crit,nsig=nsig) } regts1<-function(vstar,yhat,res,mflag,x,tr){ ystar<-yhat+res*vstar bres<-ystar-mean(ystar,tr) rval<-0 for (i in 1:nrow(x)){ rval[i]<-sum(bres[mflag[,i]]) } rval } bptd<-function(x,tr=.2,alpha=.05,con=0,nboot=599){ # # Using the percentile t bootstrap method, # compute a .95 confidence interval for all linear contasts # specified by con, a J by C matrix, where C is the number of # contrasts to be tested, and the columns of con are the # contrast coefficients. # # If con is not specified, all pairwise comparisons are performed. # # The trimmed means of dependent groups are being compared. # By default, 20% trimming is used with B=599 bootstrap samples. # # x can be an n by J matrix or it can have list mode # if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ if(is.matrix(con)){ if(length(x)!=nrow(con))stop("The number of rows in con is not equal to the number of groups.") }} if(is.list(x)){ # put the data in an n by J matrix mat<-matrix(0,length(x[[1]]),length(x)) for (j in 1:length(x))mat[,j]<-x[[j]] } J<-ncol(mat) Jm<-J-1 if(sum(con^2)==0){ d<-(J^2-J)/2 con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} if(is.matrix(x)){ if(ncol(x)!=nrow(con))stop("The number of rows in con is not equal to the number of groups.") mat<-x } if(sum(is.na(mat)>=1))stop("Missing values are not allowed.") J<-ncol(mat) connum<-ncol(con) bvec<-matrix(0,connum,nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. # data is an nboot by n matrix xcen<-matrix(0,nrow(mat),ncol(mat)) #An n by J matrix xbars<-matrix(0,nboot,ncol(mat)) psihat<-matrix(0,connum,nboot) print("Taking bootstrap samples. 
Please wait.") data<-matrix(sample(nrow(xcen),size=nrow(mat)*nboot,replace=T),nrow=nboot) for (j in 1:J){ xcen[,j]<-mat[,j]-mean(mat[,j],tr) #Center data xbars[,j]<-apply(data,1,bptdmean,xcen[,j],tr) } for (ic in 1:connum){ paste("Working on contrast number",ic) bvec[ic,]<-apply(data,1,bptdsub,xcen,tr,con[,ic]) # bvec is a connum by nboot matrix containing the bootstrap sq standard error psihat[ic,]<-apply(xbars,1,bptdpsi,con[,ic]) } bvec<-psihat/sqrt(bvec) #bvec now contains bootstrap test statistics bvec<-abs(bvec) #Doing two-sided confidence intervals icrit<-round((1-alpha)*nboot) critvec<-apply(bvec,2,max) critvec<-sort(critvec) crit<-critvec[icrit] psihat<-matrix(0,connum,4) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper")) test<-matrix(NA,connum,3) dimnames(test)<-list(NULL,c("con.num","test","se")) isub<-c(1:nrow(mat)) tmeans<-apply(mat,2,mean,trim=tr) sqse<-1 psi<-1 for (ic in 1:ncol(con)){ sqse[ic]<-bptdsub(isub,mat,tr,con[,ic]) psi[ic]<-sum(con[,ic]*tmeans) psihat[ic,1]<-ic psihat[ic,2]<-psi[ic] psihat[ic,3]<-psi[ic]-crit*sqrt(sqse[ic]) psihat[ic,4]<-psi[ic]+crit*sqrt(sqse[ic]) test[ic,1]<-ic test[ic,2]<-psi[ic]/sqrt(sqse[ic]) test[ic,3]<-sqrt(sqse[ic]) } list(test=test,psihat=psihat,crit=crit,con=con) } twomanbt<-function(x,y,tr=.2,alpha=.05,nboot=599){ # # Two-sample Behrens-Fisher problem. # # For each of two independent groups, # have p measures for each subject. The goal is to compare the # trimmed means of the first measure, the trimmed means for the second # and so on. So there are a total of p comparisons between the two # groups, one for each measure. # # The percentile t bootstrap method is used to # compute a .95 confidence interval. # # By default, 20% trimming is used with B=599 bootstrap samples. # # x contains the data for the first group; it # can be an n by J matrix or it can have list mode. # y contains the data for the second group. # if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(!is.list(y) && !is.matrix(y))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ # put the data in an n by p matrix matx<-matrix(0,length(x[[1]]),length(x)) for (j in 1:length(x))matx[,j]<-x[[j]] } if(is.list(y)){ # put the data in an n by p matrix maty<-matrix(0,length(y[[1]]),length(y)) for (j in 1:length(y))maty[,j]<-y[[j]] } if(is.matrix(x)){ matx<-x } if(is.matrix(y)){ maty<-y } if(ncol(matx)!=ncol(maty))stop("The number of variables for group one is not equal to the number for group 2") if(sum(is.na(mat)>=1))stop("Missing values are not allowed.") J<-ncol(mat) connum<-ncol(matx) bvec<-matrix(0,connum,nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. xcen<-matrix(0,nrow(matx),ncol(matx)) ycen<-matrix(0,nrow(maty),ncol(maty)) for (j in 1:connum)xcen[,j]<-matx[,j]-mean(matx[,j],tr) #Center data for (j in 1:connum)ycen[,j]<-maty[,j]-mean(maty[,j],tr) #Center data print("Taking bootstrap samples. Please wait.") bootx<-sample(nrow(matx),size=nrow(matx)*nboot,replace=T) booty<-sample(nrow(maty),size=nrow(maty)*nboot,replace=T) matval<-matrix(0,nrow=nboot,ncol=connum) for (j in 1:connum){ datax<-matrix(xcen[bootx,j],ncol=nrow(matx)) datay<-matrix(ycen[booty,j],ncol=nrow(maty)) paste("Working on variable", j) top<- apply(datax, 1., mean, tr) - apply(datay, 1., mean, tr) botx <- apply(datax, 1., trimse, tr) boty <- apply(datay, 1., trimse, tr) matval[,j]<-abs(top)/sqrt(botx^2. + boty^2.) 
}
bvec<-apply(matval,1,max)
icrit<-round((1-alpha)*nboot)
bvec<-sort(bvec)
crit<-bvec[icrit]
psihat<-matrix(0,ncol=4,nrow=connum)
dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper"))
test<-matrix(0,ncol=3,nrow=connum)
dimnames(test)<-list(NULL,c("con.num","test","se"))
for(j in 1:ncol(matx)){
temp<-yuen(matx[,j],maty[,j],tr=tr)
test[j,1]<-j
test[j,2]<-abs(temp$test)
test[j,3]<-temp$se
psihat[j,1]<-j
psihat[j,2]<-mean(matx[,j],tr)-mean(maty[,j],tr)
psihat[j,3]<-mean(matx[,j],tr)-mean(maty[,j],tr)-crit*temp$se
psihat[j,4]<-mean(matx[,j],tr)-mean(maty[,j],tr)+crit*temp$se
}
list(psihat=psihat,teststat=test,critical.value=crit)
}
bootdep<-function(x,tr=.2,nboot=500){
#
# x is a matrix (n by p) or has list mode
# Goal: Obtain bootstrap samples and compute
# the trimmed mean for each of the p variables.
# Return the bootstrap trimmed means in a matrix
#
# tr is the amount of trimming
# nboot is the number of bootstrap samples
#
if(is.matrix(x))m1<-x
if(is.list(x)){
# put the data into a matrix
m1<-matrix(NA,nrow=length(x[[1]]),ncol=length(x))
for(j in 1:length(x))m1[,j]<-x[[j]]
}
data<-matrix(sample(nrow(m1),size=nrow(m1)*nboot,replace=T),nrow=nboot)
bvec<-matrix(NA,ncol=ncol(m1),nrow=nboot)
for(j in 1:ncol(m1)){
temp<-m1[,j]
bvec[,j]<-apply(data, 1., bootdepsub,temp,tr)
}
# return a nboot by p matrix of bootstrap trimmed means.
bvec
}
bootdepsub<-function(isub,x,tr){
tsub<-mean(x[isub],tr)
tsub
}
corb<-function(x,y,corfun=pbcor,nboot=599,SEED=T,...){
#
# Compute a .95 confidence interval for a correlation.
# The default correlation is the percentage bend.
#
# The function corfun is any s-plus function that returns a
# correlation coefficient in corfun$cor. The functions pbcor and
# wincor follow this convention.
#
# When using Pearson's correlation, and when n<250, use
# lsfitci instead.
#
# The default number of bootstrap samples is nboot=599
#
est<-corfun(x,y,...)$cor
if(SEED)set.seed(2) # set seed of random number generator so that
# results can be duplicated.
#print("Taking bootstrap samples. Please wait.")
data<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot)
bvec<-apply(data,1,corbsub,x,y,corfun,...) # A 1 by nboot matrix.
ihi<-floor(.975*nboot+.5)
ilow<-floor(.025*nboot+.5)
bsort<-sort(bvec)
corci<-1
corci[1]<-bsort[ilow]
corci[2]<-bsort[ihi]
phat <- sum(bvec < 0)/nboot
sig <- 2 * min(phat, 1 - phat)
list(cor.ci=corci,p.value=sig,cor.est=est)
}
corbsub<-function(isub,x,y,corfun,...){
#
# Compute correlation for x[isub] and y[isub]
# isub is a vector of length n,
# a bootstrap sample from the sequence of integers
# 1, 2, 3, ..., n
#
# This function is used by other functions when computing
# bootstrap estimates.
#
# corfun is some correlation function already stored in S-PLUS
#
corbsub<-corfun(x[isub],y[isub],...)$cor
corbsub
}
depreg<-function(x,y){
#
# Compute the depth regression estimator.
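# Example with a single predictor (added illustration; data are hypothetical):
#   x<-rnorm(30)
#   y<-2*x+rnorm(30)
#   depreg(x,y)$coef   # intercept and slope of the deepest regression line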
# Only a single predictor is allowed in this version # if(is.matrix(x)){ if(ncol(x)>=2)stop("Only a single predicor is allowed") x<-as.vector(x) } ord<-order(x) xs<-x[ord] ys<-y[ord] vec1<-outer(ys,ys,"-") vec2<-outer(xs,xs,"-") v1<-vec1[vec2>0] v2<-vec2[vec2>0] slope<-v1/v2 vec3<-outer(ys,ys,"+") vec4<-outer(xs,xs,"+") v3<-vec3[vec2>0] v4<-vec4[vec2>0] deep<-NA inter<-v3/2-slope*v4/2 temp<-matrix(c(inter,slope),ncol=2) deep<-apply(temp,1,rdepth,x,y) best<-max(deep) coef<-NA coef[2]<-mean(slope[deep==best]) coef[1]<-mean(inter[deep==best]) res<-y-coef[2]*x-coef[1] list(coef=coef,residuals=res) } tsgreg<-function(x,y,tries=(length(y)^2-length(y))/2){ # # x<-as.matrix(x) if(nrow(x)!=length(y))stop("Length of y must match the number of rows of x") # eliminate any rows with missing values. m1<-cbind(x,y) m1<-elimna(m1) x<-m1[,1:ncol(x)] y<-m1[,ncol(x)+1] set.seed(2) data<-matrix(NA,ncol=ncol(x)+1,nrow=tries) for(i in 1:tries){ data[i,]<-sample(length(y),size=ncol(x)+1,replace=F) } bvec <- apply(data, 1,tsgregs1,x,y) coef<-0 numzero<-0 loc<-0 for (i in 1:ncol(x)){ ip<-i+1 temp<-bvec[ip,] loc[i]<-median(x[,i]) coef[i+1]<-median(temp[temp!=0]) numzero[i]<-length(temp[temp==0]) } ip<-ncol(x)+1 coef[1]<-median(y)-sum(coef[2:ip]*loc) res<-y-x %*% coef[2:ip] - coef[1] list(coef=coef,residuals=res,numzero=numzero) } tsgregs1<-function(isub,x,y){ # # This function is used by tsgreg # # Perform regression using x[isub,] to predict y[isub] # isub is a vector of length nsub, determined by tsgreg # tsgregs1<-lsfit(x[isub,],y[isub])$coef } lts1reg<-function(x,y,tr=.2,h=NA){ # # Compute the least trimmed squares regression estimator. # Only a single predictor is allowed in this version # if(is.na(h))h<-length(x)-floor(tr * length(x)) ord<-order(x) xs<-x[ord] ys<-y[ord] vec1<-outer(ys,ys,"-") vec2<-outer(xs,xs,"-") v1<-vec1[vec2>0] v2<-vec2[vec2>0] slope<-v1/v2 vec3<-outer(ys,ys,"+") vec4<-outer(xs,xs,"+") v3<-vec3[vec2>0] v4<-vec4[vec2>0] val<-NA inter<-v3/2-slope*v4/2 for(i in 1:length(slope)){ #risk<-(y[vec2>0]-slope[i]*x[vec2>0]-inter[i])^2 risk<-(y-slope[i]*x-inter[i])^2 risk<-sort(risk) val[i]<-sum(risk[1:h]) } best<-min(val) coef<-NA coef[2]<-mean(slope[val==best]) coef[1]<-mean(inter[val==best]) res<-y-coef[2]*x-coef[1] list(coef=coef,residuals=res) } cidmul<-function(x,alpha=.05){ # # Perform Cliff's method for all pairs of J independent groups. # Unlike the functioni meemul, ties are allowed. # The familywise type I error probability is controlled by using # a critical value from the Studentized maximum modulus distribution. # # The data are assumed to be stored in $x$ in list mode. # Length(x) is assumed to correspond to the total number of groups, J # It is assumed all groups are independent. # # Missing values are automatically removed. # # The default value for alpha is .05. Any other value results in using # alpha=.01. # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in list mode.") J<-length(x) CC<-(J^2-J)/2 test<-matrix(NA,CC,5) for(j in 1:J){ xx<-!is.na(x[[j]]) val<-x[[j]] x[[j]]<-val[xx] # Remove missing values } dimnames(test)<-list(NULL,c("Group","Group","d","ci.lower","ci.upper")) jcom<-0 crit<-smmcrit(200,CC) if(alpha!=.05)crit<-smmcrit01(200,CC) alpha<-1-pnorm(crit) for (j in 1:J){ for (k in 1:J){ if (j < k){ temp<-cid(x[[j]],x[[k]],alpha) jcom<-jcom+1 test[jcom,1]<-j test[jcom,2]<-k test[jcom,3]<-temp$d test[jcom,4]<-temp$cl test[jcom,5]<-temp$cu }}} list(test=test) } man2pb<-function(x,y,alpha=.05,nboot=NA,crit=NA){ # # Two-sample Behrens-Fisher problem. 
# # For each of two independent groups, # have P measures for each subject. The goal is to compare the 20% # trimmed means of the first group to the trimmed means for the second; # this is done for each of the P measures. # # The percentile bootstrap method is used to # compute a .95, or .975, or .99 confidence interval. # # Only 20% trimming is allowed. # # x contains the data for the first group; it # can be an n by J matrix or it can have list mode. # y contains the data for the second group. # # Vectors with missing values are eliminated from the analysis. # if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(!is.list(y) && !is.matrix(y))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ # put the data in an n by p matrix matx<-matrix(0,length(x[[1]]),length(x)) for (j in 1:length(x))matx[,j]<-x[[j]] } if(is.list(y)){ # put the data in an n by p matrix maty<-matrix(0,length(y[[1]]),length(y)) for (j in 1:length(y))maty[,j]<-y[[j]] } if(is.matrix(x)){ matx<-x } if(is.matrix(y)){ maty<-y } if(ncol(matx)!=ncol(maty))stop("The number of variables for group 1 is not equal to the number for group 2") if(sum(is.na(matx)>=1))matx<-elimna(matx) if(sum(is.na(maty)>=1))maty<-elimna(maty) J<-ncol(matx) connum<-ncol(matx) if(is.na(nboot)){ if(ncol(matx)<=4)nboot<-2000 if(ncol(matx)>4)nboot<-5000 } # # Determine critical value # if(ncol(matx)==2){ if(alpha==.05)crit<-.0125 if(alpha==.025)crit<-.0060 if(alpha==.01)crit<-.0015 } if(ncol(matx)==3){ if(alpha==.05)crit<-.007 if(alpha==.025)crit<-.003 if(alpha==.01)crit<-.001 } if(ncol(matx)==4){ if(alpha==.05)crit<-.0055 if(alpha==.025)crit<-.0020 if(alpha==.01)crit<-.0005 } if(ncol(matx)==5){ if(alpha==.05)crit<-.0044 if(alpha==.025)crit<-.0016 if(alpha==.01)crit<-.0005 } if(ncol(matx)==6){ if(alpha==.05)crit<-.0038 if(alpha==.025)crit<-.0018 if(alpha==.01)crit<-.0004 } if(ncol(matx)==7){ if(alpha==.05)crit<-.0028 if(alpha==.025)crit<-.0010 if(alpha==.01)crit<-.0002 } if(ncol(matx)==8){ if(alpha==.05)crit<-.0026 if(alpha==.025)crit<-.001 if(alpha==.01)crit<-.0002 } if(ncol(matx)>8){ # Use an approximation of the critical value if(alpha==.025)warning("Can't determine a critical value when alpha=.025 and the number of groups exceeds 8.") nmin<-min(nrow(matx),nrow(maty)) if(alpha==.05){ if(nmin<100)wval<-smmcrit(60,ncol(matx)) if(nmin>=100)wval<-smmcrit(300,ncol(matx)) wval<-0-wval crit<-pnorm(wval) } if(alpha==.01){ if(nmin<100)wval<-smmcrit01(60,ncol(matx)) if(nmin>=100)wval<-smmcrit01(300,ncol(matx)) wval<-0-wval crit<-pnorm(wval) } } if(is.na(crit))warning("Critical values can be determined for alpha=.05, .025 and .01 only") icl<-ceiling(crit*nboot) icu<-ceiling((1-crit)*nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") bootx<-bootdep(matx,tr=.2,nboot) booty<-bootdep(maty,tr=.2,nboot) # # Now have an nboot by J matrix of bootstrap values. 
# test<-1 for (j in 1:connum){ test[j]<-sum(bootx[,j].5)test[j]<-1-test[j] } output <- matrix(0, connum, 5) dimnames(output) <- list(NULL, c("variable #", "psihat", "test", "ci.lower", "ci.upper")) tmeanx <- apply(matx, 2, mean, trim = .2) tmeany <- apply(maty, 2, mean, trim = .2) psi <- 1 for(ic in 1:connum) { output[ic, 2] <- tmeanx[ic]-tmeany[ic] output[ic, 1] <- ic output[ic, 3] <- test[ic] temp <- sort(bootx[,ic]-booty[,ic]) print(length(temp)) output[ic, 4] <- temp[icl] output[ic, 5] <- temp[icu] } list(output = output, crit.value = crit) } qhatds1<-function(isubx,x,y){ # # function used by qhat when working on bootstrap estimates. # xx<-x[isubx] yy<-y[isubx] group<-disker(xx,yy,x,op=2)$zhat group } qhatd<-function(x,y,nboot=50){ # # Estimate Q, a nonparametric measure of effect size, using # the .632 method of estimating prediction error. # (See Efron and Tibshirani, 1993, pp. 252--254) # # The default number of bootstrap samples is nboot=100 # # This function is for dependent groups. For independent groups, use # qhati # set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data<-matrix(sample(length(x),size=length(x)*nboot,replace=T),nrow=nboot) # data is an nboot by n matrix containing subscripts for bootstrap sample bid<-apply(data,1,idb,length(x)) # bid is a n by nboot matrix. If the jth bootstrap sample from # 1, ..., n contains the value i, bid[i,j]=0; otherwise bid[i,j]=1 yhat<-apply(data,1,qhatds1,x,y) bi<-apply(bid,1,sum) # B sub i in notation of Efron and Tibshirani, p. 253 temp<-(bid*yhat) diff<-apply(temp,1,sum) temp<-diff/bi ep0<-sum(temp[!is.na(temp)])/length(y) aperror<-disker(x,y)$phat # apparent error regpre<-.368*aperror+.632*ep0 list(app.error=aperror,qhat.632=regpre) } winmean<-function(x,tr=.2){ winmean<-mean(winval(x,tr)) winmean } kerden<-function(x,q=.5,xval=0){ # Compute the kernel density estimator of the # probability density function evaluated at the qth quantile. # # x contains vector of observations # q is the quantile of interest, the default is the median. # If you want to evaluate f hat at xval rather than at the # q th quantile, set q=0 and xval to desired value. # y<-sort(x) n<-length(x) temp<-idealf(x) h<-1.2*(temp$qu-temp$ql)/n^(.2) iq<-floor(q*n+.5) qhat<-y[iq] if (q==0) qhat<-xval xph<-qhat+h A<-length(y[y<=xph]) xmh<-qhat-h B<-length(y[y0 & !is.na(l)])+length(u[u<0 & !is.na(u)]) qhat<-c(1:length(x))/length(x) m<-matrix(c(qhat,l,u),length(x),3) dimnames(m)<-list(NULL,c("qhat","lower","upper")) if(plotit){ temp2 <- m[, 2] temp2 <- temp2[!is.na(temp2)] xsort<-sort(x) ysort<-sort(y) del<-0 for (i in 1:length(x))del[i]<-ysort[round(length(y)*i/length(x))]-xsort[i] xaxis<-c(xsort,xsort,xsort) yaxis<-c(del,m[,2],m[,3]) plot(xaxis,yaxis,type="n",ylab="delta",xlab="x (first group)") lines(xsort,del) lines(xsort,m[,2],lty=2) lines(xsort,m[,3],lty=2) temp <- summary(x) text(temp[3], min(temp2), "+") text(temp[2], min(temp2), "o") text(temp[5], min(temp2), "o") } list(m=m,crit=crit,numsig=num,pc=pc) } runcor<-function(x,y,z,fr=1,corflag=F,corfun=pbcor,plotit=T,rhat=F){ # # Estimate how the correlation between x and y varies with z # # running correlation using interval method # # fr controls amount of smoothing # # corfun is the correlation to be used. 
It is assumed that # corfun is an s-plus function that returns a correlation coefficient # in corfun$cor # # To use Pearsons correlation, set corflag=T # temp<-cbind(x,y,z) # Eliminate any rows with missing values temp<-elimna(temp) x<-temp[,1] y<-temp[,2] z<-temp[,3] plotit<-as.logical(plotit) rmd<-NA if(!corflag){ for(i in 1:length(x)){ flag<-near(z,z[i],fr) if(sum(flag)>2)rmd[i]<-corfun(x[flag],y[flag])$cor }} if(corflag){ for(i in 1:length(x)){ flag<-near(z,z[i],fr) if(sum(flag)>2)rmd[i]<-cor(x[flag],y[flag]) }} if(plotit){ plot(c(max(z),min(z),z),c(1,-1,rmd),xlab="Modifier",ylab="Correlation",type="n") sz<-sort(z) zorder<-order(z) sysm<-rmd[zorder] lines(sz,sysm) } if(!rhat)rmd<-"Done" rmd } twocor<-function(x1,y1,x2,y2,corfun=pbcor,nboot=599,alpha=.05,SEED=T,...){ # # Compute a .95 confidence interval for the # difference between two correlation coefficients # corresponding to two independent groups. # # the function corfun is any s-plus function that returns a # correlation coefficient in corfun$cor. The functions pbcor and # wincor follow this convention. # # For Pearson's correlation, use # the function twopcor instead. # # The default number of bootstrap samples is nboot=599 # if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data1<-matrix(sample(length(y1),size=length(y1)*nboot,replace=T),nrow=nboot) bvec1<-apply(data1,1,corbsub,x1,y1,corfun,...) # A 1 by nboot matrix. data2<-matrix(sample(length(y2),size=length(y2)*nboot,replace=T),nrow=nboot) bvec2<-apply(data2,1,corbsub,x2,y2,corfun,...) # A 1 by nboot matrix. bvec<-bvec1-bvec2 bsort<-sort(bvec) term<-alpha/2 ihi<-floor((1-term)*nboot+.5) ilow<-floor(term*nboot+.5) corci<-1 corci[1]<-bsort[ilow] corci[2]<-bsort[ihi] r1<-corfun(x1,y1)$cor r2<-corfun(x2,y2)$cor reject<-"NO" if(corci[1]>0 || corci[2]<0)reject="YES" list(r1=r1,r2=r2,ci.dif=corci,reject=reject) } pcorb<-function(x,y,SEED=T){ # Compute a .95 confidence interval for Pearson's correlation coefficient. # # This function uses an adjusted percentile bootstrap method that # gives good results when the error term is heteroscedastic. # nboot<-599 #Number of bootstrap samples xy<-elimna(cbind(x,y)) x<-xy[,1] y<-xy[,2] if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. #print("Taking bootstrap samples; please wait") data<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,pcorbsub,x,y) # A 1 by nboot matrix. ilow<-15 ihi<-584 if(length(y) < 250){ ilow<-14 ihi<-585 } if(length(y) < 180){ ilow<-11 ihi<-588 } if(length(y) < 80){ ilow<-8 ihi<-592 } if(length(y) < 40){ ilow<-7 ihi<-593 } bsort<-sort(bvec) r<-cor(x,y) ci<-c(bsort[ilow],bsort[ihi]) list(r=r,ci=ci) } twobici<-function(r1=sum(x),n1=length(x),r2=sum(y),n2=length(y),x=NA,y=NA,alpha=.05){ # # Compute confidence interval for p1-p2, # the difference between probabilities of # success for a two binomials using Beal's method. 
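# Example (added illustration; the counts are hypothetical):
#   twobici(r1=15,n1=40,r2=9,n2=35)   # Beal confidence interval for p1-p2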
# # r is number of successes # n is sample size # if x contains data, r1 is taken to be the # number of 1s in x and n1 is length(x) # if(length(r1)>1)stop("r1 must be a single number, not a vector") if(length(n1)>1)stop("n1 must be a single number, not a vector") if(length(r2)>1)stop("r2 must be a single number, not a vector") if(!is.na(sum(r1)) || !is.na(sum(n1)) || !is.na(sum(r2)) || !is.na(sum(n2))){ if(r1<0 || n1<0)stop("Both r1 and n1 must be greater than 0") if(r1 > n1)stop("r1 can't be greater than n1") if(r2<0 || n2<0)stop("Both r2 and n2 must be greater than 0") if(r2 > n2)stop("r2 can't be greater than n2") } if(!is.na(sum(x))){ r1<-sum(x) n1<-length(x) } if(!is.na(sum(y))){ r2<-sum(y) n2<-length(y) } a<-(r1/n1)+(r2/n2) b<-(r1/n1)-(r2/n2) u<-.25*((1/n1)+(1/n2)) v<-.25*((1/n1)-(1/n2)) V<-u*((2-a)*a-b^2)+2*v*(1-a)*b crit<-qchisq(1-alpha/2,1) A<-sqrt(crit*(V+crit*u^2*(2-a)*a+crit*v^2*(1-a)^2)) B<-(b+crit*v*(1-a))/(1+crit*u) ci<-NA ci[1]<-B-A/(1+crit*u) ci[2]<-B+A/(1+crit*u) p1<-r1/n1 p2<-r2/n2 list(ci=ci,p1=p1,p2=p2) } runmean<-function(x,y,fr=1,tr=.2,pyhat=F,eout=F,outfun=out,plotit=T,xout=F, xlab="x",ylab="y"){ # # running mean using interval method # # fr controls amount of smoothing # tr is the amount of trimming # # Missing values are automatically removed. # if(eout && xout)xout<-F temp<-cbind(x,y) temp<-elimna(temp) # Eliminate any rows with missing values if(eout){ flag<-outfun(temp,plotit=F)$keep temp<-temp[flag,] } if(xout){ flag<-outfun(x,plotit=F)$keep temp<-temp[flag,] } x<-temp[,1] y<-temp[,2] pyhat<-as.logical(pyhat) rmd<-c(1:length(x)) for(i in 1:length(x))rmd[i]<-mean(y[near(x,x[i],fr)],tr) if(pyhat)return(rmd) if(plotit){ plot(x,y,xlab=xlab,ylab=ylab) sx<-sort(x) xorder<-order(x) sysm<-rmd[xorder] tempx<-(!duplicated(sx)) lines(sx[tempx], sysm[tempx]) }} pcorbsub<-function(isub, x, y) { # # Compute Pearson's correlation using x[isub] and y[isub] # isub is a vector of length n, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # pcorbsub<-cor(x[isub],y[isub]) pcorbsub } pow1<-function(n,Del,alpha){ # # Determine power of Student's T in the # one-sided, one-sample case where # # n=sample size # Del=(mu0-mu1)/sigma # alpha=Type I error probability # mu0 is hypothesized value # mu1 is some non-null value for the mean. # Del<-abs(Del) if(alpha<=0 || alpha>=1)stop("alpha must be between 0 and 1") K11<-1-alpha K5<-sqrt(n)*Del # Next, use the Kraemer-Paik (1979, Technometrics, 21, 357-360) # approximation of the noncentral T. K6<-n-1 K14<-qt(K11,K6) K7<-K14*sqrt(1+K5*K5/K6) K8<-K5*sqrt(1+K14*K14/K6) K9<-K7-K8 pow1<-1-pt(K9,K6) pow1 } stein1<-function(x,del,alpha=.05,pow=.8,oneside=F){ # # Performs Stein's method on the data in x. # In the event additional observations are required # and can be obtained, use the s-plus function stein2. 
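# Example (added illustration; x and x2 denote hypothetical first- and
# second-stage data):
#   stein1(x,del=.5)   # total sample size needed for power .8
#   stein2(x,x2)       # test based on both stages once the extra data are available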
# del<-abs(del) n<-length(x) df<-n-1 if(!oneside)alpha<-alpha/2 d<-(del/(qt(pow,df)-qt(alpha,df)))^2 N<-max(c(n,floor(var(x)/d)+1)) N } stein2<-function(x1,x2,mu0=0,alpha=.05){ # # Do second stage of Stein's method # x1 contains first stage data # x2 contains first stage data # mu0 is the hypothesized value # n<-length(x1) df<-n-1 N<-n+length(x2) test<-sqrt(N)*(mean(c(x1,x2))-mu0)/sqrt(var(x1)) crit <- qt(1 - alpha/2, df) low<- mean(c(x1,x2))-crit*sqrt(var(x1)) up<- mean(c(x1,x2))+crit*sqrt(var(x1)) sig<-2*(1-pt(test,df)) list(ci = c(low, up), siglevel =sig,mean=mean(c(x1,x2)), teststat = test, crit = crit, df = df) } ci2bin<-function(r1=sum(x),n1=length(x),r2=sum(y),n2=length(y),x=NA,y=NA,alpha=0.05){ # # Compute a confidence interval for the # difference between probability of success # for two independent binomials # # r1=number of successes in group 1 # n1=number of observations in group 1 # cr<-qchisq(1-alpha,1) p1<-r1/n1 p2<-r2/n2 a<-p1+p2 b<-p1-p2 u<-.25*(1/n1+1/n2) v<-.25*(1/n1-1/n2) V<-u*((2-a)*a-b^2)+2*v*(1-a)*b A<-sqrt(cr*(V+cr*u^2*(2-a)*a+cr*v^2*(1-a)^2)) B<-(b+cr*v*(1-a))/(1+cr*u) ci<-NA ci[1]<-B-A/(1+cr*u) ci[2]<-B+A/(1+cr*u) list(ci=ci) } powt1est<-function(x,delta=0,ci=F,nboot=800){ # # Estimate power for a given value of delta # # Only 20% trimming is allowed. # temp1<-powest(x,rep(0,5),delta,se=trimse(x)) if(ci){ set.seed(2) pboot<-NA datay<-rep(0,5) print("Taking bootstrap samples. Please wait.") datax <- matrix(sample(x, size = length(x) * nboot, replace = T ), nrow = nboot) for(i in 1:nboot) { se <- trimse(datax[i, ]) pboot[i] <- powest(x, rep(0,5), delta, se) } temp <- sort(pboot) } ll<-floor(0.05 * nboot + 0.5) list(est.power=temp1,ci=temp[ll]) } powt1an<-function(x,ci=F,plotit=T,nboot=800){ # # Do a power analysis for the one-sample case with 20% trimmed # mean and when the percentile bootstrap is to be used to test # hypoltheses. # x<-x[!is.na(x)] lp<-NA se<-trimse(x) gval<-NA dv<-seq(0,3.5*se,length=15) for(i in 1:length(dv)){ gval[i]<-powest(x,rep(0,5),dv[i],se) } if(!ci){ if(plotit){ plot(dv,gval,type="n",xlab="delta",ylab="power") lines(dv,gval) }} if(ci){ set.seed(2) print("Taking bootstrap samples. Please wait.") datax <- matrix(sample(x, size = length(x) * nboot, replace = T), nrow = nboot) pboot<-matrix(NA,nrow=nboot,ncol=length(dv)) for(i in 1:nboot){ se<-trimse(datax[i,]) for(j in 1:length(dv)){ pboot[i,j]<-powest(x,rep(0,5),dv[j],se) }} ll<-floor(.05*nboot+.5) for(i in 1:15){ temp<-sort(pboot[,i]) lp[i]<-temp[ll] } plot(c(dv,dv),c(gval,lp),type="n",xlab="delta",ylab="power") lines(dv,gval) lines(dv,lp,lty=2) } list(delta=dv,power=gval,lowp=lp) } trimpb2<-function(x,y,tr=.2,alpha=.05,nboot=2000,WIN=F,win=.1,plotit=F,op=4){ # # Compute a 1-alpha confidence interval for # the difference between two 20% trimmed means. # Independent groups are assumed. # # The default number of bootstrap samples is nboot=2000 # # tr is the amount of trimming # # win is the amount of Winsorizing before bootstrapping # when WIN=T. # # Missing values are automatically removed. 
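# Example (added illustration; two hypothetical independent samples):
#   x<-rnorm(25)
#   y<-rnorm(25)+.5
#   trimpb2(x,y)   # percentile bootstrap p-value and CI for the difference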
# x<-x[!is.na(x)] y<-y[!is.na(y)] if(WIN){ if(win>tr)stop("Cannot Winsorize more than you trim") if(tr < .2){print("When Winsorizing, the amount of trimming") print("should be at least .2") } if(min(c(length(x),length(y))) < 15){ print ("Warning: Winsorizing with sample sizes less than 15") print("can result in poor control over the probability of a Type I error") } x<-winval(x,win) y<-winval(y,win) } xx<-list() xx[[1]]<-x xx[[2]]<-y est.dif<-tmean(xx[[1]],tr=tr)-tmean(xx[[2]],tr=tr) crit<-alpha/2 temp<-round(crit*nboot) icl<-temp+1 icu<-nboot-temp bvec<-matrix(NA,nrow=2,ncol=nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. for(j in 1:2){ data<-matrix(sample(xx[[j]],size=length(xx[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,mean,tr) # Bootstrapped trimmed means for jth group } top<-bvec[1,]-bvec[2,] test<-sum(top<0)/nboot+.5*sum(top==0)/nboot if(test > .5)test<-1-test top<-sort(top) ci<-NA ci[1]<-top[icl] ci[2]<-top[icu] if(plotit)g2plot(bvec[1,],bvec[2,],op=op) list(p.value=2*test,ci=ci,est.dif=est.dif) } twolsreg<-function(x1,y1,x2,y2){ # # Compute a .95 confidence interval for # the difference between two regression slopes, # estimated via least squares and # corresponding to two independent groups. # # This function uses an adjusted percentile bootstrap method that # gives good results when the error term is heteroscedastic. # # WARNING: If the number of boostrap samples is altered, it is # unknown how to adjust the confidence interval when n1+n2 < 250. # nboot<-599 #Number of bootstrap samples set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples; please wait") data1<-matrix(sample(length(y1),size=length(y1)*nboot,replace=T),nrow=nboot) bvec1<-apply(data1,1,twolsregsub,x1,y1) # A 1 by nboot matrix. data2<-matrix(sample(length(y2),size=length(y2)*nboot,replace=T),nrow=nboot) bvec2<-apply(data2,1,twolsregsub,x2,y2) # A 1 by nboot matrix. bvec<-bvec1-bvec2 ilow<-15 ihi<-584 if(length(y1)+length(y2) < 250){ ilow<-14 ihi<-585 } if(length(y1)+length(y2) < 180){ ilow<-11 ihi<-588 } if(length(y1)+length(y2) < 80){ ilow<-8 ihi<-592 } if(length(y1)+length(y2) < 40){ ilow<-7 ihi<-593 } bsort<-sort(bvec) b1<-lsfit(x1,y1)$coef[2] b2<-lsfit(x2,y2)$coef[2] ci<-c(bsort[ilow],bsort[ihi]) list(b1=b1,b2=b2,ci=ci) } twolsregsub<-function(isub, x, y) { # # Compute least squares estimate of the # slope using x[isub] and y[isub] # isub is a vector of length n, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # twolsregsub<-lsfit(x[isub],y[isub])$coef[2] twolsregsub } bdanova1<-function(x,alpha=.05,power=.9,delta=NA){ # # Do the first stage of a Bishop-Dudewicz ANOVA method. # That is, based on the data in x # determine N_j, the number of observations needed # in the jth group to achieve power 1-beta. 
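# Example (added illustration; hypothetical first-stage data in list mode):
#   x<-list(rnorm(15),rnorm(15),rnorm(15))
#   bdanova1(x,delta=4)$N   # required sample size for each group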
# # The argument x is assumed to have list mode or the # data is assumed to be stored in an n by J matrix # if(is.na(delta))stop("A value for delta was not specified") if(!is.list(x)){ if(!is.matrix(x))stop("Data must be stored in matrix or in list mode") y<-x x<-list() for(j in 1:ncol(y))x[[j]]<-y[,j] } nvec<-NA svec<-NA J<-length(x) for(j in 1:length(x)){ nvec[j]<-length(x[[j]]) svec[j]<-var(x[[j]]) } nu<-nvec-1 nu1<-sum(1/(nu-2)) nu1<-J/nu1+2 A<-(J-1)*nu1/(nu1-2) B<-(nu1^2/J)*(J-1)/(nu1-2) C<-3*(J-1)/(nu1-4) D<-(J^2-2*J+3)/(nu1-2) E<-B*(C+D) M<-(4*E-2*A^2)/(E-A^2-2*A) L<-A*(M-2)/M f<-qf(1-alpha,L,M) crit<-L*f b<-(nu1-2)*crit/nu1 zz<-qnorm(power) A<-.5*(sqrt(2)*zz+sqrt(2*zz^2+4*(2*b-J+2))) B<-A^2-b d<-((nu1-2)/nu1)*delta/B N<-NA for(j in 1:length(x)){ N[j]<-max(c(nvec[j]+1,floor(svec[j]/d)+1)) } list(N=N,d=d,crit=crit) } comvar2<-function(x,y,nboot=599){ # # Compare the variances of two independent groups. # x<-x[!is.na(x)] # Remove missing values in x y<-y[!is.na(y)] # Remove missing values in y # set seed of random number generator so that # results can be duplicated. sig<-var(x)-var(y) set.seed(2) nmin<-min(length(x),length(y)) print("Taking bootstrap samples. Please wait.") datax<-matrix(sample(x,size=nmin*nboot,replace=T),nrow=nboot) datay<-matrix(sample(y,size=nmin*nboot,replace=T),nrow=nboot) v1<-apply(datax,1,FUN=var) v2<-apply(datay,1,FUN=var) boot<-v1-v2 boot<-sort(boot) ilow <- 15 ihi <- 584 if(nmin < 250) { ilow <- 13 ihi <- 586 } if(nmin < 180) { ilow <- 10 ihi <- 589 } if(nmin < 80) { ilow <- 7 ihi <- 592 } if(nmin < 40) { ilow <- 6 ihi <- 593 } ilow<-round((ilow/599)*nboot) ihi<-round((ihi/599)*nboot) ci<-c(boot[ilow+1],boot[ihi]) list(ci=ci,vardif=sig) } regi<-function(x,y,z,pt=median(z),fr=.8,est=mom,regfun=tsreg,testit=F,...){ # # split the data according to whether z is < or > pt, then # use runmean2g to plot a smooth of the regression # lines corresponding to these two groups. # m<-cbind(x,y,z) m<-elimna(m) x<-m[,1] y<-m[,2] z<-m[,3] flag<-(z=12]) isub[5]<-max(sub[vecn>=12]) isub[3]<-floor((isub[1]+isub[5])/2) isub[2]<-floor((isub[1]+isub[3])/2) isub[4]<-floor((isub[3]+isub[5])/2) mat<-matrix(NA,5,3) dimnames(mat)<-list(NULL,c("X","n1","n2")) for (i in 1:5){ j<-i+5 temp1<-y1[near(x1,x1[isub[i]],fr1)] temp2<-y2[near(x2,x1[isub[i]],fr2)] temp1<-temp1[!is.na(temp1)] temp2<-temp2[!is.na(temp2)] mat[i,1]<-x1[isub[i]] mat[i,2]<-length(temp1) mat[i,3]<-length(temp2) gv1[[i]]<-temp1 gv1[[j]]<-temp2 } I1<-diag(npt) I2<-0-I1 con<-rbind(I1,I2) test<-pbmcp(gv1,alpha=alpha,nboot=nboot,est=est,con=con,...) } # if(!is.na(pts[1])){ npt<-length(pts) n1<-1 n2<-1 vecn<-1 for(i in 1:length(pts)){ n1[i]<-length(y1[near(x1,pts[i],fr1)]) n2[i]<-length(y2[near(x2,pts[i],fr2)]) } mat<-matrix(NA,length(pts),3) dimnames(mat)<-list(NULL,c("X","n1","n2")) gv<-vector("list",2*length(pts)) for (i in 1:length(pts)){ j<-i+npt temp1<-y1[near(x1,pts[i],fr1)] temp2<-y2[near(x2,pts[i],fr2)] temp1<-temp1[!is.na(temp1)] temp2<-temp2[!is.na(temp2)] mat[i,1]<-pts[i] if(length(temp1)<=5)paste("Warning, there are",length(temp1)," points corresponding to the design point X=",pts[i]) if(length(temp2)<=5)paste("Warning, there are",length(temp2)," points corresponding to the design point X=",pts[i]) mat[i,2]<-length(temp1) mat[i,3]<-length(temp2) gv1[[i]]<-temp1 gv1[[j]]<-temp2 } I1<-diag(npt) I2<-0-I1 con<-rbind(I1,I2) test<-pbmcp(gv1,alpha=alpha,nboot=nboot,est=est,con=con,...) } if(plotit) runmean2g(x1,y1,x2,y2,fr=fr1,est=est,...) 
list(mat=mat,output=test) } ancboot<-function(x1,y1,x2,y2,fr1=1,fr2=1,tr=.2,nboot=599,pts=NA,plotit=T){ # # Compare two independent groups using the ancova method # in chapter 9. No assumption is made about the form of the regression # lines--a running interval smoother is used. # Confidence intervals are computed using a percentile t bootstrap # method. Comparisons are made at five empirically chosen design points. # # Assume data are in x1 y1 x2 and y2 # if(is.na(pts[1])){ isub<-c(1:5) # Initialize isub test<-c(1:5) xorder<-order(x1) y1<-y1[xorder] x1<-x1[xorder] xorder<-order(x2) y2<-y2[xorder] x2<-x2[xorder] n1<-1 n2<-1 vecn<-1 for(i in 1:length(x1))n1[i]<-length(y1[near(x1,x1[i],fr1)]) for(i in 1:length(x1))n2[i]<-length(y2[near(x2,x1[i],fr2)]) for(i in 1:length(x1))vecn[i]<-min(n1[i],n2[i]) sub<-c(1:length(x1)) isub[1]<-min(sub[vecn>=12]) isub[5]<-max(sub[vecn>=12]) isub[3]<-floor((isub[1]+isub[5])/2) isub[2]<-floor((isub[1]+isub[3])/2) isub[4]<-floor((isub[3]+isub[5])/2) mat<-matrix(NA,5,8) dimnames(mat)<-list(NULL,c("X","n1","n2","DIF","TEST","ci.low","ci.hi", "p.value")) gv1<-vector("list") for (i in 1:5){ j<-i+5 temp1<-y1[near(x1,x1[isub[i]],fr1)] temp2<-y2[near(x2,x1[isub[i]],fr2)] temp1<-temp1[!is.na(temp1)] temp2<-temp2[!is.na(temp2)] mat[i,2]<-length(temp1) mat[i,3]<-length(temp2) gv1[[i]]<-temp1 gv1[[j]]<-temp2 } I1<-diag(5) I2<-0-I1 con<-rbind(I1,I2) test<-linconb(gv1,con=con,tr=tr,nboot=nboot) for(i in 1:5){ mat[i,1]<-x1[isub[i]] } mat[,4]<-test$psihat[,2] mat[,5]<-test$test[,2] mat[,6]<-test$psihat[,3] mat[,7]<-test$psihat[,4] mat[,8]<-test$test[,4] } if(!is.na(pts[1])){ n1<-1 n2<-1 vecn<-1 for(i in 1:length(pts)){ n1[i]<-length(y1[near(x1,pts[i],fr1)]) n2[i]<-length(y2[near(x2,pts[i],fr2)]) if(n1[i]<=5)paste("Warning, there are",n1[i]," points corresponding to the design point X=",pts[i]) if(n2[i]<=5)paste("Warning, there are",n2[i]," points corresponding to the design point X=",pts[i]) } mat<-matrix(NA,length(pts),9) dimnames(mat)<-list(NULL,c("X","n1","n2","DIF","TEST","se","ci.low","ci.hi", "p.value")) gv<-vector("list",2*length(pts)) for (i in 1:length(pts)){ g1<-y1[near(x1,pts[i],fr1)] g2<-y2[near(x2,pts[i],fr2)] g1<-g1[!is.na(g1)] g2<-g2[!is.na(g2)] j<-i+length(pts) gv[[i]]<-g1 gv[[j]]<-g2 } I1<-diag(length(pts)) I2<-0-I1 con<-rbind(I1,I2) test<-linconb(gv,con=con,tr=tr,nboot=nboot) mat[,1]<-pts mat[,2]<-n1 mat[,3]<-n2 mat[,4]<-test$psihat[,2] mat[,5]<-test$test[,2] mat[,6]<-test$test[,3] mat[,7]<-test$psihat[,3] mat[,8]<-test$psihat[,4] mat[,9]<-test$test[,4] } if(plotit) runmean2g(x1,y1,x2,y2,fr=fr1,est=mean,tr=tr) list(output=mat,crit=test$crit) } spear<-function(x,y=NA){ # Compute Spearman's rho # if(!is.na(y[1]))corv<-cor(rank(x),rank(y)) if(is.na(y[1])){ m<-apply(x,2,rank) corv<-cor(m) } test <-corv * sqrt((length(x) - 2)/(1. - corv^2)) sig <- 2 * (1 - pt(abs(test), length(x) - 2)) if(is.na(y[1]))sig<-matrix(sig,ncol=sqrt(length(sig))) list(cor=corv,siglevel = sig) } linchk<-function(x,y,sp,pv=1,regfun=tsreg,plotit=T,nboot=599,alpha=.05,pr=T){ # # Split the data into two groups according to whether # predictor variable pv has a value less than sp. # Then test the hypothesis that slope coefficients, # based on the regression method regfun, are equal. 
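# Example (added illustration; a single hypothetical predictor split at its median):
#   x<-rnorm(40)
#   y<-x+rnorm(40)
#   linchk(x,y,sp=median(x))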
#
x<-as.matrix(x)
if(pr)print(paste("Splitting data using predictor", pv))
xx<-x[,pv]
flag<-(xx<=sp)
temp<-reg2ci(x[flag,],y[flag],x[!flag,],y[!flag],regfun=regfun,plotit=plotit,nboot=nboot,alpha=alpha)
temp
}
trimci<-function(x,tr=.2,alpha=.05,null.value=0,pr=T){
#
# Compute a 1-alpha confidence interval for the trimmed mean
#
# The default amount of trimming is tr=.2
#
if(pr){
print("The p-value returned by this function is based on the")
print("null value specified by the argument null.value, which defaults to 0")
}
x<-elimna(x)
se<-sqrt(winvar(x,tr))/((1-2*tr)*sqrt(length(x)))
trimci<-vector(mode="numeric",length=2)
df<-length(x)-2*floor(tr*length(x))-1
trimci[1]<-mean(x,tr)-qt(1-alpha/2,df)*se
trimci[2]<-mean(x,tr)+qt(1-alpha/2,df)*se
test<-(mean(x,tr)-null.value)/se
sig<-2*(1-pt(abs(test),df))
list(ci=trimci,test.stat=test,p.value=sig)
}
msmed<-function(x,y=NA,con=0,alpha=.05){
#
# Test a set of linear contrasts using medians
#
# The data are assumed to be stored in $x$ in a matrix or in list mode.
# Length(x) is assumed to correspond to the total number of groups, J
# It is assumed all groups are independent.
#
# con is a J by d matrix containing the contrast coefficients that are used.
# If con is not specified, all pairwise comparisons are made.
#
# Missing values are automatically removed.
#
if(!is.na(y[1])){
xx<-list()
xx[[1]]<-x
xx[[2]]<-y
if(is.matrix(x) || is.list(x))stop("When y is specified, x should not have list mode or be a matrix")
x<-xx
}
if(is.matrix(x))x<-listm(x)
if(!is.list(x))stop("Data must be stored in a matrix or in list mode.")
con<-as.matrix(con)
J<-length(x)
h<-vector("numeric",J)
w<-vector("numeric",J)
xbar<-vector("numeric",J)
for(j in 1:J){
xx<-!is.na(x[[j]])
val<-x[[j]]
if(sum(duplicated(val)>0)){
print(paste("Warning: Group",j,"has tied values. Might want to use medpb"))
}
x[[j]]<-val[xx] # Remove missing values
xbar[j]<-median(x[[j]])
w[j]<-msmedse(x[[j]])^2 # Squared standard error.
} if(sum(con^2!=0))CC<-ncol(con) if(sum(con^2)==0){ CC<-(J^2-J)/2 psihat<-matrix(0,CC,5) dimnames(psihat)<-list(NULL,c("Group","Group","psihat","ci.lower","ci.upper")) test<-matrix(NA,CC,6) dimnames(test)<-list(NULL,c("Group","Group","test","crit","se","p.value")) jcom<-0 for (j in 1:J){ for (k in 1:J){ if (j < k){ jcom<-jcom+1 test[jcom,3]<-abs(xbar[j]-xbar[k])/sqrt(w[j]+w[k]) test[jcom,6]<-2*(1-pt(test[jcom,3],999)) sejk<-sqrt(w[j]+w[k]) test[jcom,5]<-sejk psihat[jcom,1]<-j psihat[jcom,2]<-k test[jcom,1]<-j test[jcom,2]<-k psihat[jcom,3]<-(xbar[j]-xbar[k]) crit<-NA if(CC==1)crit<-qnorm(1-alpha/2) if(CC>1){ if(alpha==.05)crit<-smmcrit(500,CC) if(alpha==.01)crit<-smmcrit01(500,CC) if(is.na(crit))warning("Can only be used with alpha=.05 or .01") } test[jcom,4]<-crit psihat[jcom,4]<-psihat[jcom,3]-crit*test[jcom,5] psihat[jcom,5]<-psihat[jcom,3]+crit*test[jcom,5] }}}} if(sum(con^2)>0){ if(nrow(con)!=length(x))warning("The number of groups does not match the number of contrast coefficients.") psihat<-matrix(0,ncol(con),4) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper")) test<-matrix(0,ncol(con),5) dimnames(test)<-list(NULL,c("con.num","test","crit","se","p.value")) for (d in 1:ncol(con)){ psihat[d,1]<-d psihat[d,2]<-sum(con[,d]*xbar) sejk<-sqrt(sum(con[,d]^2*w)) test[d,1]<-d test[d,2]<-sum(con[,d]*xbar)/sejk test[d,5]<-2*(1-pt(abs(test[d,2]),999)) crit<-NA if(CC==1)crit<-qnorm(1-alpha/2) if(alpha==.05)crit<-smmcrit(500,ncol(con)) if(alpha==.01)crit<-smmcrit01(500,ncol(con)) test[d,3]<-crit test[d,4]<-sejk psihat[d,3]<-psihat[d,2]-crit*sejk psihat[d,4]<-psihat[d,2]+crit*sejk }} list(test=test,psihat=psihat) } msmedse<-function(x){ # # Compute the standard error of the median using method # recommended by McKean and Shrader (1984). # y<-sort(x) n<-length(x) av<-round((n+1)/2-qnorm(0.995)*sqrt(n/4)) if(av==0)av<-1 top<-n-av+1 sqse<-((y[top]-y[av])/(2*qnorm(.995)))^2 sqse<-sqrt(sqse) sqse } selby<-function(m,grpc,coln){ # # # A commmon situation is to have data stored in an n by p matrix where # one or more of the columns are group identification numbers. # This function groups all values in column coln according to the # group numbers in column grpc and stores the results in list mode. # # More than one column of data can sorted # # grpc indicates the column of the matrix containing group id number # if(!is.matrix(m))stop("Data must be stored in a matrix") if(is.na(grpc[1]))stop("The argument grpc is not specified") if(is.na(coln[1]))stop("The argument coln is not specified") if(length(grpc)!=1)stop("The argument grpc must have length 1") x<-vector("list") grpn<-unique(m[,grpc]) it<-0 for (ig in 1:length(grpn)){ for (ic in 1:length(coln)){ it<-it+1 flag<-(m[,grpc]==grpn[ig]) x[[it]]<-m[flag,coln[ic]] }} list(x=x,grpn=grpn) } med2way<-function(J,K,x,grp=c(1:p),alpha=.05,p=J*K){ # # Perform a J by K (two-way) anova on medians where # all jk groups are independent. # # The s-plus variable x is assumed to contain the raw # data stored in list mode. # If grp is unspecified, it is assumed x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second factor: level 1,2 # x[[j+1]] is the data for level 2,1, etc. # If the data are in wrong order, grp can be used to rearrange the # groups. 
For example, for a two by two design, grp<-c(2,4,3,1) # indicates that the second group corresponds to level 1,1; # group 4 corresponds to level 1,2; group 3 is level 2,1; # and group 1 is level 2,2. # # It is assumed that the input variable x has length JK, the total number of # groups being tested. If not, a warning message is printed. # print("Suggestion: Use the function m2way instead, especially with tied values") if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data are not stored in a matrix or in list mode") if(p!=length(x)){ print("Warning: The number of groups in your data is not equal to JK") } xbar<-0 h<-0 d<-0 R<-0 W<-0 d<-0 r<-0 w<-0 nuhat<-0 omegahat<-0 DROW<-0 DCOL<-0 xtil<-matrix(0,J,K) aval<-matrix(0,J,K) for (j in 1:p){ xbar[j]<-median(x[[grp[j]]]) h[j]<-length(x[[grp[j]]]) d[j]<-msmedse(x[[grp[j]]])^2 } d<-matrix(d,J,K,byrow=T) xbar<-matrix(xbar,J,K,byrow=T) h<-matrix(h,J,K,byrow=T) for(j in 1:J){ R[j]<-sum(xbar[j,]) nuhat[j]<-(sum(d[j,]))^2/sum(d[j,]^2/(h[j,]-1)) r[j]<-1/sum(d[j,]) DROW[j]<-sum(1/d[j,]) } for(k in 1:K){ W[k]<-sum(xbar[,k]) omegahat[k]<-(sum(d[,k]))^2/sum(d[,k]^2/(h[,k]-1)) w[k]<-1/sum(d[,k]) DCOL[k]<-sum(1/d[,k]) } D<-1/d for(j in 1:J){ for(k in 1:K){ xtil[j,k]<-sum(D[,k]*xbar[,k]/DCOL[k])+sum(D[j,]*xbar[j,]/DROW[j])- sum(D*xbar/sum(D)) aval[j,k]<-(1-D[j,k]*(1/sum(D[j,])+1/sum(D[,k])-1/sum(D)))^2/(h[j,k]-3) } } Rhat<-sum(r*R)/sum(r) What<-sum(w*W)/sum(w) Ba<-sum((1-r/sum(r))^2/nuhat) Bb<-sum((1-w/sum(w))^2/omegahat) Va<-sum(r*(R-Rhat)^2)/((J-1)*(1+2*(J-2)*Ba/(J^2-1))) Vb<-sum(w*(W-What)^2)/((K-1)*(1+2*(K-2)*Bb/(K^2-1))) sig.A<-1-pf(Va,J-1,9999999) sig.B<-1-pf(Vb,K-1,9999999) # Next, do test for interactions Vab<-sum(D*(xbar-xtil)^2) dfinter<-(J-1)*(K-1) sig.AB<-1-pchisq(Vab,dfinter) list(test.A=Va,p.val.A=sig.A,test.B=Vb,p.val.B=sig.B,test.AB=Vab,p.val.AB=sig.AB) } idealf<-function(x,na.rm=F){ # # Compute the ideal fourths for data in x # if(na.rm)x<-x[!is.na(x)] j<-floor(length(x)/4 + 5/12) y<-sort(x) g<-(length(x)/4)-j+(5/12) ql<-(1-g)*y[j]+g*y[j+1] k<-length(x)-j+1 qu<-(1-g)*y[k]+g*y[k-1] list(ql=ql,qu=qu) } lintests1<-function(vstar,yhat,res,mflag,x,regfun,...){ ystar<-yhat+res*vstar bres<-regfun(x,ystar,...)$residuals rval<-0 for (i in 1:nrow(x)){ rval[i]<-sum(bres[mflag[,i]]) } rval } rdepth<-function(d, x, y, sortx = T) { ########################################################################## # This function computes the regression depth of a line with coordinates d # relative to the bivariate data set (x,y). # The first component of the vector d indicates the intercept of the line, # the second component is the slope. # # Input : d : vector with two components # x,y : vectors of equal length (data set) # sortx : logical, to set to F if the data set (x,y) is # already sorted by its x-coordinates # # Reference: # Rousseeuw, P.J. and Hubert, M. (1996), # Regression Depth, Technical report, University of Antwerp # submitted for publication. 
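# Example (added illustration; hypothetical data): depth of the line with
# intercept 1 and slope 2:
#   x<-rnorm(20)
#   y<-1+2*x+rnorm(20)
#   rdepth(c(1,2),x,y)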
########################################################################## if(!is.vector(x) || !is.vector(y)) stop("x and y should be vectors") n <- length(x) if(n < 2) stop("you need at least two observations") xy <- cbind(x, y) b <- d[1] a <- d[2] if(sortx) xy <- xy[order(xy[, 1], xy[, 2]), ] res <- xy[, 2] - a * xy[, 1] - b res[abs(res) < 9.9999999999999995e-08] <- 0 posres <- res >= 0 negres <- res <= 0 lplus <- cumsum(posres) rplus <- lplus[n] - lplus lmin <- cumsum(negres) rmin <- lmin[n] - lmin depth <- pmin(lplus + rmin, rplus + lmin) min(depth) } permg<-function(x,y,alpha=.05,est=mean,nboot=1000){ # # Do a two-sample permutation test based on means or any # other measure of location or scale indicated by the # argument est. # # The default number of permutations is nboot=1000 # x<-x[!is.na(x)] y<-y[!is.na(y)] xx<-c(x,y) dif<-est(x)-est(y) vec<-c(1:length(xx)) v1<-length(x)+1 difb<-NA temp2<-NA for(i in 1:nboot){ data <- sample(xx, size = length(xx), replace = F) temp1<-est(data[c(1:length(x))]) temp2<-est(data[c(v1:length(xx))]) difb[i]<-temp1-temp2 } difb<-sort(difb) icl<-floor((alpha/2)*nboot+.5) icu<-floor((1-alpha/2)*nboot+.5) reject<-"no" if(dif>=difb[icu] || dif <=difb[icl])reject<-"yes" list(dif=dif,lower=difb[icl],upper=difb[icu],reject=reject) } pb2gen<-function(x,y,alpha=.05,nboot=2000,est=mom,SEED=T,pr=T,...){ # # Compute a bootstrap confidence interval for the # the difference between any two parameters corresponding to # independent groups. # By default, MOM-estimators are compared. # Setting est=mean, for example, will result in a percentile # bootstrap confidence interval for the difference between means. # Setting est=onestep will compare M-estimators of location. # The default number of bootstrap samples is nboot=2000 # x<-x[!is.na(x)] # Remove any missing values in x y<-y[!is.na(y)] # Remove any missing values in y if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. if(pr)print("Taking bootstrap samples. Please wait.") datax<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) datay<-matrix(sample(y,size=length(y)*nboot,replace=T),nrow=nboot) bvecx<-apply(datax,1,est,...) bvecy<-apply(datay,1,est,...) bvec<-sort(bvecx-bvecy) low<-round((alpha/2)*nboot)+1 up<-nboot-low temp<-sum(bvec<0)/nboot+sum(bvec==0)/(2*nboot) sig.level<-2*(min(temp,1-temp)) se<-var(bvec) list(ci=c(bvec[low],bvec[up]),p.value=sig.level,sq.se=se) } tmean<-function(x,tr=.2,na.rm=F){ if(na.rm)x<-x[!is.na(x)] val<-mean(x,tr) val } depth<-function(U,V,m){ # # Compute the halfspace depth of the point (u,v) for the pairs of points # in the n by 2 matrix m. 
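# Example (added illustration; hypothetical bivariate data):
#   m<-cbind(rnorm(50),rnorm(50))
#   depth(0,0,m)   # halfspace depth of the point (0,0) relative to m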
# X<-m[,1] Y<-m[,2] FV<-NA NUMS<-0 NUMH<-0 SDEP<-0.0 HDEP<-0.0 N<-length(X) P<-acos(-1) P2<-P*2.0 EPS<-0.000001 ALPHA<-NA NT<-0 for(i in 1:nrow(m)){ DV<-sqrt(((X[i]-U)*(X[i]-U)+(Y[i]-V)*(Y[i]-V))) if (DV <= EPS){ NT<-NT+1 } else{ XU<-(X[i]-U)/DV YU<-(Y[i]-V)/DV if (abs(XU) > abs(YU)){ if (X[i] >= U){ ALPHA[i-NT]<-asin(YU) if(ALPHA[i-NT] < 0.0) ALPHA[i-NT]<-P2+ALPHA[i-NT] } else{ ALPHA[i-NT]<-P-asin(YU) } } else{ if (Y[i] >= V) ALPHA[i-NT]<-acos(XU) else ALPHA[i-NT]<-P2-acos(XU) } if (ALPHA[i-NT] >= P2-EPS) ALPHA[i-NT]<-0.0 } } NN<-N-NT if(NN<=1){ NUMS<-NUMS+depths1(NT,1)*depths1(NN,2)+depths1(NT,2)*depths1(NN,1)+ depths1(NT,3) if(N >= 3)SDEP<-(NUMS+0.0)/(depths1(N,3)+0.0) NUMH<-NUMH+NT HDEP<-(NUMH+0.0)/(N+0.0) return(HDEP) } ALPHA<-sort(ALPHA[1:NN]) ANGLE<-ALPHA[1]-ALPHA[NN]+P2 for(i in 2:NN){ ANGLE<-max(c(ANGLE,ALPHA[i]-ALPHA[i-1])) } if(ANGLE > (P+EPS)){ NUMS<-NUMS+depths1(NT,1)*depths1(NN,2)+depths1(NT,2)*depths1(NN,1)+ depths1(NT,3) if(N >= 3)SDEP<-(NUMS+0.0)/(depths1(N,3)+0.0) NUMH<-NUMH+NT HDEP<-(NUMH+0.0)/(N+0.0) return(HDEP) } ANGLE<-ALPHA[1] NU<-0 for (i in 1:NN){ ALPHA[i]<-ALPHA[i]-ANGLE if(ALPHA[i]<(P-EPS))NU<-NU+1 } if(NU >= NN){ NUMS<-NUMS+depths1(NT,1)*depths1(NN,2)+depths1(NT,2)*depths1(NN,1)+ depths1(NT,3) if(N >= 3)SDEP<-(NUMS+0.0)/(depths1(N,3)+0.0) NUMH<-NUMH+NT HDEP<-(NUMH+0.0)/(N+0.0) return(HDEP) } # # Mergesort the alpha with their antipodal angles beta, # and at the same time update I, F(I), and NBAD. # JA<-1 JB<-1 ALPHK<-ALPHA[1] BETAK<-ALPHA[NU+1]-P NN2<-NN*2 NBAD<-0 I<-NU NF<-NN for(J in 1:NN2){ ADD<-ALPHK+EPS if (ADD < BETAK){ NF<-NF+1 if(JA < NN){ JA<-JA+1 ALPHK<-ALPHA[JA] } else ALPHK<-P2+1.0 } else{ I<-I+1 NN1<-NN+1 if(I==NN1){ I<-1 NF<-NF-NN } FV[I]<-NF NFI<-NF-I NBAD<-NBAD+depths1(NFI,2) if(JB < NN){ JB<-JB+1 if(JB+NU <= NN) BETAK<-ALPHA[JB+NU]-P else BETAK<-ALPHA[JB+NU-NN]+P } else BETAK<-P2+1.0 } } NUMS<-depths1(NN,3)-NBAD # # Computation of NUMH for halfspace depth. # GI<-0 JA<-1 ANGLE<-ALPHA[1] dif<-NN-FV[1] NUMH<-min(FV[1],dif) for(I in 2:NN){ AEPS<-ANGLE+EPS if(ALPHA[I] <= AEPS){ JA<-JA+1 } else{ GI<-GI+JA JA<-1 ANGLE<-ALPHA[I] } KI<-FV[I]-GI NNKI<-NN-KI NUMH<-min(c(NUMH,min(c(KI,NNKI)))) } NUMS<-NUMS+depths1(NT,1)*depths1(NN,2)+depths1(NT,2)*depths1(NN,1)+ depths1(NT,3) if(N >= 3)SDEP<-(NUMS+0.0)/(depths1(N,3)+0.0) NUMH<-NUMH+NT HDEP<-(NUMH+0.0)/(N+0.0) HDEP } rtdep<-function(pts,m,nsamp=100,SEED=NA){ # # Determine Tukey depth by randomly sampling # p-1 points from m (which has p columns), # combine this with pt, fit a plane, check # the residuals, and repeat many times. # Count how many positive residuals # there are, say pr, how many negative residuals, nr. # The approximate depth is min (pr,nr) over all samples. 
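# Example (added illustration; hypothetical trivariate data):
#   m<-matrix(rnorm(90),ncol=3)
#   rtdep(c(0,0,0),m,nsamp=200)   # a larger nsamp gives a better approximation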
# set.seed(2) if(!is.na(SEED))set.seed(SEED) if(!is.matrix(m))stop("Second argument is not a matrix") if(ncol(m)==2)tdep<-depth(pts[1],pts[2],m) if(ncol(m)>2){ n<-nrow(m) pts<-matrix(pts,ncol=ncol(m)) mold<-m p<-ncol(m) pm1<-p-1 mdup<-matrix(rep(pts,nrow(m)),ncol=ncol(m),byrow=T) dif<-abs(m-mdup) chk<-apply(dif,1,sum) flag<-(chk!=0) m<-m[flag,] m<-as.matrix(m) dmin<-sum(chk==0) m3<-rbind(m,pts) tdep<-nrow(m)+1 for(i in 1:nsamp){ mat<-sample(nrow(m),pm1,T) #if(p==2)x<-c(m[mat,2:p],pts[,2:p]) if(p>2)x<-rbind(m[mat,2:p],pts[,2:p]) y<-c(m[mat,1],pts[1]) if(prod(eigen(var(x))$values) >10^{-8}){ #print(prod(eigen(var(x))$values)) temp<-qr(x) #print(temp) #print(ncol(x)) if(temp$rank[1]==ncol(x)){ temp<-lsfit(x,y)$coef m2<-cbind(rep(1,nrow(m3)),m3[,2:p]) res<-m3[,1]-temp%*%t(m2) p1<-sum((res>0)) p2<-sum((res<0)) tdep<-min(c(tdep,p1,p2)) if(tdep EPS) { NSIN <- NSIN + 1 foundSingular <- T if (PRINT) paste( "ERROR: No Eigenvalue = 0 for sample", NRAN) next } # ------------------------------------------ # Need to test for singularity # ------------------------------------------ if (Eval[NP-1] <= EPS) { NSIN <- NSIN + 1 } # ------------------------------------------ # Projecting all pints on line through # theta with direction given by the eigen # vector of the smallest eigenvalue, i.e., # the direction orthogonal on the hyperplane # given by the NP-subset. # Compute the one-dimensional halfspace depth # of theta on this line. # ------------------------------------------ # in Splus the smallest eigenvalue is the # last one and corresponding vector is the # last one, hence Eval[NP] is the smallest # and Evec[,NP] is the corresponding vector # ------------------------------------------ eigenVec <- Evec[,NP] NT <- sum( ifelse( eigenVec <= EPS, 1, 0 ) ) KT <- sum( ifelse( eigenVec > EPS, PNT * eigenVec, 0 ) ) if (NT == NP) { NSIN <- NSIN + 1 foundSingular <- T if (PRINT) paste( " ERROR: Eigenvector = 0 for sample", NRAN ) if (foundSingular) next # Do next Sample } K <- X %*% eigenVec K <- K - KT NUMH <- sum( ifelse( K > EPS, 1, 0 ) ) NT <- sum( ifelse( abs(K) <= EPS, 1, 0 ) ) # ------------------------------------------- # If all projections collapse with theta, # return to reduce the dimension # ------------------------------------------- if (NT == N) { NSIN <- -1 return( list( NDEP=NDEP, NSIN=NSIN, EVEC=Evec ) ) # Will need #Eigen Vector matrix to reduce dimension } # ------------------------------------------- # Update halfspace depth # ------------------------------------------- NDEP <- min( NDEP, min( NUMH+NT,N-NUMH ) ) } return( list( NDEP=NDEP, NSIN=NSIN, EVEC=Evec ) ) } #================================================ Reduce <- function( X, PNT, Evec ) { Det <- det(Evec) if (Det==0) { return( list( X=X, PNT=PNT, DET=Det ) ) } NP <- ncol(X) # --------------------------------------- # Compute (NP-1)-dimentional coordinates # for all points and theta # --------------------------------------- RedEvec <- matrix(Evec[,1:(NP-1)],nrow=NP,ncol=(NP-1)) # Reducing # dimension by removing the last dimension with 0 variance. PNT <- PNT %*% RedEvec X <- X %*% RedEvec if (!is.matrix(X)) X <- matrix(X,ncol=(NP-1)) return( list( X=X, PNT=PNT, DET=Det ) ) } # # PROGRAM BEGINS # if (!is.na(SEED)) set.seed( SEED ) # --------------------------------------- # Initialize Number of singular samples # --------------------------------------- Nsin <- 0 X <- as.matrix( X ) N <- nrow( X ) NP <- ncol( X ) if (length(PNT) != NP){print("Length of 'PNT' has to equal to") stop("number of columns in X !!! 
" ) } # --------------------------------------- # Handle special case where N=1 # --------------------------------------- if (N==1) { NDEP <- ifelse( abs(X[1,]-PNT) > EPS, 0, 1 ) # if any dimension # different from point PNT, NDEP=0, else = 1 NDEP <- min( NDEP ) DEPTH <- NDEP/ N return( DEPTH ) } # --------------------------------------- # Handle special case where NP=1 # --------------------------------------- repeat #+++++++++++++++++++++++++++++++++ { # In this case depth is equal to number of points <= to T if (NP==1) { MORE <- sum( ifelse( X[,1] >= (PNT-EPS), 1, 0 ) ) LESS <- sum( ifelse( X[,1] <= (PNT+EPS), 1, 0 ) ) NDEP <- min( LESS, MORE ) DEPTH <- NDEP / N return( DEPTH ) } # --------------------------------------- # General Case, call function DEP # --------------------------------------- if (N > NP) { RES <- DEP( X=X, PNT=PNT, NDIR=NDIR, EPS=EPS, PRINT=PRINT ) NDEP <- RES$NDEP NSIN <- RES$NSIN EVEC <- RES$EVEC } else { NSIN <- -1 # Needs to reduce dimensions EVEC <- eigen( var( X ) )[[2]] # Getting eigenvector } # --------------------------------------- # If all points and theta are identified # as lying on the same hyperplane, reduce # the dimension of the data set by projection # on that hyperplane, and compute the depth # on the reduced data set # --------------------------------------- if (NSIN == -1) { NSIN <- 0 if (PRINT) print( " Direction with zero variance detected" ) RED <- Reduce( X=X, PNT=PNT, Evec=EVEC ) X <- RED$X PNT <- RED$PNT Det <- RED$DET if (Det==0) { print("\n\n\t DIMENSION REDUCTION TERMINATED\n\t EIGENVECTORS ARE NOT") stop("INDEPENDENT\n\n" ) } NP <- ncol(X) if (PRINT) paste(" Dimension reduced to", NP ) } else { break # No need to reduce dimension of X and hence no need to #return, breaks 'repeat' loop } } # End repeat+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ DEPTH <- NDEP / N return( DEPTH ) } depths1<-function(m,j){ if(m < j)depths1<-0 else{ if(j==1)depths1<-m if(j==2)depths1<-(m*(m-1))/2 if(j==3)depths1<-(m*(m-1)*(m-2))/6 } depths1 } outbox<-function(x,mbox=F,gval=NA,plotit=F){ # # This function detects outliers using the # boxplot rule, but unlike the S-PLUS function boxplot, # the ideal fourths are used to estimate the quartiles. # # Setting mbox=T results in using the modification # of the boxplot rule suggested by Carling (2000). # x<-x[!is.na(x)] # Remove missing values if(plotit)boxplot(x) n<-length(x) temp<-idealf(x) if(mbox){ if(is.na(gval))gval<-(17.63*n-23.64)/(7.74*n-3.71) cl<-median(x)-gval*(temp$qu-temp$ql) cu<-median(x)+gval*(temp$qu-temp$ql) } if(!mbox){ if(is.na(gval))gval<-1.5 cl<-temp$ql-gval*(temp$qu-temp$ql) cu<-temp$qu+gval*(temp$qu-temp$ql) } flag<-NA outid<-NA vec<-c(1:n) for(i in 1:n){ flag[i]<-(x[i]< cl || x[i]> cu) } if(sum(flag)==0)outid<-NA if(sum(flag)>0)outid<-vec[flag] keep<-vec[!flag] outval<-x[flag] list(out.val=outval,out.id=outid,keep=keep,cl=cl,cu=cu) } mscov<-function(m){ # # m is an n by p matrix # # Compute a skipped covariance matrix # # Eliminate outliers using a projection method # That is, compute Donoho-Gasko median, for each point # consider the line between it and the median, # project all points onto this line, and # check for outliers using a boxplot rule. # Repeat this for all points. A point is declared # an outlier if for any projection it is an outlier # using a modification of the usual boxplot rule. # # Eliminate any outliers and compute covariances # using remaining data. 
# m<-elimna(m) temp<-outpro(m,plotit=F)$keep mcor<-var(m[temp,]) list(cov=mcor) } runm3d<-function(x,y,theta=50,phi=25,fr=.8,tr=.2,plotit=T,pyhat=F,nmin=0, expand=.5,scale=F,zscale=F,xout=F,outfun=out,eout=F,xlab="X",ylab="Y",zlab="", pr=T){ # # running mean using interval method # # fr controls amount of smoothing # tr is the amount of trimming # x is an n by p matrix of predictors. # # Rows of data with missing values are automatically removed. # # When plotting, theta and phi can be used to change # the angle at which the plot is viewed. # # theta is the azimuthal direction and phi the colatitude # expand controls relative length of z-axis # library(MASS) library(akima) if(plotit){ if(pr){ print("Note: when there is independence, scale=F is probably best") print("When there is dependence, scale=T is probably best") }} if(!is.matrix(x))stop("x should be a matrix") if(nrow(x) != length(y))stop("number of rows of x should equal length of y") temp<-cbind(x,y) p<-ncol(x) p1<-p+1 temp<-elimna(temp) # Eliminate any rows with missing values. if(xout){ keepit<-rep(T,nrow(x)) flag<-outfun(x,plotit=F)$out.id keepit[flag]<-F x<-x[keepit,] y<-y[keepit] } if(zscale){ for(j in 1:p1){ temp[,j]<-(temp[,j]-median(temp[,j]))/mad(temp[,j]) }} x<-temp[,1:p] y<-temp[,p1] pyhat<-as.logical(pyhat) plotit<-as.logical(plotit) set.seed(12) m<-cov.mve(x) iout<-c(1:nrow(x)) rmd<-1 # Initialize rmd nval<-1 for(i in 1:nrow(x))rmd[i]<-mean(y[near3d(x,x[i,],fr,m)],tr) for(i in 1:nrow(x))nval[i]<-length(y[near3d(x,x[i,],fr,m)]) if(plotit){ if(ncol(x)!=2)stop("When plotting, x must be an n by 2 matrix") fitr<-rmd[nval>nmin] y<-y[nval>nmin] x<-x[nval>nmin,] iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] # Eliminate duplicate points in the x-y plane # This is necessary when doing three dimensional plots # with the S-PLUS function interp mkeep<-x[iout>=1,] fit<-interp(mkeep[,1],mkeep[,2],fitr) persp(fit,theta=theta,phi=phi,xlab=xlab,ylab=ylab,zlab=zlab,expand=expand, scale=scale) } last<-"Done" if(pyhat)last<-rmd last } skerd<-function(x,op=T){ # # Compute kernel density estimate # for univariate data using R function density # if(op)temp<-density(x,na.rm=T) if(!op)temp<-density(x,na.rm=T,width="SJ-dpi",n=256) plot(temp$x,temp$y,type="n",ylab="",xlab="x") lines(temp$x,temp$y) } rdplot<-function(x,fr=NA,plotit=T,theta=50,phi=25,expand=.5,pyhat=F,pts=NA, xlab="X",ylab=""){ # # Expected frequency curve # # fr controls amount of smoothing # theta is the azimuthal direction and phi the colatitude # plotit<-as.logical(plotit) x<-elimna(x) x<-as.matrix(x) rmd<-NA if(ncol(x)==1){ if(is.na(fr))fr<-.8 if(is.na(pts[1]))pts<-x for(i in 1:length(pts)){ rmd[i]<-sum(near(x,pts[i],fr)) } if(mad(x)!=0)rmd<-rmd/(2*fr*mad(x)) rmd<-rmd/length(x) if(plotit){ plot(pts,rmd,type="n",ylab=ylab,xlab=xlab) sx<-sort(pts) xorder<-order(pts) sysm<-rmd[xorder] lines(sx,sysm) }} if(ncol(x)>1){ library(MASS) if(is.na(fr))fr<-.6 m<-cov.mve(x) for(i in 1:nrow(x)){ rmd[i]<-sum(near3d(x,x[i,],fr,m)) } rmd<-rmd/nrow(x) if(plotit && ncol(x)==2){ fitr<-rmd iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] mkeep<-x[iout>=1,] fit<-interp(mkeep[,1],mkeep[,2],fitr) persp(fit,theta=theta,phi=phi,expand=expand,xlab="Var 1",ylab="Var 2",zlab="") } } if(pyhat)last<-rmd if(!pyhat)last<-"Done" last } cid<-function(x,y,alpha=.05,plotit=F,pop=0,fr=.8,rval=15){ # # 
Compute a confidence interval for delta using the method in # Cliff, 1996, p. 140, eq 5.12 # # The null hypothesis is that for two independent group, P(XY). # This function reports a 1-alpha confidence interval for # P(X>Y)-P(X0)/length(msave) c.sum<-matrix(c(qxly,q0,qxgy),nrow=1,ncol=3) dimnames(c.sum)<-list(NULL,c("P(XY)")) sigdih<-sum((m-d)^2)/(length(x)*length(y)-1) di<-NA for (i in 1:length(x))di[i]<-sum(x[i]>y)/length(y)-sum(x[i]x)/length(x)-sum(y[i]2500){ print("Product of sample sizes exceeds 2500.") print("Execution time might be high when using pop=0 or 1") print("If this is case, might consider changing the argument pop") }} if(pop==0)akerd(as.vector(msave)) if(pop==1)rdplot(as.vector(msave),fr=fr) if(pop==2)kdplot(as.vector(msave),rval=rval) if(pop==3)boxplot(as.vector(msave)) if(pop==4)stem(as.vector(msave)) if(pop==5)hist(as.vector(msave),xlab="X") if(pop==6)skerd(as.vector(msave)) } list(cl=cl,cu=cu,d=d,sqse.d=sh,phat=phat,summary.dvals=c.sum) } rimul<-function(J,K,x,alpha=.05,p=J*K,grp=c(1:p),plotit=T,op=4){ # # Rank-based multiple comparisons for all interactions # in J by K design. The method is based on an # extension of Cliff's heteroscedastic technique for # handling tied values and the Patel-Hoel definition of no interaction. # # The familywise type I error probability is controlled by using # a critical value from the Studentized maximum modulus distribution. # # It is assumed all groups are independent. # # Missing values are automatically removed. # # The default value for alpha is .05. Any other value results in using # alpha=.01. # # Argument grp can be used to rearrange the order of the data. # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in a matrix or in list mode.") CCJ<-(J^2-J)/2 CCK<-(K^2-K)/2 CC<-CCJ*CCK test<-matrix(NA,CC,7) test.p<-matrix(NA,CC,7) for(j in 1:J){ xx<-!is.na(x[[j]]) val<-x[[j]] x[[j]]<-val[xx] # Remove missing values } mat<-matrix(grp,ncol=K,byrow=T) dimnames(test)<-list(NULL,c("Factor A","Factor A","Factor B","Factor B","delta","ci.lower","ci.upper")) jcom<-0 crit<-smmcrit(200,CC) if(alpha!=.05)crit<-smmcrit01(200,CC) alpha<-1-pnorm(crit) for (j in 1:J){ for (jj in 1:J){ if (j < jj){ for (k in 1:K){ for (kk in 1:K){ if (k < kk){ jcom<-jcom+1 test[jcom,1]<-j test[jcom,2]<-jj test[jcom,3]<-k test[jcom,4]<-kk temp1<-cid(x[[mat[j,k]]],x[[mat[j,kk]]],plotit=F) temp2<-cid(x[[mat[jj,k]]],x[[mat[jj,kk]]],plotit=F) delta<-temp2$d-temp1$d sqse<-temp1$sqse.d+temp2$sqse.d test[jcom,5]<-delta/2 test[jcom,6]<-delta/2-crit*sqrt(sqse/4) test[jcom,7]<-delta/2+crit*sqrt(sqse/4) }}}}}} if(J==2 && K==2){ if(plotit){ m1<-outer(x[[1]],x[[2]],FUN="-") m2<-outer(x[[3]],x[[4]],FUN="-") m1<-as.vector(m1) m2<-as.vector(m2) g2plot(m1,m2,op=op) }} list(test=test) } ifmest<-function(x,bend=1.28,op=2){ # # Estimate the influence function of an M-estimator, using # Huber's Psi, evaluated at x. 
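# A usage sketch (hypothetical data); mest(), hpsi(), akerd() and kerden(),
# called below, are assumed to be available from elsewhere in this file.
#   set.seed(1)
#   x <- rnorm(40)
#   infl <- ifmest(x, bend = 1.28)   # estimated influence function value at each x[i]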
# # Data are in the vector x, bend is the percentage bend # # op=2, use adaptive kernel estimator # otherwise use Rosenblatt's shifted histogram # tt<-mest(x,bend) # Store M-estimate in tt s<-mad(x)*qnorm(.75) if(op==2){ val<-akerd(x,pts=tt,plotit=F,pyhat=T) val1<-akerd(x,pts=tt-s,plotit=F,pyhat=T) val2<-akerd(x,pts=tt+s,plotit=F,pyhat=T) } if(op!=2){ val<-kerden(x,0,tt) val1<-kerden(x,0,tt-s) val2<-kerden(x,0,tt+s) } ifmad<-sign(abs(x-tt)-s)-(val2-val1)*sign(x-tt)/val ifmad<-ifmad/(2*.6745*(val2+val1)) y<-(x-tt)/mad(x) n<-length(x) b<-sum(y[abs(y)<=bend])/n a<-hpsi(y)*mad(x)-ifmad*b ifmest<-a/(length(y[abs(y)<=bend])/n) ifmest } qmjci<-function(x,q=.5,alpha=.05,op=1){ # # Compute a 1-alpha confidence for qth quantile using the # Maritz-Jarrett estimate of the standard error. # # The default quantile is .5. # The default value for alpha is .05. # if(q <= 0 || q>=1)stop("q must be between 0 and 1") y<-sort(x) m<-floor(q*length(x)+.5) crit<-qnorm(1-alpha/2) qmjci<-vector(mode="numeric",2) se<-NA if(op==1)se<-mjse(x) if(op==2){ if(q!=.5)stop("op=2 works only with q=.5") se<-msmedse(x) } if(op==3)se<-qse(x,q) if(is.na(se))stop("Something is wrong, op should be 1, 2 or 3") qmjci[1]<-y[m]-crit*se qmjci[2]<-y[m]+crit*se qmjci } ydbt<-function(x,y,tr=.2,alpha=.05,nboot=599,side=F,plotit=F,op=1){ # # Using the bootstrap-t method, # compute a .95 confidence interval for the difference between # the marginal trimmed means of paired data. # By default, 20% trimming is used with B=599 bootstrap samples. # # side=F returns equal-tailed ci # side=T returns symmetric ci. # side<-as.logical(side) if(length(x)!=length(y))stop("Must have equal sample sizes.") m<-cbind(x,y) m<-elimna(m) x<-m[,1] y<-m[,2] if(sum(c(!is.na(x),!is.na(y)))!=(length(x)+length(y)))stop("Missing values are not allowed.") set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) xcen<-x-mean(x,tr) ycen<-y-mean(y,tr) bvec<-apply(data,1,tsub,xcen,ycen,tr) # bvec is a 1 by nboot matrix containing the bootstrap test statistics. estse<-yuend(x,y)$se dif<-mean(x,tr)-mean(y,tr) if(!side){ ilow<-round((alpha/2)*nboot) ihi<-nboot-ilow bsort<-sort(bvec) ci<-0 ci[1]<-dif-bsort[ihi]*estse ci[2]<-dif-bsort[ilow+1]*estse } if(side){ bsort<-sort(abs(bvec)) ic<-round((1-alpha)*nboot) ci<-0 ci[1]<-dif-bsort[ic]*estse ci[2]<-dif+bsort[ic]*estse } if(plotit){ if(op==1)akerd(bsort) if(op==2)rdplot(bsort) if(op==3)boxplot(bsort) } list(ci=ci,dif=dif) } bootdpci<-function(x,y,est=onestep,nboot=NA,alpha=.05,plotit=T,dif=T,BA=F,...){ # # Use percentile bootstrap method, # compute a .95 confidence interval for the difference between # a measure of location or scale # when comparing two dependent groups. # By default, a one-step M-estimator (with Huber's psi) is used. # If, for example, it is desired to use a fully iterated # M-estimator, use fun=mest when calling this function. # output<-rmmcppb(x,y,est=est,nboot=nboot,alpha=alpha, plotit=plotit,dif=dif,BA=BA,...)$output list(output=output) } relfun<-function(xv,yv,C=36,epsilon=.0001,plotit=T){ # Compute the measures of location, scale and correlation used in the # bivariate boxplot of Goldberg and Iglewicz, # Technometrics, 1992, 34, 307-320. # # The code in relplot plots the boxplot. 
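# A usage sketch (hypothetical data); biloc() and elimna(), used below, are
# assumed to be available from elsewhere in this file.
#   set.seed(1)
#   xv <- rnorm(60)
#   yv <- xv + rnorm(60)
#   relfun(xv, yv, plotit = F)   # location, scale and correlation measures for the bivariate boxplot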
# # This code assumes the data are in xv and yv # # This code uses the function biloc, stored in the file biloc.b7 and # bivar stored in bivar.b7 # plotit<-as.logical(plotit) # # Do pairwise elimination of missing values # temp<-matrix(c(xv,yv),ncol=2) temp<-elimna(temp) xv<-temp[,1] yv<-temp[,2] tx<-biloc(xv) ty<-biloc(yv) sx<-sqrt(bivar(xv)) sy<-sqrt(bivar(yv)) z1<-(xv-tx)/sx+(yv-ty)/sy z2<-(xv-tx)/sx-(yv-ty)/sy ee<-((z1-biloc(z1))/sqrt(bivar(z1)))^2+ ((z2-biloc(z2))/sqrt(bivar(z2)))^2 w<-(1-ee/C)^2 if(length(w[w==0])>=length(xv)/2)warning("More than half of the w values equal zero") sumw<-sum(w[ee1, a standard percentile bootstrap method is used # with FWE (the probability of at least one type I error) # controlled via the Bonferroni inequality. # # The predictor values are assumed to be in the n by p matrix x. # The default number of bootstrap samples is nboot=599 # # seed=T causes the seed of the random number generator to be set to 2, # otherwise the seed is not set. # # Warning: probability coverage has been studied only when alpha=.05 # x<-as.matrix(x) p<-ncol(x) pp<-p+1 temp<-elimna(cbind(x,y)) # Remove any missing values. x<-temp[,1:p] y<-temp[,p+1] if(xout){ m<-cbind(x,y) flag<-outfun(x,plotit=F)$keep m<-m[flag,] x<-m[,1:p] y<-m[,pp] } x<-as.matrix(x) if(seed)set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples; please wait") data<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,regboot,x,y,lsfit) # A p+1 by n matrix. The first row # contains the bootstrap intercepts, the second row # contains the bootstrap values for first predictor, etc. if(p==1){ if(alpha != .05){print("Resetting alpha to .05") print("With p=1, unknown how to adjust confidence interval") print("when alpha is not equal to .05.") } ilow<-15 ihi<-584 if(length(y) < 250){ ilow<-13 ihi<-586 } if(length(y) < 180){ ilow<-10 ihi<-589 } if(length(y) < 80){ ilow<-7 ihi<-592 } if(length(y) < 40){ ilow<-6 ihi<-593 } ilow<-round((ilow/599)*nboot) ihi<-round((ihi/599)*nboot) } if(p>1){ ilow<-round(alpha*nboot/2)+1 ihi<-nboot-ilow } lsfitci<-matrix(0,ncol(x),2) for(i in 1:ncol(x)){ ip<-i+1 bsort<-sort(bvec[ip,]) lsfitci[i,1]<-bsort[ilow+1] lsfitci[i,2]<-bsort[ihi] } bsort<-sort(bvec[1,]) interceptci<-c(bsort[15],bsort[584]) crit.level<-NA pmat<-NA if(p>1){ crit.level<-alpha/p pmat<-matrix(NA,nrow=p,ncol=2) dimnames(pmat) <- list(NULL, c("Slope","p-value")) for(pv in 1:p){ pmat[pv,1]<-pv pp<-pv+1 #pmat[pv,2]<-sum(bvec[pp,]<0)/nboot pmat[pv,2]<-(sum(bvec[pp,]<0)+.5*sum(bvec[pp,]==0))/nboot temp3<-1-pmat[pv,2] pmat[pv,2]<-2*min(pmat[pv,2],temp3) }} list(intercept.ci=interceptci,slope.ci=lsfitci,crit.level=crit.level, p.values=pmat) } wmve<-function(m){ # # Compute skipped measure of location and scatter # using MVE method # if(is.matrix(m))n<-nrow(m) if(is.vector(m))n<-length(m) flag<-rep(T,n) vec<-out(m,plotit=F)$out.id flag[vec]<-F if(is.vector(m)){ center<-mean(m[flag]) scatter<-var(m[flag]) } if(is.matrix(m)){ center<-apply(m[flag,],2,mean) scatter<-var(m[flag,]) } list(center=center,cov=scatter) } wmw<-function(x,y){ # # Do Mann-Whitney test # Return the usual p-value followed by adjusted # p-value using Hodges, Ramsey and Wechsler (1990) method # (See Wilcox, 2003, p. 559.) 
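# A usage sketch (hypothetical data):
#   set.seed(1)
#   x <- rnorm(20)
#   y <- rnorm(25) + .5
#   wmw(x, y)   # $p.value is the usual normal-approximation p-value;
#               # $sigad is the Hodges-Ramsey-Wechsler adjusted p-value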
# m<-length(x) n<-length(y) com<-rank(c(x,y)) xp1<-length(x)+1 x<-com[1:length(x)] y<-com[xp1:length(com)] u<-sum(y)-n*(n+1)/2 sigsq<-m*n*(n+m+1)/12 yv<-(u+.5-m*n/2)/sqrt(sigsq) kv<-20*m*n*(m+n+1)/(m^2+n^2+n*m+m+n) S<-yv^2 T1<-S-3 T2<-(155*S^2-416*S-195)/42 cv<-1+T1/kv+T2/kv^2 sighrw<-2*(1-pnorm(abs(cv*yv))) z<-(u-(.5*m*n))/sqrt(sigsq) sig<-2*(1-pnorm(abs(z))) list(p.value=sig,sigad=sighrw) } lsfitNci<-function(x,y,alpha=.05){ # # Compute confidence for least squares # regression using heteroscedastic method # recommended by Long and Ervin (2000). # x<-as.matrix(x) if(nrow(x) != length(y))stop("Length of y does not match number of x values") m<-cbind(x,y) m<-elimna(m) y<-m[,ncol(x)+1] temp<-lsfit(x,y) x<-cbind(rep(1,nrow(x)),m[,1:ncol(x)]) xtx<-solve(t(x)%*%x) h<-diag(x%*%xtx%*%t(x)) hc3<-xtx%*%t(x)%*%diag(temp$res^2/(1-h)^2)%*%x%*%xtx df<-nrow(x)-ncol(x) crit<-qt(1-alpha/2,df) al<-ncol(x) ci<-matrix(NA,nrow=al,ncol=3) for(j in 1:al){ ci[j,1]<-j ci[j,2]<-temp$coef[j]-crit*sqrt(hc3[j,j]) ci[j,3]<-temp$coef[j]+crit*sqrt(hc3[j,j]) } print("Confidence intervals for intercept followd by slopes:") list(ci=ci,stand.errors=sqrt(diag(hc3))) } pow2an<-function(x,y,ci=F,plotit=T,nboot=800){ # # Do a power analysis when comparing the 20% trimmed # means of two independent groups with the percentile # bootstrap method. # # x<-x[!is.na(x)] y<-y[!is.na(y)] lp<-NA se<-yuen(x,y)$se gval<-NA dv<-seq(0,3.5*se,length=15) for(i in 1:length(dv)){ gval[i]<-powest(x,y,dv[i],se) } if(!ci){ if(plotit){ plot(dv,gval,type="n",xlab="delta",ylab="power") lines(dv,gval) }} if(ci){ print("Taking bootstrap samples. Please wait.") datax <- matrix(sample(x, size = length(x) * nboot, replace = T), nrow = nboot) datay <- matrix(sample(y, size = length(y) * nboot, replace = T), nrow = nboot) pboot<-matrix(NA,ncol=15,nrow=nboot) for(i in 1:nboot){ se<-yuen(datax[i,],datay[i,])$se for(j in 1:length(dv)){ pboot[i,j]<-powest(x,y,dv[j],se) }} ll<-floor(.05*nboot+.5) for(i in 1:15){ temp<-sort(pboot[,i]) lp[i]<-temp[ll] } plot(c(dv,dv),c(gval,lp),type="n",xlab="delta",ylab="power") lines(dv,gval) lines(dv,lp,lty=2) } list(delta=dv,power=gval,lowp=lp) } powest<-function(x=NA,y=NA,delta=0,se=NA,wv1=NA,wv2=NA,n1=NA,n2=NA){ # # wv1 = Winsorized variance for group 1 # wv2 = Winsorized variance for group 2 # # Only 20% trimming is allowed. 
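# A usage sketch: approximate power for a shift delta between 20% trimmed means,
# supplying Winsorized variances and sample sizes directly (hypothetical values).
# Alternatively, pass the raw data as the first two arguments.
#   powest(delta = .5, wv1 = 1.2, wv2 = .9, n1 = 30, n2 = 30)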
# tr<-.2 if(is.na(se)){ if(is.na(wv1)){ h1 <- length(x) - 2 * floor(tr * length(x)) h2 <- length(y) - 2 * floor(tr * length(y)) q1 <- ((length(x) - 1) * winvar(x, tr))/(h1 * (h1 - 1)) q2 <- ((length(y) - 1) * winvar(y, tr))/(h2 * (h2 - 1)) } if(!is.na(wv1)){ if(is.na(n1))stop("Need to specify sample size for group 1") if(is.na(n2))stop("Need to specify sample size for group 2") h1<-n1-2*floor(tr*n1) h2<-n2-2*floor(tr*n2) q1<-(n1-1)*wv1/(h1*(h1-1)) q2<-(n2-1)*wv2/(h2*(h2-1)) } se<-sqrt(q1+q2) } ygam<-sqrt(2*.01155)*c(0:35)/8 pow<-c(500.0,540.0,607.0, 706.0, 804.0,981.0,1176.0,1402.0,1681.0, 2008.0, 2353.0, 2769.0, 3191.0, 3646.0, 4124.0, 4617.0, 5101.0, 5630.0, 6117.0, 6602.0, 7058.0, 7459.0, 7812.0, 8150.0, 8479.0, 8743.0, 8984.0, 9168.0, 9332.0, 9490.0, 9607.0, 9700.0, 9782.0, 9839.0, 9868.0)/10000 flag<-(delta==0 && se==0) if(flag)powest<-.05 else{ chk<-floor(8*delta/se)+1 chk1<-chk+1 gval<-delta/se d1<-(gval-(chk-1)/8)*8 if(chk > length(pow))powest<-1 if(chk == length(pow))pow[chk1]<-1 if(chk <= length(pow)) powest<-pow[chk]+d1*(pow[chk1]-pow[chk]) } powest } twopcor<-function(x1,y1,x2,y2,SEED=T){ # # Compute a .95 confidence interval for # the difference between two Pearson # correlations corresponding to two independent # goups. # # This function uses an adjusted percentile bootstrap method that # gives good results when the error term is heteroscedastic. # # WARNING: If the number of boostrap samples is altered, it is # unknown how to adjust the confidence interval when n1+n2 < 250. # nboot<-599 #Number of bootstrap samples if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. X<-elimna(cbind(x1,y1)) x1<-X[,1] y1<-X[,2] X<-elimna(cbind(x2,y2)) x2<-X[,1] y2<-X[,2] print("Taking bootstrap samples; please wait") data1<-matrix(sample(length(y1),size=length(y1)*nboot,replace=T),nrow=nboot) bvec1<-apply(data1,1,pcorbsub,x1,y1) # A 1 by nboot matrix. data2<-matrix(sample(length(y2),size=length(y2)*nboot,replace=T),nrow=nboot) bvec2<-apply(data2,1,pcorbsub,x2,y2) # A 1 by nboot matrix. bvec<-bvec1-bvec2 ilow<-15 ihi<-584 if(length(y1)+length(y2) < 250){ ilow<-14 ihi<-585 } if(length(y1)+length(y2) < 180){ ilow<-11 ihi<-588 } if(length(y1)+length(y2) < 80){ ilow<-8 ihi<-592 } if(length(y1)+length(y2) < 40){ ilow<-7 ihi<-593 } bsort<-sort(bvec) r1<-cor(x1,y1) r2<-cor(x2,y2) ci<-c(bsort[ilow],bsort[ihi]) list(r1=r1,r2=r2,ci=ci) } indtall<-function(x,y=NA,tr=0,nboot=500,SEED=T){ # # Test the hypothesis of independence for # 1. all pairs of variables in matrix x, if y=NA, or # 2. between each variable stored in the matrix x and y. # This is done by repeated to calls to indt # x<-as.matrix(x) # First, eliminate any rows of data with missing values. if(!is.na(y[1])){ temp <- cbind(x, y) temp <- elimna(temp) pval<-ncol(temp)-1 x <- temp[,1:pval] y <- temp[, pval+1] } x<-as.matrix(x) if(is.na(y[1])){ ntest<-(ncol(x)^2-ncol(x))/2 if(ntest==0)stop("Something is wrong. Does x have only one column?") output<-matrix(NA,nrow=ntest,ncol=4) dimnames(output)<-list(NULL,c("VAR","VAR","Test Stat.","p-value")) x<-elimna(x) ic<-0 for (j in 1:ncol(x)){ for (jj in 1:ncol(x)){ if(jyhat)/length(x) zhat<-NA if(!is.na(z[1])){ # # Make decisions for the data in z, # set zhat=1 if decide it came from # group 1. 
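# A usage sketch for twopcor(), defined above (hypothetical data); pcorbsub()
# and elimna() are assumed to be defined elsewhere in this file.
#   set.seed(1)
#   x1 <- rnorm(50); y1 <- x1 + rnorm(50)
#   x2 <- rnorm(50); y2 <- rnorm(50)
#   twopcor(x1, y1, x2, y2)   # r1, r2 and a .95 CI for the difference rho1 - rho2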
# zxhat<-0 zyhat<-0 zhat<-0 if(op==2){ zxhat<-akerd(x,pts=z,pyhat=T,plotit=F) zyhat<-akerd(y,pts=z,pyhat=T,plotit=F) } for(i in 1:length(z)){ if(op==1){ zxhat[i]<-kerden(x,0,z[i]) zyhat[i]<-kerden(y,0,z[i]) } zhat[i]<-1 if(is.na(zxhat[i]) || is.na(zyhat[i])){ # Missing values, # data can't be used to make a decision, # so make a random decision about whether a value # came from first group. arb<-runif(1) zhat[i]<-1 if(arb < .5)zhat[i]<-0 } else if(zxhat[i]=2){ library(akima) if(ncol(x)==2 && !scale){ if(pr){ print("scale=F is specified.") print("If there is dependence, might use scale=T") }} m<-elimna(cbind(x,y)) x<-m[,1:d] y<-m[,d+1] if(eout && xout)stop("Can't have both eout and xout = F") if(eout){ flag<-outfun(m,plotit=F)$keep m<-m[flag,] } if(xout){ flag<-outfun(x,plotit=F)$keep m<-m[flag,] } x<-m[,1:d] y<-m[,d+1] if(d==2)fitr<-fitted(loess(y~x[,1]*x[,2],span=span,family=family)) if(d==3)fitr<-fitted(loess(y~x[,1]*x[,2]*x[,3],span=span,family=family)) if(d==4)fitr<-fitted(loess(y~x[,1]*x[,2]*x[,3]*x[,4],span=span,family=family)) if(d>4)stop("Can have at most four predictors") last<-fitr if(d==2 && plotit){ iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] # Eliminate duplicate points in the x-y plane # This is necessary when doing three dimensional plots # with the S-PLUS function interp mkeep<-x[iout>=1,] fitr<-interp(mkeep[,1],mkeep[,2],fitr,duplicate=duplicate) persp(fitr,theta=theta,phi=phi,xlab=xlab,ylab=ylab,zlab=zlab,expand=expand, scale=scale) }} if(d==1){ m<-elimna(cbind(x,y)) x<-m[,1:d] y<-m[,d+1] if(eout && xout)stop("Can't have both eout and xout = F") if(eout){ flag<-outfun(m)$keep m<-m[flag,] } if(xout){ flag<-outfun(x)$keep m<-m[flag,] } x<-m[,1:d] y<-m[,d+1] if(plotit){ plot(x,y,xlab=xlab,ylab=ylab) lines(lowess(x,y,f=low.span)) } tempxy<-lowess(x,y,f=low.span) #yyy<-lowess(x,y,f=low.span)$y yyy<-tempxy$y #xxx<-lowess(x,y,f=low.span)$x xxx<-tempxy$x last<-yyy chkit<-sum(duplicated(x)) if(chkit>0){ last<-rep(1,length(y)) for(j in 1:length(yyy)){ for(i in 1:length(y)){ if(x[i]==xxx[j])last[i]<-yyy[j] }} } } E.power<-1 if(!cor.op)E.power<-varfun(last[!is.na(last)])/varfun(y) if(cor.op || E.power>=1){ if(d==1){ xord<-order(x) E.power<-cor.fun(last,y[xord])$cor^2 } if(d>1)E.power<-cor.fun(last,y)$cor^2 } if(!pyhat)last <- NULL list(Strength.Assoc=sqrt(E.power),Explanatory.power=E.power,yhat.values=last) } qci<-function(x,q=.5,alpha=.05,op=3){ # # Compute a confidence interval for qth quantile # using an estimate of standard error based on # adaptive kernel density estimator. # The qth quantile is estimated with a single order statistic. # # For argument op, see the function qse. # n<-length(x) xsort<-sort(x) iq <- floor(q * n + 0.5) qest<-xsort[iq] se<-qse(x,q,op=op) crit<-qnorm(1-alpha/2) ci.low<-qest-crit*se ci.up<-qest+crit*se list(ci.low=ci.low,ci.up=ci.up,q.est=qest) } qint<-function(x,q=.5,alpha=.05,print=T){ # # Compute a 1-alpha confidence interval for the qth quantile # The function returns the exact probability coverage. 
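# A usage sketch (hypothetical data): an exact-coverage interval for the median
# based on two order statistics, with the coverage obtained from the binomial
# distribution via pbinom().
#   set.seed(1)
#   x <- rexp(25)
#   qint(x, q = .5, alpha = .05)   # $ci.coverage is the exact probability coverage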
# n<-length(x) ii<-floor(q*n+.5) jj<-ii+1 if(ii<=0)stop("Cannot compute a confidence interval for this q") if(jj>n)stop("Cannot compute a confidence interval for this q") jjm<-jj-1 iim<-ii-1 cicov<-pbinom(jjm,n,q)-pbinom(iim,n,q) while(cicov<1-alpha){ iim<-max(iim-1,0) jjm<-min(jjm+1,n) if(iim==0 && jjm==n)break cicov<-pbinom(jjm,n,q)-pbinom(iim,n,q) } xsort<-sort(x) low<-xsort[iim+1] hi<-xsort[jjm] if(cicov<1-alpha){ if(print)print("Warning: Desired probability coverage could not be achieved") } list(ci.low=low,ci.up=hi,ci.coverage=cicov) } anova1<-function(x){ # # conventional one-way anova # if(is.matrix(x))x<-listm(x) A<-0 B<-0 C<-0 N<-0 for(j in 1:length(x)){ N<-N+length(x[[j]]) A<-A+sum(x[[j]]^2) B<-B+sum(x[[j]]) C<-C+(sum(x[[j]]))^2/length(x[[j]]) } SST<-A-B^2/N SSBG<-C-B^2/N SSWG<-A-C nu1<-length(x)-1 nu2<-N-length(x) MSBG<-SSBG/nu1 MSWG<-SSWG/nu2 FVAL<-MSBG/MSWG pvalue<-1-pf(FVAL,nu1,nu2) list(F.test=FVAL,p.value=pvalue,df1=nu1,df2=nu2,MSBG=MSBG,MSWG=MSWG) } qest<-function(x,q=.5){ # # Compute an estimate of qth quantile # using a single order statistic # x<-elimna(x) if(q<=0 || q>=1)stop("q must be > 0 and < 1") n<-length(x) xsort<-sort(x) iq <- floor(q * n + 0.5) qest<-NA if(iq>0 || iq<=n)qest<-xsort[iq] qest } taureg<-function(m,y,corfun=tau){ # # Compute Kendall's tau between y and each of the # p variables stored in the n by p matrix m. # # Alternative measures of correlation can be used via the # argument corfun. The only requirement is that the function # corfun returns the correlation in corfun$cor and the p-value # in corfun$siglevel. # # This function also returns the two-sided significance level # for all pairs of variables, plus a test of zero correlations # among all pairs. (See chapter 9 of Wilcox, 2005, for details.) # m<-as.matrix(m) tauvec<-NA siglevel<-NA for (i in 1:ncol(m)){ pbc<-corfun(m[,i],y) tauvec[i]<-pbc$cor siglevel[i]<-pbc$siglevel } list(cor=tauvec,siglevel=siglevel) } correg.sub<-function(X,theta,corfun=tau){ np<-ncol(X) p<-np-1 x<-X[,1:p] y<-X[,np] temp<-t(t(x)*theta) yhat<-apply(temp,1,sum) yhat<-yhat res<-y-yhat val<-sum(abs(taureg(x,res,corfun=corfun)$cor)) val } correg<-function(x,y,corfun=tau,loc.fun=median){ # # A generalization of the Theil-Sen estimator # Rather than use Kendall's tau, can use an alternative # correlation via the argument corfun. # loc.fun determines how the intercept is computed; # # The Nelder-Mead method is used rather than # Gauss-Seidel. # # X<-cbind(x,y) X<-elimna(X) np<-ncol(X) N<-np-1 temp<-tsreg(x,y)$coef START<-temp[2:np] temp<-nelderv2(X,N,FN=correg.sub,START=START,corfun=corfun) x <- as.matrix(x) alpha <- loc.fun(y - x %*% temp) coef <- c(alpha,temp) res <- y - x %*% temp - alpha list(coef = coef, residuals = res) } rmulnorm<-function(n,p,cmat,SEED=F){ # # Generate data from a multivariate normal # n= sample size # p= number of variables # cmat is the covariance (or correlation) matrix # # Method (e.g. Browne, M. W. (1968) A comparison of factor analytic # techniques. Psychometrika, 33, 267-334. # Let U'U=R be the Cholesky decomposition of R. Generate independent data # from some dist yielding X. 
Then XU has population correlation matrix R # if(SEED)set.seed(2) y<-matrix(rnorm(n*p),ncol=p) rval<-matsqrt(cmat) y<-t(rval%*%t(y)) y } matsqrt <- function(x) { xev1<-NA xe <- eigen(x) xe1 <- xe$values if(all(xe1 >= 0)) { xev1 <- diag(sqrt(xe1)) } if(is.na(xev1[1]))stop("The matrix has negative eigenvalues") xval1 <- cbind(xe$vectors) xval1i <- solve(xval1) y <- xval1 %*% xev1 %*% xval1i y } ghmul<-function(n,g=0,h=0,p=2,cmat=diag(rep(1,p)),SEED=F){ # # generate n observations from a p-variate dist # based on the g and h dist. # # cmat is the correlation matrix # x<-rmulnorm(n,p,cmat,SEED=SEED) for(j in 1:p){ if (g>0){ x[,j]<-(exp(g*x[,j])-1)*exp(h*x[,j]^2/2)/g } if(g==0)x[,j]<-x[,j]*exp(h*x[,j]^2/2) } x } yhbt<-function(x,y,tr=.2,alpha=.05,nboot=600,SEED=T){ # # Compute a 1-alpha confidence interval for the difference between # the trimmed means corresponding to two independent groups. # The bootstrap-t method with Hall's transformation is used. # if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. x<-x[!is.na(x)] # Remove missing values in x y<-y[!is.na(y)] # Remove missing values in y xcen<-x-mean(x,tr) ycen<-y-mean(y,tr) print("Taking bootstrap samples. Please wait.") datax<-matrix(sample(xcen,size=length(x)*nboot,replace=T),nrow=nboot) datay<-matrix(sample(ycen,size=length(y)*nboot,replace=T),nrow=nboot) val<-NA for(ib in 1:nboot)val[ib]<-yhall(datax[ib,],datay[ib,],tr=tr)$test.stat temp<-yhall(x,y,tr=tr) sigtil<-temp$sig.tilda nhat<-temp$nu.tilda val<-sort(val) ilow<-round(alpha*nboot) il<-ilow+1 uval<-nboot-ilow b.low<-3*(1+nhat*val[il]-nhat/6)^{1/3}/nhat-3/nhat b.hi<-3*(1+nhat*val[uval]-nhat/6)^{1/3}/nhat-3/nhat dif<-mean(x,tr=tr)-mean(y,tr=tr) ci.low<-dif-sigtil*b.hi ci.up<-dif-sigtil*b.low list(ci.low=ci.low,ci.up=ci.up) } yhall<-function(x,y,tr=.2,alpha=.05){ # # Perform Yuen's test for trimmed means on the data in x and y # in conjunction with Hall's transformation. # The default amount of trimming is 20% # Missing values (values stored as NA) are automatically removed. # # A confidence interval for the trimmed mean of x minus the # the trimmed mean of y is computed and returned in yuen$ci. # x<-x[!is.na(x)] # Remove any missing values in x y<-y[!is.na(y)] # Remove any missing values in y winx<-winval(x,tr=tr) winy<-winval(y,tr=tr) m3x<-sum((winx-mean(winx))^3)/length(x) m3y<-sum((winy-mean(winy))^3)/length(y) h1<-length(x)-2*floor(tr*length(x)) h2<-length(y)-2*floor(tr*length(y)) mwx<-length(x)*m3x/h1 mwy<-length(y)*m3y/h2 q1<-(length(x)-1)*winvar(x,tr)/(h1*(h1-1)) q2<-(length(y)-1)*winvar(y,tr)/(h2*(h2-1)) sigtil<-q1+q2 mtil<-(mwx/h1^2)-(mwy/h2^2) dif<-mean(x,tr)-mean(y,tr) thall<-dif+mtil/(6*sigtil)+mtil*dif^2/(3*sigtil^2)+mtil^2*dif^3/(27*sigtil^3) thall<-thall/sqrt(sigtil) nhat<-mtil/sigtil^1.5 list(test.stat=thall,nu.tilda=nhat,sig.tilda=sqrt(sigtil)) } linconm<-function(x,con=0,est=onestep,alpha=.05,nboot=500,pr=T,...){ # # Compute a 1-alpha confidence interval for a set of d linear contrasts # involving M-estimators using a bootstrap method. (See Chapter 6.) # Independent groups are assumed. # # The data are assumed to be stored in x in list mode. Thus, # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J, say. # # con is a J by d matrix containing the contrast coefficents of interest. # If unspecified, all pairwise comparisons are performed. 
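# A minimal sketch of how such a matrix might be built (hypothetical J = 6
# design; the two columns correspond to the contrasts described next):
#   con <- matrix(0, nrow = 6, ncol = 2)
#   con[, 1] <- c(1, 1, -1, -1, 0, 0)
#   con[, 2] <- c(1, -1, 0, 0, 1, -1)
#   linconm(x, con = con)   # x in list mode, one element per group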
# For example, con[,1]=c(1,1,-1,-1,0,0) and con[,2]=c(,1,-1,0,0,1,-1) # will test two contrasts: (1) the sum of the first two measures of location is # equal to the sum of the second two, and (2) the difference between # the first two is equal to the difference between the measure of location for # groups 5 and 6. # # The default number of bootstrap samples is nboot=399 # # This function uses the function trimpartt written for this # book. # # # # if(pr){ print("Note: confidence intervals are adjusted to control FWE") print("But p-values are not adjusted to control FWE") } if(is.matrix(x))x<-listm(x) con<-as.matrix(con) if(!is.list(x))stop("Data must be stored in list mode.") J<-length(x) Jm<-J-1 d<-(J^2-J)/2 if(sum(con^2)==0){ con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} if(nrow(con)!=length(x))stop("The number of groups does not match the number of contrast coefficients.") m1<-matrix(0,J,nboot) m2<-1 # Initialize m2 mval<-1 set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") for(j in 1:J){ if(pr)print(paste("Working on group ",j)) mval[j]<-est(x[[j]],...) xcen<-x[[j]]-est(x[[j]],...) data<-matrix(sample(xcen,size=length(x[[j]])*nboot,replace=T),nrow=nboot) m1[j,]<-apply(data,1,est,...) # A J by nboot matrix. m2[j]<-var(m1[j,]) } boot<-matrix(0,ncol(con),nboot) bot<-1 for (d in 1:ncol(con)){ top<-apply(m1,2,trimpartt,con[,d]) # A vector of length nboot containing psi hat values consq<-con[,d]^2 bot[d]<-trimpartt(m2,consq) boot[d,]<-abs(top)/sqrt(bot[d]) } testb<-apply(boot,2,max) ic<-floor((1-alpha)*nboot) testb<-sort(testb) psihat<-matrix(0,ncol(con),6) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper","se","p.value")) for (d in 1:ncol(con)){ psihat[d,1]<-d psihat[d,2]<-trimpartt(mval,con[,d]) psihat[d,3]<-psihat[d,2]-testb[ic]*sqrt(bot[d]) psihat[d,4]<-psihat[d,2]+testb[ic]*sqrt(bot[d]) psihat[d,5]<-sqrt(bot[d]) pval<-mean((boot[d,]1)fval<-akerdmul(xx,pts=pts,hval=hval,aval=aval,fr=fr,pr=pyhat, plotit=plotit,theta=theta,phi=phi,expand=expand,scale=scale) plotit<-F } if(is.matrix(xx) && ncol(xx)==1)xx<-xx[,1] if(!is.matrix(xx)){ x<-sort(xx) if(op==1){ m<-mad(x) if(m==0){ temp<-idealf(x) m<-(temp$qu-temp$ql)/(qnorm(.75)-qnorm(.25)) } if(m==0)m<-sqrt(winvar(x)/.4129) if(m==0)stop("All measures of dispersion are equal to 0") fhat <- rdplot(x,pyhat=T,plotit=F,fr=fr) if(m>0)fhat<-fhat/(2*fr*m) } if(op==2){ init<-density(xx) fhat <- init$y x<-init$x } n<-length(x) if(is.na(hval)){ sig<-sqrt(var(x)) temp<-idealf(x) iqr<-(temp$qu-temp$ql)/1.34 A<-min(c(sig,iqr)) if(A==0)A<-sqrt(winvar(x))/.64 hval<-1.06*A/length(x)^(.2) # See Silverman, 1986, pp. 
47-48 } gm<-exp(mean(log(fhat[fhat>0]))) alam<-(fhat/gm)^(0-aval) dhat<-NA if(is.na(pts[1]))pts<-x pts<-sort(pts) for(j in 1:length(pts)){ temp<-(pts[j]-x)/(hval*alam) epan<-ifelse(abs(temp)yq) B<-mean(flag1*flag2) flag1<-(x>xq) flag2<-(y<=yq) C1<-mean(flag1*flag2) flag1<-(x>xq) flag2<-(y>yq) D1<-mean(flag1*flag2) fx<-akerd(x,pts=xq,plotit=F,pyhat=T) fy<-akerd(y,pts=yq,plotit=F,pyhat=T) v1<-(q-1)^2*A v2<-(q-1)*q*B v3<-(q-1)*q*C1 v4<-q*q*D1 temp<-0-2*(v1+v2+v3+v4)/(fx*fy)+q*(1-q)/fx^2+q*(1-q)/fy^2 val<-sqrt(temp/n) val } akerdmul<-function(x,pts=NA,hval=NA,aval=.5,fr=.8,pr=F,plotit=T,theta=50, phi=25,expand=.5,scale=F,xlab="X",ylab="Y",zlab=""){ # # Compute adaptive kernel density estimate # for multivariate data # (See Silverman, 1986) # # Use expected frequency as initial estimate of the density # # hval is the span used by the kernel density estimator # fr is the span used by the expected frequency curve # pr=T, returns density estimates at pts # library(MASS) library(akima) if(is.na(pts[1]))pts<-x if(ncol(x)!=ncol(pts))stop("Number of columns for x and pts do not match") if(!is.matrix(x))stop("Data should be stored in a matrix") fhat <- rdplot(x,pyhat=T,plotit=F,fr=fr) n<-nrow(x) d<-ncol(x) pi<-gamma(.5)^2 cd<-c(2,pi) if(d==2)A<-1.77 if(d==3)A<-2.78 if(d>2){ for(j in 3:d)cd[j]<-2*pi*cd[j-2]/n # p. 76 } if(d>3)A<-(8*d*(d+2)*(d+4)*(2*sqrt(pi))^d)/((2*d+1)*cd[d]) # p. 87 if(is.na(hval))hval<-A*(1/n)^(1/(d+4)) # Silverman, p. 86 svec<-NA for(j in 1:d){ sig<-sqrt(var(x[,j])) temp<-idealf(x[,j]) iqr<-(temp$qu-temp$ql)/1.34 A<-min(c(sig,iqr)) x[,j]<-x[,j]/A svec[j]<-A } hval<-hval*sqrt(mean(svec^2)) # Silverman, p. 87 # Now do adaptive; see Silverman, 1986, p. 101 gm<-exp(mean(log(fhat[fhat>0]))) alam<-(fhat/gm)^(0-aval) dhat<-NA nn<-nrow(pts) for(j in 1:nn){ #temp1<-t(t(x)-x[j,])/(hval*alam) temp1<-t(t(x)-pts[j,])/(hval*alam) temp1<-temp1^2 temp1<-apply(temp1,1,FUN="sum") temp<-.5*(d+2)*(1-temp1)/cd[d] epan<-ifelse(temp1<1,temp,0) # Epanechnikov kernel, p. 76 dhat[j]<-mean(epan/(alam*hval)^d) } if(plotit && d==2){ fitr<-dhat iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] mkeep<-x[iout>=1,] fit<-interp(mkeep[,1],mkeep[,2],fitr) persp(fit,theta=theta,phi=phi,xlab=xlab,ylab=ylab,zlab=zlab,expand=expand, scale=scale) } m<-"Done" if(pr)m<-dhat m } cov2med<-function(x,y=NA,q=.5){ # # Estimate the covariance between two dependent # order statistics # By default, q=.5 meaning that an estimate of # of covariance is made when a single order statistic # is used to estimate the median. # y=NA, function returns squared standard error. # if(is.na(y[1]))val<-qse(x,q=q,op=3)^2 if(!is.na(y[1])){ if(sum((x-y)^2)==0)val<-qse(x,q=q,op=3)^2 if(sum((x-y)^2)>0){ n<-length(x) m<-floor(q*n+.5) yord<-sort(y) flag<-(y<=yord[m]) xord<-sort(x) xq<-xord[m] yord<-sort(y) yq<-yord[m] flag1<-(x<=xq) flag2<-(y<=yq) A<-mean(flag1*flag2) flag1<-(x<=xq) flag2<-(y>yq) B<-mean(flag1*flag2) flag1<-(x>xq) flag2<-(y<=yq) C1<-mean(flag1*flag2) flag1<-(x>xq) flag2<-(y>yq) D1<-mean(flag1*flag2) fx<-akerd(x,pts=xq,plotit=F,pyhat=T) fy<-akerd(y,pts=yq,plotit=F,pyhat=T) v1<-(q-1)^2*A v2<-(q-1)*q*B v3<-(q-1)*q*C1 v4<-q*q*D1 val<-((v1+v2+v3+v4)/(fx*fy))/n }} val } covmmed<-function(x,p=length(x),grp=c(1:p),q=.5){ # # Estimate the covariance matrix for the sample medians # based on a SINGLE order statistic, using # the data in the S-PLUS variable x. # (x[[1]] contains the data for group 1, x[[2]] the data for group 2, etc.) 
# The function returns a p by p matrix of covariances, the diagonal # elements being equal to the squared standard error of the sample # trimmed means, where p is the number of groups to be included. # By default, all the groups in x are used, but a subset of # the groups can be used via grp. For example, if # the goal is to estimate the covariances between the medians # for groups 1, 2, and 5, use the command grp<-c(1,2,5) # before calling this function. # # Missing values (values stored as NA) are not allowed. # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("The data are not stored in a matrix or list mode.") p<-length(grp) pm1<-p-1 for (i in 1:pm1){ ip<-i+1 if(length(x[[grp[ip]]])!=length(x[[grp[i]]]))stop("The number of observations in each group must be equal") } n<-length(x[[grp[1]]]) covest<-matrix(0,p,p) for(j in 1:p){ for(k in 1:p){ if(j==k)covest[j,j]<-cov2med(x[[grp[j]]],q=q) if(j=20 # if(!is.na(y[1]))x<-cbind(x,y) if(!is.matrix(x))stop("Something is wrong, with x or y") x<-elimna(x) y<-x[,2] x<-x[,1] n<-length(y) df<-n-1 if(is.na(se.val[1])){ if(!bop)se.val<-sedm(x,y,q=q) if(bop)se.val<-bootdse(x,y,est=qest,q=q,pr=F,nboot=nboot) } test<-(qest(x,q)-qest(y,q))/se.val sig.level<-2*(1-pt(abs(test),df)) list(test.stat=test,p.value=sig.level,se=se.val) } lincdm<-function(x,con=0,alpha=.05,q=.5,mop=F,nboot=100,SEED=T){ # # A heteroscedastic test of d linear contrasts among # dependent groups using medians. # # The data are assumed to be stored in $x$ in list mode. # Length(x) is assumed to correspond to the total number of groups, J # It is assumed all groups are independent. # # con is a J by d matrix containing the contrast coefficients that are used. # If con is not specified, all pairwise comparisons are made. # # q is the quantile used to compare groups. # con contains contrast coefficients, # con=0 means all pairwise comparisons are used # mop=F, use single order statistic # mop=T, use usual sample median, even if q is not equal to .5 # in conjunction with a bootstrap estimate of covariances among # the medians using # nboot samples. # # Missing values are automatically removed. 
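# A usage sketch (hypothetical data); listm() and the other helper functions
# called internally are assumed to be defined elsewhere in this file.
#   set.seed(1)
#   x <- matrix(rnorm(90), ncol = 3)                  # three dependent groups, n = 30
#   con <- matrix(c(1, -1, 0, 0, 1, -1), ncol = 2)    # two contrasts among the medians
#   lincdm(x, con = con)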
# # if(mop && SEED)set.seed(2) if(is.list(x)){ x<-matl(x) x<-elimna(x) } if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in a matrix or in list mode.") con<-as.matrix(con) J<-length(x) h<-length(x[[1]]) w<-vector("numeric",J) xbar<-vector("numeric",J) for(j in 1:J){ if(!mop)xbar[j]<-qest(x[[j]],q=q) if(mop)xbar[j]<-median(x[[j]]) } if(sum(con^2)==0){ temp<-qdmcp(x,alpha=alpha,q=q,pr=F) test<-temp$test psihat<-temp$psihat num.sig<-temp$num.sig } if(sum(con^2)>0){ ncon<-ncol(con) if(alpha==.05){ dvec<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01)dvec<-alpha/c(1:ncon) if(nrow(con)!=length(x)){ stop("The number of groups does not match the number of contrast coefficients.") } psihat<-matrix(0,ncol(con),4) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper")) test<-matrix(0,ncol(con),5) dimnames(test)<-list(NULL,c("con.num","test","p.value","crit.p.value","se")) df<-length(x[[1]])-1 if(!mop)w<-covmmed(x,q=q) if(mop)w<-bootcov(x,nboot=nboot,pr=F) for (d in 1:ncol(con)){ psihat[d,1]<-d psihat[d,2]<-sum(con[,d]*xbar) cvec<-as.matrix(con[,d]) sejk<-sqrt(t(cvec)%*%w%*%cvec) test[d,1]<-d test[d,2]<-sum(con[,d]*xbar)/sejk test[d,3]<-2*(1-pt(abs(test[d,2]),df)) test[d,5]<-sejk } temp1<-test[,3] temp2<-order(0-temp1) zvec<-dvec[1:ncon] test[temp2,4]<-zvec psihat[,3]<-psihat[,2]-qt(1-test[,4]/2,df)*test[,5] psihat[,4]<-psihat[,2]+qt(1-test[,4]/2,df)*test[,5] num.sig<-sum(test[,3]<=test[,4]) } list(test=test,psihat=psihat,num.sig=num.sig) } mwwmcp<-function(J,K,x,grp=c(1:p),p=J*K,q=.5,bop=F,alpha=.05,nboot=100, SEED=T){ # # For a J by K anova using quantiles with # repeated measures on both factors, # Perform all multiple comparisons for main effects # and interactions. # # q=.5 by default meaning medians are compared # bop=F means bootstrap option not used; # with bop=T, function uses usual medians rather # rather than a single order statistic to estimate median # in conjunction with a bootstrap estimate of covariances # among sample medians. # # The s-plus variable data is assumed to contain the raw # data stored in a matrix or in list mode. # When in list mode data[[1]] contains the data # for the first level of both factors: level 1,1. # data[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # data[[K]] is the data for level 1,K # data[[K+1]] is the data for level 2,1, data[2K] is level 2,K, etc. 
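# For instance, in a J = 2 by K = 3 design the six groups are assumed to be
# ordered as data[[1]] = level (1,1), data[[2]] = (1,2), data[[3]] = (1,3),
# data[[4]] = (2,1), data[[5]] = (2,2), data[[6]] = (2,3), and the call would
# be of the form mwwmcp(2, 3, x), with x a matrix having six columns or a list
# of length six.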
# # It is assumed that data has length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # Qa<-NA Qab<-NA if(is.list(x))x<-elimna(matl(x)) if(is.matrix(x))x<-elimna(x) data<-x if(is.matrix(data))data<-listm(data) if(!is.list(data))stop("Data are not stored in list mode or a matrix") if(p!=length(data)){ print("The total number of groups, based on the specified levels, is") print(p) print("The number of groups stored in x is") print(length(data)) print("Warning: These two values are not equal") } if(p!=length(grp))stop("Apparently a subset of the groups was specified that does not match the total number of groups indicated by the values for J and K.") tmeans<-0 # Create the three contrast matrices # Ja<-(J^2-J)/2 Ka<-(K^2-K)/2 JK<-J*K conA<-matrix(0,nrow=JK,ncol=Ja) ic<-0 for(j in 1:J){ for(jj in 1:J){ if(j < jj){ ic<-ic+1 mat<-matrix(0,nrow=J,ncol=K) mat[j,]<-1 mat[jj,]<-0-1 conA[,ic]<-t(mat) }}} conB<-matrix(0,nrow=JK,ncol=Ka) ic<-0 for(k in 1:K){ for(kk in 1:K){ if(k0){ ncon<-ncol(con) if(alpha==.05){ dvec<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01)dvec<-alpha/c(1:ncon) if(nrow(con)!=length(x)){ stop("The number of groups does not match the number of contrast coefficients.") } psihat<-matrix(0,ncol(con),4) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper")) test<-matrix(0,ncol(con),5) dimnames(test)<-list(NULL,c("con.num","test","p.value","crit.p.value","se")) df<-length(x[[1]])-1 w<-covmtrim(x,tr=tr) for (d in 1:ncol(con)){ psihat[d,1]<-d psihat[d,2]<-sum(con[,d]*xbar) cvec<-as.matrix(con[,d]) sejk<-sqrt(t(cvec)%*%w%*%cvec) test[d,1]<-d test[d,2]<-sum(con[,d]*xbar)/sejk test[d,3]<-2*(1-pt(abs(test[d,2]),df)) test[d,5]<-sejk } temp1<-test[,3] temp2<-order(0-temp1) zvec<-dvec[1:ncon] test[temp2,4]<-zvec psihat[,3]<-psihat[,2]-qt(1-test[,4]/2,df)*test[,5] psihat[,4]<-psihat[,2]+qt(1-test[,4]/2,df)*test[,5] num.sig<-sum(test[,3]<=test[,4]) } list(test=test,psihat=psihat,num.sig=num.sig) } sintv2<-function(x,alpha=.05,nullval=0){ # # Compute a 1-alpha confidence interval for the median using # the Hettmansperger-Sheather interpolation method. # (See section 4.5.2.) # # The default value for alpha is .05. 
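# A usage sketch (hypothetical data); sint(), called below, is assumed to be
# defined elsewhere in this file.
#   set.seed(1)
#   x <- rnorm(30, mean = .3)
#   sintv2(x, nullval = 0)   # CI for the median plus a p-value for H0: median = 0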
# ci<-sint(x,alpha=alpha) alph<-c(1:99)/100 for(i in 1:99){ irem<-i chkit<-sint(x,alpha=alph[i]) if(chkit[1]>nullval || chkit[2]nullval || chkit[2]nullval || chkit[2] 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01)dvec<-alpha/c(1:ncon) psihat<-matrix(0,CC,5) dimnames(psihat)<-list(NULL,c("Group","Group","psihat","ci.lower","ci.upper")) test<-matrix(NA,CC,6) dimnames(test)<-list(NULL,c("Group","Group","test","p-value","p.crit","se")) if(bop)se.val<-bootdse(x,nboot=nboot,pr=pr) temp1<-0 jcom<-0 for (j in 1:J){ for (k in 1:J){ if (j < k){ jcom<-jcom+1 if(!bop)temp<-qdtest(x[,j],x[,k],q=q,bop=bop) if(bop)temp<-qdtest(x[,j],x[,k],se.val=se.val[jcom]) sejk<-temp$se test[jcom,6]<-sejk test[jcom,3]<-temp$test.stat test[jcom,4]<-temp$p.value if(length(x[,j])<20)test[jcom,4]<-mrm1way(x[,c(j,k)],q=q,SEED=SEED)$p.value psihat[jcom,1]<-j psihat[jcom,2]<-k test[jcom,1]<-j test[jcom,2]<-k psihat[jcom,3]<-(xbar[j]-xbar[k]) }}} temp1<-test[,4] temp2<-order(0-temp1) zvec<-dvec[1:ncon] test[temp2,5]<-zvec psihat[,4]<-psihat[,3]-qt(1-test[,5]/2,df)*test[,6] psihat[,5]<-psihat[,3]+qt(1-test[,5]/2,df)*test[,6] num.sig<-sum(test[,4]<=test[,5]) list(test=test,psihat=psihat,num.sig=num.sig) } bwmedbmcp<-function(J,K,x,tr=.2,JK=J*K,grp=c(1:JK),con=0,alpha=.05,dif=F,pool=F,bop=F,nboot=100,SEED=T){ # # All pairwise comparisons among levels of Factor B # in a split-plot design using trimmed means. # # Data are pooled for each level # of Factor B. # Then this function calls rmmcp. # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. # If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. # # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } JK<-J*K data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. 
} x<-data if(pool){ data<-list() m1<-matrix(c(1:JK),J,K,byrow=T) for(k in 1:K){ for(j in 1:J){ flag<-m1[j,k] if(j==1)temp<-x[[flag]] if(j>1){ temp<-c(temp,x[[flag]]) }} data[[k]]<-temp } print("Group numbers refer to levels of Factor B") if(!dif)temp<-lincdm(data,con=con,alpha=alpha,nboot=nboot,mop=bop) if(dif)temp<-qdmcpdif(data,con=con,alpha=alpha) return(temp) } if(!pool){ mat<-matrix(c(1:JK),ncol=K,byrow=T) for(j in 1:J){ data<-list() ic<-0 for(k in 1:K){ ic<-ic+1 data[[ic]]<-x[[mat[j,k]]] } print(paste("For level ", j, " of Factor A:")) if(!dif)temp<-lincdm(data,con=con,alpha=alpha,nboot=nboot,mop=bop) if(dif)temp<-qdmcpdif(data,con=con,alpha=alpha) print(temp$test) print(temp$psihat) }} } qdmcpdif<-function(x, con = 0,alpha = 0.05){ # # MCP with medians on difference scores # FWE controlled with Rom's method # if(!is.matrix(x))x<-matl(x) if(!is.matrix(x))stop("Data must be stored in a matrix or in list mode.") con<-as.matrix(con) J<-ncol(x) xbar<-vector("numeric",J) x<-elimna(x) # Remove missing values nval<-nrow(x) h1<-nrow(x) df<-h1-1 if(sum(con^2!=0))CC<-ncol(con) if(sum(con^2)==0)CC<-(J^2-J)/2 ncon<-CC if(alpha==.05){ dvec<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01)dvec<-alpha/c(1:ncon) if(sum(con^2)==0){ psihat<-matrix(0,CC,5) dimnames(psihat)<-list(NULL,c("Group","Group","psihat","ci.lower","ci.upper")) test<-matrix(NA,CC,5) dimnames(test)<-list(NULL,c("Group","Group","p-value","p.crit","se")) temp1<-0 jcom<-0 for (j in 1:J){ for (k in 1:J){ if (j < k){ jcom<-jcom+1 dv<-x[,j]-x[,k] test[jcom,5]<-msmedse(dv) temp<-sintv2(dv,alpha=alpha/CC) temp1[jcom]<-temp$p.value test[jcom,3]<-temp$p.value psihat[jcom,1]<-j psihat[jcom,2]<-k test[jcom,1]<-j test[jcom,2]<-k psihat[jcom,3]<-median(dv) psihat[jcom,4]<-temp$ci.low psihat[jcom,5]<-temp$ci.up }}} temp2<-order(0-temp1) zvec<-dvec[1:ncon] sigvec<-(test[temp2]>=zvec) if(sum(sigvec)0){ if(nrow(con)!=ncol(x))print("WARNING: The number of groups does not match the number of contrast coefficients.") ncon<-ncol(con) psihat<-matrix(0,ncol(con),4) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper")) test<-matrix(0,ncol(con),4) dimnames(test)<-list(NULL,c("con.num","sig","crit.sig","se")) temp1<-NA for (d in 1:ncol(con)){ psihat[d,1]<-d for(j in 1:J){ if(j==1)dval<-con[j,d]*x[,j] if(j>1)dval<-dval+con[j,d]*x[,j] } temp3<-sintv2(dval) temp1[d]<-temp3$p.value test[d,1]<-d test[d,4]<-msmedse(dval) psihat[d,2]<-median(dval) psihat[d,3]<-temp3$ci.low psihat[d,4]<-temp3$ci.up } test[,2]<-temp1 temp2<-order(0-temp1) zvec<-dvec[1:ncon] print(c(ncon,zvec)) sigvec<-(test[temp2,2]>=zvec) if(sum(sigvec)0)+ sum(psihat[,5]<0) if(sum(con^2)>0)num.sig<-sum(psihat[,3]>0)+ sum(psihat[,4]<0) list(test=test,psihat=psihat,con=con,num.sig=num.sig) } loc2dif<-function(x,y,est=median,na.rm=T,...){ # # Compute a measure of location associated with the # distribution of x-y, the measure location given by # the argument # est. # Advantage: high efficiency even under normality versus # using sample means. # val<-est(as.vector(outer(x,y,FUN="-")),na.rm=na.rm,...) 
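# Note: outer(x, y, FUN = "-") forms all n*m pairwise differences x[i] - y[j];
# applying est (the median by default) to these differences yields a
# Hodges-Lehmann-type estimate of the location of the distribution of X - Y.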
val } l2dci<-function(x,y,est=median,alpha=.05,nboot=500,SEED=T,pr=T,...){ # # Compute a bootstrap confidence interval for a # measure of location associated with # the distribution of x-y, # est indicates which measure of location will be used # # Function returns confidence interval, p-value and estimate # of square standard error of the estimator used. # x<-x[!is.na(x)] # Remove any missing values in x y<-y[!is.na(y)] # Remove any missing values in y if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. if(pr)print("Taking bootstrap samples. Please wait.") datax<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) datay<-matrix(sample(y,size=length(y)*nboot,replace=T),nrow=nboot) bvec<-NA for(i in 1:nboot)bvec[i]<-loc2dif(datax[i,],datay[i,],est=est) bvec<-sort(bvec) low<-round((alpha/2)*nboot)+1 up<-nboot-low temp<-sum(bvec<0)/nboot+sum(bvec==0)/(2*nboot) sig.level<-2*(min(temp,1-temp)) se<-var(bvec) list(ci=c(bvec[low],bvec[up]),p.value=sig.level,sq.se=se) } qdec2ci<-function(x,y=NA,nboot=500,alpha=.05,pr=T,SEED=T,plotit=T){ # # Compare the deciles of two dependent groups # with quantiles estimated with a single order statistic # if(SEED)set.seed(2) if(is.na(y[1])){ y<-x[,2] x<-x[,1] } bvec<-matrix(NA,nrow=nboot,ncol=9) if(pr)print("Taking bootstrap samples. Please Wait.") data<-matrix(sample(n,size=n*nboot,replace=T),nrow=nboot) for(i in 1:nboot)bvec[i,]<-qdec(x[data[i,]])-qdec(y[data[i,]]) pval<-NA m<-matrix(0,9,4) dimnames(m)<-list(NULL,c("lower","upper","Delta.hat","p.values")) crit <- alpha/2 icl <- round(crit * nboot) + 1 icu <- nboot - icl for(i in 1:9){ pval[i]<-(sum(bvec[,i]<0)+.5*sum(bvec[,i]==0))/nboot pval[i]<-2*min(pval[i],1-pval[i]) temp<-sort(bvec[,i]) m[i,1]<-temp[icl] m[i,2]<-temp[icu] } m[,3]<-qdec(x)-qdec(y) m[,4]<-pval if(plotit){ xaxis<-c(qdec(x),qdec(x)) par(pch="+") yaxis<-c(m[,1],m[,2]) plot(xaxis,yaxis,ylab="delta",xlab="x (first group)") par(pch="*") points(qdec(x),m[,3]) } m } ancovam<-function(x1,y1,x2,y2,fr1=1,fr2=1,alpha=.05,plotit=T,pts=NA,sm=F, pr=T){ # # Compare two independent groups using an ancova method # No parametric assumption is made about the form of # the regression lines--a running interval smoother is used. 
# # This function is designed specifically for # MEDIANS # # Assume data are in x1 y1 x2 and y2 # if(pr){ print("NOTE: Confidence intervals are adjusted to control the probability") print("of at least one Type I error.") print("But p-values are not") } if(is.na(pts[1])){ npt<-5 isub<-c(1:5) # Initialize isub test<-c(1:5) xorder<-order(x1) y1<-y1[xorder] x1<-x1[xorder] xorder<-order(x2) y2<-y2[xorder] x2<-x2[xorder] n1<-1 n2<-1 vecn<-1 for(i in 1:length(x1))n1[i]<-length(y1[near(x1,x1[i],fr1)]) for(i in 1:length(x1))n2[i]<-length(y2[near(x2,x1[i],fr2)]) for(i in 1:length(x1))vecn[i]<-min(n1[i],n2[i]) sub<-c(1:length(x1)) isub[1]<-min(sub[vecn>=12]) isub[5]<-max(sub[vecn>=12]) isub[3]<-floor((isub[1]+isub[5])/2) isub[2]<-floor((isub[1]+isub[3])/2) isub[4]<-floor((isub[3]+isub[5])/2) mat<-matrix(NA,5,9) dimnames(mat)<-list(NULL,c("X","n1","n2","DIF","TEST","se","ci.low","ci.hi","p.value")) critv<-NA if(alpha==.05)critv<-smmcrit(500,5) if(alpha==.01)critv<-smmcrit01(500,5) if(is.na(critv))critv<-smmval(rep(999,5),alpha=alpha) for (i in 1:5){ g1<-y1[near(x1,x1[isub[i]],fr1)] g2<-y2[near(x2,x1[isub[i]],fr2)] g1<-g1[!is.na(g1)] g2<-g2[!is.na(g2)] test<-msmed(g1,g2) mat[i,1]<-x1[isub[i]] mat[i,2]<-length(g1) mat[i,3]<-length(g2) #mat[i,4]<-test$dif mat[i,4]<-median(g1)-median(g2) mat[i,5]<-test$test[3] mat[i,6]<-test$test[5] cilow<-mat[i,4]-critv*mat[i,6] cihi<-mat[i,4]+critv*mat[i,6] mat[i,7]<-cilow mat[i,8]<-cihi mat[i,9]<-test$test[6] }} if(!is.na(pts[1])){ if(length(pts)>=29)stop("At most 28 points can be compared") n1<-1 n2<-1 vecn<-1 for(i in 1:length(pts)){ n1[i]<-length(y1[near(x1,pts[i],fr1)]) n2[i]<-length(y2[near(x2,pts[i],fr2)]) } mat<-matrix(NA,length(pts),9) dimnames(mat)<-list(NULL,c("X","n1","n2","DIF","TEST","se","ci.low","ci.hi", "p.value")) critv<-NA if(length(pts)>=2){ if(alpha==.05)critv<-smmcrit(500,length(pts)) if(alpha==.01)critv<-smmcrit01(500,length(pts)) if(is.na(critv))critv<-smmval(rep(999,length(pts)),alpha=alpha) } if(length(pts)==1)critv<-qnorm(1-alpha/2) for (i in 1:length(pts)){ g1<-y1[near(x1,pts[i],fr1)] g2<-y2[near(x2,pts[i],fr2)] g1<-g1[!is.na(g1)] g2<-g2[!is.na(g2)] test<-msmed(g1,g2) mat[i,1]<-pts[i] mat[i,2]<-length(g1) mat[i,3]<-length(g2) if(length(g1)<=5)print(paste("Warning, there are",length(g1)," points corresponding to the design point X=",pts[i])) if(length(g2)<=5)print(paste("Warning, there are",length(g2)," points corresponding to the design point X=",pts[i])) #mat[i,4]<-test$dif mat[i,4]<-median(g1)-median(g2) mat[i,5]<-test$test[3] mat[i,6]<-test$test[5] cilow<-mat[i,4]-critv*mat[i,6] cihi<-mat[i,4]+critv*mat[i,6] mat[i,7]<-cilow mat[i,8]<-cihi mat[i,9]<-test$test[6] }} if(plotit) runmean2g(x1,y1,x2,y2,fr=fr1,est=median,sm=sm) list(output=mat,crit=critv) } modgen<-function(p,adz=F){ # # Used by regpre to generate all models # p=number of predictors # adz=T, will add the model where only a measure # of location is used. 
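# For example, modgen(2) is expected to return a list whose entries are 1, 2
# and c(1,2), one entry per nonempty subset of the two predictors; with
# adz=T a final entry equal to 0 (the location-only model) is appended.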
# # model<-list() if(p>5)stop("Current version is limited to 5 predictors") if(p==1)model[[1]]<-1 if(p==2){ model[[1]]<-1 model[[2]]<-2 model[[3]]<-c(1,2) } if(p==3){ for(i in 1:3)model[[i]]<-i model[[4]]<-c(1,2) model[[5]]<-c(1,3) model[[6]]<-c(2,3) model[[7]]<-c(1,2,3) } if(p==4){ for(i in 1:4)model[[i]]<-i model[[5]]<-c(1,2) model[[6]]<-c(1,3) model[[7]]<-c(1,4) model[[8]]<-c(2,3) model[[9]]<-c(2,4) model[[10]]<-c(3,4) model[[11]]<-c(1,2,3) model[[12]]<-c(1,2,4) model[[13]]<-c(1,3,4) model[[14]]<-c(2,3,4) model[[15]]<-c(1,2,3,4) } if(p==5){ for(i in 1:5)model[[i]]<-i model[[6]]<-c(1,2) model[[7]]<-c(1,3) model[[8]]<-c(1,4) model[[9]]<-c(1,5) model[[10]]<-c(2,3) model[[11]]<-c(2,4) model[[12]]<-c(2,5) model[[13]]<-c(3,4) model[[14]]<-c(3,5) model[[15]]<-c(4,5) model[[16]]<-c(1,2,3) model[[17]]<-c(1,2,4) model[[18]]<-c(1,2,5) model[[19]]<-c(1,3,4) model[[20]]<-c(1,3,5) model[[21]]<-c(1,4,5) model[[22]]<-c(2,3,4) model[[23]]<-c(2,3,5) model[[24]]<-c(2,4,5) model[[25]]<-c(3,4,5) model[[26]]<-c(1,2,3,4) model[[27]]<-c(1,2,3,5) model[[28]]<-c(1,2,4,5) model[[29]]<-c(1,3,4,5) model[[30]]<-c(2,3,4,5) model[[31]]<-c(1,2,3,4,5) } if(adz){ ic<-length(model)+1 model[[ic]]<-0 } model } locpre<-function(y,est=mean,error=sqfun,nboot=100,SEED=T,pr=T,mval=round(5*log(length(y)))){ # # Estimate the prediction error using a measure of location # given by the argument # est # # The .632 method is used. # (See Efron and Tibshirani, 1993, pp. 252--254) # # Prediction error is the expected value of the function error. # The argument error defaults to squared error. # # est can be any s-plus function that returns a measure of location # # The default value for mval, the number of observations to resample # for each of the B bootstrap samples is based on results by # Shao (JASA, 1996, 655-665). (Resampling n vectors of observations # model selection may not lead to the correct model as n->infinity. # if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. if(pr)print("Taking bootstrap samples. Please wait.") data<-matrix(sample(length(y),size=mval*nboot,replace=T),nrow=nboot) bid<-apply(data,1,idb,length(y)) # bid is an n by nboot matrix. If the jth bootstrap sample from # 1, ..., mval contains the value i, bid[i,j]=0; otherwise bid[i,j]=1 # yhat<-apply(data,1,locpres1,y,est=est) # yhat is nboot vector # containing the bootstrap estimates # yhat<-matrix(yhat,nrow=length(y),ncol=nboot) # convert to n x nboot matrix bi<-apply(bid,1,sum) # B sub i in notation of Efron and Tibshirani, p. 
253 temp<-(bid*(yhat-y)) diff<-apply(temp,1,error) ep0<-sum(diff/bi)/length(y) aperror<-error(y-est(y))/length(y) # apparent error val<-.368*aperror+.632*ep0 val } locpres1<-function(isub,x,est){ # # Compute a measure of location x[isub] # isub is a vector of length mval, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # # mval is the sample size # of the bootstrap sample, where mval1){ if(alpha==.05)crit<-smmcrit(500,CC) if(alpha==.01)crit<-smmcrit01(500,CC) if(is.na(crit))warning("Can only be used with alpha=.05 or .01") } test[jcom,4]<-crit psihat[jcom,4]<-psihat[jcom,3]-crit*test[jcom,5] psihat[jcom,5]<-psihat[jcom,3]+crit*test[jcom,5] }}}} if(sum(con^2)>0){ if(nrow(con)!=length(x))warning("The number of groups does not match the number of contrast coefficients.") psihat<-matrix(0,ncol(con),4) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper")) test<-matrix(0,ncol(con),5) dimnames(test)<-list(NULL,c("con.num","test","crit","se","df")) for (d in 1:ncol(con)){ psihat[d,1]<-d psihat[d,2]<-sum(con[,d]*xbar) sejk<-sqrt(sum(con[,d]^2*w)) test[d,1]<-d test[d,2]<-sum(con[,d]*xbar)/sejk crit<-NA if(CC==1)crit<-qnorm(1-alpha/2) if(alpha==.05)crit<-smmcrit(500,ncol(con)) if(alpha==.01)crit<-smmcrit01(500,ncol(con)) test[d,3]<-crit test[d,4]<-sejk psihat[d,3]<-psihat[d,2]-crit*sejk psihat[d,4]<-psihat[d,2]+crit*sejk }} list(test=test,psihat=psihat) } bpmedse<-function(x){ # # compute standard error of the median using method # recommended by Price and Bonett (2001) # y<-sort(x) n<-length(x) av<-round((n+1)/2-sqrt(n)) if(av==0)av<-1 avm<-av-1 astar<-pbinom(avm,n,.5) #alpha*/2 zval<-qnorm(1-astar) top<-n-av+1 sqse<-((y[top]-y[av])/(2*zval))^2 # The sq. standard error se<-sqrt(sqse) se } exmed<-function(x,y=NA,con=0,alpha=.05,iter=1000,se.fun=bpmedse,SEED=T){ # # Test a set of linear contrasts using medians # # Get exact control over type I errors under normality, provided # iter is sufficietly large. # iter determines number of replications used in a simulation # to determine critical value. # # se.fun indicates method used to estimate standard errors. # default is the method used by Bonett and Price (2002) # To use the McKean-Shrader method, # set se.fun=msmedse # # The data are assumed to be stored in $x$ in a matrix or in list mode. # Length(x) is assumed to correspond to the total number of groups, J # It is assumed all groups are independent. # # con is a J by d matrix containing the contrast coefficients that are used. # If con is not specified, all pairwise comparisons are made. # # Missing values are automatically removed. # # Function returns the critical value used so that FWE=alpha # (under the column crit) # p-values are determined for each test but are not adjusted so # that FWE=alpha. # The confidence intervals are adjusted so that the simultaneous # probability coverage is 1-alpha. 
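# A hedged usage sketch (hypothetical data; bpmedse() and msmedsub() from
# this collection must be sourced first):
#   g1<-rnorm(30); g2<-rnorm(30); g3<-rnorm(30)+.5
#   exmed(list(g1,g2,g3))                 # all pairwise comparisons of medians
#   exmed(list(g1,g2,g3),se.fun=msmedse)  # alternative standard error estimator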
# if(!is.na(y[1])){ xx<-list() xx[[1]]<-x xx[[2]]<-y if(is.matrix(x) || is.list(x))stop("When y is speficied, x should not have list mode or be a matrix") x<-xx } if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in a matrix or in list mode.") con<-as.matrix(con) J<-length(x) h<-vector("numeric",J) w<-vector("numeric",J) nval<-vector("numeric",J) xbar<-vector("numeric",J) for(j in 1:J){ xx<-!is.na(x[[j]]) val<-x[[j]] x[[j]]<-val[xx] # Remove missing values xbar[j]<-median(x[[j]]) nval[j]<-length(x[[j]]) # w[j]<-msmedse(x[[j]])^2 w[j]<-se.fun(x[[j]])^2 } if(sum(con^2!=0))CC<-ncol(con) if(sum(con^2)==0){ CC<-(J^2-J)/2 psihat<-matrix(0,CC,5) dimnames(psihat)<-list(NULL,c("Group","Group","psihat","ci.lower","ci.upper")) test<-matrix(NA,CC,6) dimnames(test)<-list(NULL,c("Group","Group","test","crit","se","p.value")) jcom<-0 for (j in 1:J){ for (k in 1:J){ if (j < k){ jcom<-jcom+1 test[jcom,3]<-abs(xbar[j]-xbar[k])/sqrt(w[j]+w[k]) # Next determine p-value for each individual test temp<-msmedsub(c(nval[j],nval[k]),se.fun=se.fun,SEED=SEED,iter=iter) test[jcom,6]<-sum((test[jcom,3]<=temp))/iter sejk<-sqrt(w[j]+w[k]) test[jcom,5]<-sejk psihat[jcom,1]<-j psihat[jcom,2]<-k test[jcom,1]<-j test[jcom,2]<-k psihat[jcom,3]<-(xbar[j]-xbar[k]) # Determine critical value for controlling FWE temp<-msmedsub(nval,se.fun=se.fun,SEED=SEED,iter=iter) ic<-round((1-alpha)*iter) crit<-temp[ic] test[jcom,4]<-crit psihat[jcom,4]<-psihat[jcom,3]-crit*test[jcom,5] psihat[jcom,5]<-psihat[jcom,3]+crit*test[jcom,5] }}}} if(sum(con^2)>0){ if(nrow(con)!=length(x))warning("The number of groups does not match the number of contrast coefficients.") psihat<-matrix(0,ncol(con),4) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper")) test<-matrix(0,ncol(con),5) dimnames(test)<-list(NULL,c("con.num","test","crit","se","p.value")) # Determine critical value that controls FWE temp<-msmedsub(nval,con=con,se.fun=se.fun,SEED=SEED,iter=iter) ic<-round((1-alpha)*iter) crit<-temp[ic] for (d in 1:ncol(con)){ flag<-(con[,d]==0) nvec<-nval[!flag] psihat[d,1]<-d psihat[d,2]<-sum(con[,d]*xbar) sejk<-sqrt(sum(con[,d]^2*w)) test[d,1]<-d test[d,2]<-sum(con[,d]*xbar)/sejk # Determine p-value for individual (dth) test temp<-msmedsub(nvec,iter=iter,se.fun=se.fun,SEED=SEED) test[d,3]<-crit test[d,4]<-sejk test[d,5]<-sum(abs((test[d,2])<=temp))/iter psihat[d,3]<-psihat[d,2]-crit*sejk psihat[d,4]<-psihat[d,2]+crit*sejk }} list(test=test,psihat=psihat) } msmedsub<-function(n,con=0,alpha=.05,se.fun=bpmedse,iter=1000,SEED=T){ # # Determine a Studentized critical value, assuming normality # and homoscedasticity, for the function msmedv2 # # Goal: Test a set of linear contrasts using medians # # The data are assumed to be stored in $x$ in a matrix or in list mode. # Length(x) is assumed to correspond to the total number of groups, J # It is assumed all groups are independent. # # con is a J by d matrix containing the contrast coefficients that are used. # If con is not specified, all pairwise comparisons are made. 
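# The function returns the iter simulated values of the maximum absolute
# Studentized test statistic in sorted order; exmed() then uses the value in
# position round((1-alpha)*iter) as its critical value, e.g. the 950th of
# 1000 sorted values when alpha=.05.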
# if(SEED)set.seed(2) con<-as.matrix(con) J<-length(n) h<-vector("numeric",J) w<-vector("numeric",J) xbar<-vector("numeric",J) x<-list() test<-NA testmax<-NA for (it in 1:iter){ for(j in 1:J){ x[[j]]<-rnorm(n[j]) xbar[j]<-median(x[[j]]) w[j]<-se.fun(x[[j]])^2 } if(sum(con^2!=0))CC<-ncol(con) if(sum(con^2)==0){ CC<-(J^2-J)/2 jcom<-0 for (j in 1:J){ for (k in 1:J){ if (j < k){ jcom<-jcom+1 test[jcom]<-abs(xbar[j]-xbar[k])/sqrt(w[j]+w[k]) }}}} if(sum(con^2)>0){ for (d in 1:ncol(con)){ sejk<-sqrt(sum(con[,d]^2*w)) test[d]<-sum(con[,d]*xbar)/sejk }} testmax[it]<-max(abs(test)) } testmax<-sort(testmax) testmax } cnorm<-function(n,epsilon=.1,k=10){ # # generate n observations from a contaminated normal # distribution # probability 1-epsilon from a standard normal # probability epsilon from normal with mean 0 and standard deviation k # if(epsilon>1)stop("epsilon must be less than or equal to 1") if(epsilon<0)stop("epsilon must be greater than or equal to 0") if(k<=0)stop("k must be greater than 0") val<-rnorm(n) uval<-runif(n) flag<-(uval<=1-epsilon) val[!flag]<-k*val[!flag] val } twwmcp<-function(J,K,x,grp=c(1:p),p=J*K,tr=.2,alpha=.05,dif=F){ # # For a J by K anova using quantiles with # repeated measures on both factors, # Perform all multiple comparisons for main effects # and interactions. # # tr=.2. default trimming # bop=F means bootstrap option not used; # with bop=T, function uses usual medians rather # rather than a single order statistic to estimate median # in conjunction with bootstrap estimate of covariances # among the sample medians. # # The s-plus variable data is assumed to contain the raw # data stored in a matrix or in list mode. # When in list mode data[[1]] contains the data # for the first level of both factors: level 1,1. # data[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # data[[K]] is the data for level 1,K # data[[K+1]] is the data for level 2,1, data[2K] is level 2,K, etc. # # It is assumed that data has length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # Qa<-NA Qab<-NA if(is.list(x))x<-elimna(matl(x)) if(is.matrix(x))x<-elimna(x) data<-x if(is.matrix(data))data<-listm(data) if(!is.list(data))stop("Data are not stored in list mode or a matrix") if(p!=length(data)){ print("The total number of groups, based on the specified levels, is") print(p) print("The number of groups stored in x is") print(length(data)) print("Warning: These two values are not equal") } if(p!=length(grp))stop("Apparently a subset of the groups was specified that does not match the total number of groups indicated by the values for J and K.") tmeans<-0 temp<-con2way(J,K) # contrasts matrices stored in temp Qa<-rmmcp(x,con=temp$conA,alpha=alpha,dif=dif,tr=tr) # Do test for factor B Qb<-rmmcp(x,con=temp$conB,alpha=alpha,dif=dif,tr=tr) # Do test for factor A by B interaction Qab<-rmmcp(x,con=temp$conAB,alpha=alpha,dif=dif,tr=tr) list(Qa=Qa,Qb=Qb,Qab=Qab) } bprm<-function(x,grp=NA){ # # Perform Brunner-Puri within groups rank-based ANOVA # # x can be a matrix with columns corresponding to groups # or it can have list mode. # if(is.list(x))x<-matl(x) x<-elimna(x) if(is.na(grp[1]))grp <- c(1:ncol(x)) if(!is.matrix(x))stop("Data are not stored in a matrix or in list mode.") K<-length(grp) # The number of groups. 
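# The test below pools and ranks all of the data, averages the ranks within
# each column, and measures how far these averages are from their null
# expectation (N+1)/2; the statistic is referred to an F distribution with
# nu1 estimated degrees of freedom and a very large (10^6) denominator df.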
Jb<-matrix(1,K,K) Ib<-diag(1,K) Pb<-Ib-Jb/K y<-matrix(rank(x),ncol=ncol(x)) #ranks of pooled data ybar<-apply(y,2,mean) # average of ranks N<-ncol(x)*nrow(x) vhat<-var(y)/N^2 test<-nrow(x)*sum((ybar-(N+1)/2)^2)/N^2 trval<-sum(diag(Pb%*%vhat)) test<-test/trval # See Brunner, Domhos and Langer, p. 98, eq. 7.12 nu1<-trval^2/sum(diag(Pb%*%vhat%*%Pb%*%vhat)) sig.level<-1-pf(test,nu1,1000000) list(test.stat=test,nu1=nu1,p.value=sig.level) } medpb<-function(x,alpha=.05,nboot=NA,grp=NA,est=median,con=0,bhop=F, SEED=T,...){ # # Multiple comparisons for J independent groups using medians. # # A percentile bootstrap method with Rom's method is used. # # The data are assumed to be stored in x # which either has list mode or is a matrix. In the first case # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J. # If stored in a matrix, the columns of the matrix correspond # to groups. # # est is the measure of location and defaults to the median # ... can be used to set optional arguments associated with est # # The argument grp can be used to analyze a subset of the groups # Example: grp=c(1,3,5) would compare groups 1, 3 and 5. # # # con can be used to specify linear contrasts; see the function lincon # # Missing values are allowed. # con<-as.matrix(con) if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") if(!is.na(sum(grp))){ # Only analyze specified groups. xx<-list() for(i in 1:length(grp))xx[[i]]<-x[[grp[i]]] x<-xx } J<-length(x) tempn<-0 mvec<-NA for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. tempn[j]<-length(temp) x[[j]]<-temp mvec[j]<-est(temp,...) } Jm<-J-1 # # Determine contrast matrix # if(sum(con^2)==0){ ncon<-(J^2-J)/2 con<-matrix(0,J,ncon) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} ncon<-ncol(con) if(nrow(con)!=J)stop("Something is wrong with con; the number of rows does not match the number of groups.") # Determine nboot if a value was not specified if(is.na(nboot)){ nboot<-5000 if(J <= 8)nboot<-4000 if(J <= 3)nboot<-2000 } # Determine critical values if(!bhop){ if(alpha==.05){ dvec<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01){ dvec<-alpha/c(1:ncon) } } if(bhop)dvec<-(ncon-c(1:ncon)+1)*alpha/ncon bvec<-matrix(NA,nrow=J,ncol=nboot) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. #print("Taking bootstrap samples. Please wait.") for(j in 1:J){ #print(paste("Working on group ",j)) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,est,...) 
# Bootstrapped values for jth group } test<-NA bcon<-t(con)%*%bvec #ncon by nboot matrix tvec<-t(con)%*%mvec for (d in 1:ncon){ tv<-sum(bcon[d,]==0)/nboot test[d]<-sum(bcon[d,]>0)/nboot+.5*tv if(test[d]> .5)test[d]<-1-test[d] } test<-2*test output<-matrix(0,ncon,6) dimnames(output)<-list(NULL,c("con.num","psihat","p.value","p.crit","ci.lower","ci.upper")) temp2<-order(0-test) zvec<-dvec[1:ncon] sigvec<-(test[temp2]>=zvec) output[temp2,4]<-zvec icl<-round(dvec[ncon]*nboot/2)+1 icu<-nboot-icl-1 for (ic in 1:ncol(con)){ output[ic,2]<-tvec[ic,] output[ic,1]<-ic output[ic,3]<-test[ic] temp<-sort(bcon[ic,]) output[ic,5]<-temp[icl] output[ic,6]<-temp[icu] } num.sig<-sum(output[,3]<=output[,4]) list(output=output,con=con,num.sig=num.sig) } rbbinom<-function(n,nbin,r,s){ # # Generate n values from a beta-binomial, # r and s are the parameters of the beta distribution. # nbin is for the binomial distribution # x<-NA for(i in 1:n){ pval<-rbeta(1,r,s) x[i]<-rbinom(1,nbin,pval) } x } med2g<-function(x,y,alpha=.05,nboot=1000,SEED=T,...){ # # Compare medians of two independent groups using percentile bootstrap # # Missing values are allowed. # x<-elimna(x) y<-elimna(y) mvec<-NA mvec[1]<-median(x) mvec[2]<-median(y) bvec<-NA if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. datax<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) datay<-matrix(sample(y,size=length(x)*nboot,replace=T),nrow=nboot) bvec1<-apply(datax,1,median) # Bootstrapped values for jth group bvec2<-apply(datay,1,median) # Bootstrapped values for jth group test<-sum((bvec1>bvec2))/nboot tv<-sum(bvec1==bvec2)/nboot test<-test+.5*tv if(test> .5)test<-1-test test<-2*test dvec<-sort(bvec1-bvec2) icl<-round(alpha*nboot/2)+1 icu<-nboot-icl-1 cilow<-dvec[icl] ciup<-dvec[icu] list(p.value=test,est.dif=mvec[1]-mvec[2],ci.low=cilow,ci.up=ciup) } l2drmci<-function(x,y=NA,est=median,alpha=.05,nboot=500,SEED=T,pr=T,...){ # # Compute a bootstrap confidence interval for a # measure of location associated with # the distribution of x-y, # est indicates which measure of location will be used # x and y are possibly dependent # if(is.na(y[1])){ if(!is.matrix(x))stop("With y missing, x should be a matrix") } if(!is.na(y[1]))x<-cbind(x,y) if(ncol(x)!=2)stop("Should have bivariate data") if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. if(pr)print("Taking bootstrap samples. Please wait.") data<-matrix(sample(nrow(x),size=nrow(x)*nboot,replace=T),nrow=nboot) bvec<-NA for(i in 1:nboot)bvec[i]<-loc2dif(x[data[i,],1],x[data[i,],2],est=est,...) 
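# Rows of the bivariate data matrix are resampled, so the dependence between
# the two measures is preserved within every bootstrap replication; loc2dif()
# is then applied to each resampled set of pairs.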
bvec<-sort(bvec) low<-round((alpha/2)*nboot)+1 up<-nboot-low temp<-sum(bvec<0)/nboot+sum(bvec==0)/(2*nboot) sig.level<-2*(min(temp,1-temp)) list(ci=c(bvec[low],bvec[up]),p.value=sig.level) } twobinom<-function(r1=sum(x),n1=length(x),r2=sum(y),n2=length(y),x=NA,y=NA,alpha=.05){ # # Test the hypothesis that two independent binomials have equal # probability of success # # r1=number of successes in group 1 # n1=number of observations in group 1 # n1p<-n1+1 n2p<-n2+1 n1m<-n1-1 n2m<-n2-1 chk<-abs(r1/n1-r2/n2) x<-c(0:n1)/n1 y<-c(0:n2)/n2 phat<-(r1+r2)/(n1+n2) m1<-outer(x,y,"-") m2<-matrix(1,n1p,n2p) flag<-(abs(m1)>=chk) m3<-m2*flag b1<-1 b2<-1 xv<-c(1:n1) yv<-c(1:n2) xv1<-n1-xv+1 yv1<-n2-yv+1 dis1<-c(1,pbeta(phat,xv,xv1)) dis2<-c(1,pbeta(phat,yv,yv1)) pd1<-NA pd2<-NA for(i in 1:n1)pd1[i]<-dis1[i]-dis1[i+1] for(i in 1:n2)pd2[i]<-dis2[i]-dis2[i+1] pd1[n1p]<-phat^n1 pd2[n2p]<-phat^n2 m4<-outer(pd1,pd2,"*") test<-sum(m3*m4) list(p.value=test) } dmedpb<-function(x,y=NA,alpha=.05,con=0,est=median,plotit=T,dif=F,grp=NA, hoch=F,nboot=NA,BA=T,xlab="Group 1",ylab="Group 2",pr=T,SEED=T,...){ # # Use a percentile bootstrap method to compare # marginal medians of dependent groups. # # This is essentially the function rmmcppb, but set to compare medians # by default. # And it is adjusted to handle tied values. # # By default, # compute a .95 confidence interval for all linear contasts # specified by con, a J by C matrix, where C is the number of # contrasts to be tested, and the columns of con are the # contrast coefficients. # If con is not specified, all pairwise comparisons are done. # # A sequentially rejective method # is used to control the probability of at least one Type I error. # # dif=T indicates that difference scores are to be used # dif=F indicates that measure of location associated with # marginal distributions are used instead. # # nboot is the bootstrap sample size. If not specified, a value will # be chosen depending on the number of contrasts there are. # # x can be an n by J matrix or it can have list mode # for two groups, data for second group can be put in y # otherwise, assume x is a matrix (n by J) or has list mode. # # Argument BA: When using dif=F, BA=T uses a correction term # that is recommended with medians. # if(dif){ if(pr)print("dif=T, so analysis is done on difference scores") temp<-rmmcppbd(x,y=y,alpha=.05,con=con,est,plotit=plotit,grp=grp, nboot=nboot,...) output<-temp$output con<-temp$con } if(!dif){ if(pr)print("dif=F, so analysis is done on marginal distributions") if(!is.na(y[1]))x<-cbind(x,y) if(!is.list(x) && !is.matrix(x)) stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ if(is.matrix(con)){ if(length(x)!=nrow(con)) stop("The number of rows in con is not equal to the number of groups.") }} if(is.list(x)){ # put the data in an n by J matrix mat<-matl(x) } if(is.matrix(x) && is.matrix(con)){ if(ncol(x)!=nrow(con)) stop("The number of rows in con is not equal to the number of groups.") mat<-x } if(is.matrix(x))mat<-x if(!is.na(sum(grp)))mat<-mat[,grp] mat<-elimna(mat) # Remove rows with missing values. x<-mat J<-ncol(mat) xcen<-x for(j in 1:J)xcen[,j]<-x[,j]-est(x[,j]) Jm<-J-1 if(sum(con^2)==0){ d<-(J^2-J)/2 con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} d<-ncol(con) if(is.na(nboot)){ if(d<=4)nboot<-1000 if(d>4)nboot<-5000 } n<-nrow(mat) crit.vec<-alpha/c(1:d) connum<-ncol(con) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. 
xbars<-apply(mat,2,est) psidat<-NA for (ic in 1:connum)psidat[ic]<-sum(con[,ic]*xbars) psihat<-matrix(0,connum,nboot) psihatcen<-matrix(0,connum,nboot) bvec<-matrix(NA,ncol=J,nrow=nboot) bveccen<-matrix(NA,ncol=J,nrow=nboot) print("Taking bootstrap samples. Please wait.") data<-matrix(sample(n,size=n*nboot,replace=T),nrow=nboot) for(ib in 1:nboot){ bvec[ib,]<-apply(x[data[ib,],],2,est,...) bveccen[ib,]<-apply(xcen[data[ib,],],2,est,...) } # # Now have an nboot by J matrix of bootstrap values. # test<-1 bias<-NA tval<-NA tvalcen<-NA for (ic in 1:connum){ psihat[ic,]<-apply(bvec,1,bptdpsi,con[,ic]) psihatcen[ic,]<-apply(bveccen,1,bptdpsi,con[,ic]) tvalcen[ic]<-sum((psihatcen[ic,]==0))/nboot bias[ic]<-sum((psihatcen[ic,]>0))/nboot+sum((psihatcen[ic,]==0))/nboot-.5 tval[ic]<-sum((psihat[ic,]==0))/nboot if(BA){ test[ic]<-sum((psihat[ic,]>0))/nboot+tval[ic]-.1*bias[ic] if(test[ic]<0)test[ic]<-0 } if(!BA)test[ic]<-sum((psihat[ic,]>0))/nboot+tval[ic] test[ic]<-min(test[ic],1-test[ic]) } test<-2*test ncon<-ncol(con) if(alpha==.05){ dvec<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) dvecba<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) dvecba<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(hoch)dvec<-alpha/(2* c(1:ncon)) dvec<-2*dvec if(alpha != .05 && alpha != .01){ dvec<-alpha/c(1:ncon) dvecba<-dvec dvec[1]<-alpha/2 } if(plotit && ncol(bvec)==2){ z<-c(0,0) one<-c(1,1) plot(rbind(bvec,z,one),xlab=xlab,ylab=ylab,type="n") points(bvec) totv<-apply(x,2,est,...) cmat<-var(bvec) dis<-mahalanobis(bvec,totv,cmat) temp.dis<-order(dis) ic<-round((1-alpha)*nboot) xx<-bvec[temp.dis[1:ic],] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) abline(0,1) } temp2<-order(0-test) ncon<-ncol(con) zvec<-dvec[1:ncon] if(BA)zvec<-dvecba[1:ncon] sigvec<-(test[temp2]>=zvec) output<-matrix(0,connum,6) dimnames(output)<-list(NULL,c("con.num","psihat","p-value","p.crit", "ci.lower","ci.upper")) tmeans<-apply(mat,2,est,...) psi<-1 for (ic in 1:ncol(con)){ output[ic,2]<-sum(con[,ic]*tmeans) output[ic,1]<-ic output[ic,3]<-test[ic] output[temp2,4]<-zvec temp<-sort(psihat[ic,]) icl<-round(output[ic,4]*nboot/2)+1 icu<-nboot-(icl-1) output[ic,5]<-temp[icl] output[ic,6]<-temp[icu] } } num.sig<-sum(output[,3]<=output[,4]) list(output=output,con=con,num.sig=num.sig) } lband<-function(x,y=NA,alpha=.05,plotit=T,sm=T,op=1){ # # Compute a confidence band for the shift function. # Assuming two dependent groups are being compared # # See Lombard (2005, Technometrics, 47, 364-369) # # if y=NA, assume x is a matrix with two columns # # If plotit=T, a plot of the shift function is created, assuming that # the graphics window has already been activated. # # sm=T, plot of shift function is smoothed using: # expected frequency curve if op!=1 # otherwise use S+ function lowess is used. # # This function removes all missing observations. # # When plotting, the median of x is marked with a + and the two # quartiles are marked with o. 
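# A minimal usage sketch, assuming paired (dependent) data and that the
# support functions nelderv2() and lband.fun2() below have been sourced:
#   before<-rnorm(40); after<-before+rnorm(40,mean=.3)
#   lband(cbind(before,after),plotit=F)$m   # band for the shift function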
# if(!is.na(y[1]))x<-cbind(x,y) if(ncol(x)!=2)stop("Should have two groups only") m<-elimna(x) y<-m[,2] x<-m[,1] n<-length(x) crit<-nelderv2(m,1,lband.fun2,alpha=alpha) plotit<-as.logical(plotit) xsort<-sort(x) ysort<-sort(y) l<-0 u<-0 ysort[0]<-NA ysort[n+1]<-NA lsub<-c(1:n)-floor(sqrt(2*n)*crit) usub<-c(1:n)+floor(sqrt(2*n)*crit) for(ivec in 1:n){ isub<-max(0,lsub[ivec]) l[ivec]<-NA if(isub>0)l[ivec]<-ysort[isub]-xsort[ivec] isub<-min(n+1,usub[ivec]) u[ivec]<-NA if(isub <= n)u[ivec]<-ysort[isub]-xsort[ivec] } num<-length(l[l>0 & !is.na(l)])+length(u[u<0 & !is.na(u)]) qhat<-c(1:n)/n m<-cbind(qhat,l,u) dimnames(m)<-list(NULL,c("qhat","lower","upper")) if(plotit){ xsort<-sort(x) ysort<-sort(y) del<-0 for (i in 1:n)del[i]<-ysort[i]-xsort[i] xaxis<-c(xsort,xsort) yaxis<-c(m[,1],m[,2]) allx<-c(xsort,xsort,xsort) ally<-c(del,m[,2],m[,3]) temp2<-m[,2] temp2<-temp2[!is.na(temp2)] plot(allx,ally,type="n",ylab="delta",xlab="x (first group)") ik<-rep(F,length(xsort)) if(sm){ if(op==1){ ik<-duplicated(xsort) del<-lowess(xsort,del)$y } if(op!=1)del<-runmean(xsort,del,pyhat=T) } lines(xsort[!ik],del) lines(xsort,m[,2],lty=2) lines(xsort,m[,3],lty=2) temp<-summary(x) text(temp[3],min(temp2),"+") text(temp[2],min(temp2),"o") text(temp[5],min(temp2),"o") } list(m=m,crit=crit,numsig=num) } lband.fun<-function(x,y,crit){ # # function used to determine probability of type I error given crit # pi<-gamma(.5)^2 xr<-rank(x) yr<-rank(y) temp<-apply(cbind(xr,yr),1,max) n<-length(x) fj<-NA for(i in 1:n)fj[i]<-sum(temp==i) v1<-NA for(j in 1:n)v1[j]<-(j-sum(fj[1:j]))/n psi<-rep(0,n) for(j in 1:n){ if(v1[j]>0)psi[j]<-crit*exp(0-crit^2/(2*v1[j]))/sqrt(2*pi*v1[j]^3) } res<-mean(fj*psi) res } lband.fun2<-function(m,crit,alpha=.05){ x<-m[,1] y<-m[,2] val<-abs(alpha-lband.fun(x,y,crit)) val } qdec<-function(x){ # # compute deciles using single order statistics # (function deciles uses Harrell-Davis estimator) # vals<-NA for(i in 1:9){ vals[i]<-qest(x,i/10) } vals } m2way<-function(J,K,x,est=hd,alpha=.05,nboot=600,SEED=T,grp=NA,pr=T,...){ # # Two-way ANOVA based on by forming averages # # By default # est=hd meaning that medians are used with the Harrell-Davis estimator. # # The data are assumed to be stored in x in list mode or in a matrix. # If grp is unspecified, it is assumed x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second factor: level 1,2 # x[[j+1]] is the data for level 2,1, etc. # If the data are in wrong order, grp can be used to rearrange the # groups. For example, for a two by two design, grp<-c(2,4,3,1) # indicates that the second group corresponds to level 1,1; # group 4 corresponds to level 1,2; group 3 is level 2,1; # and group 1 is level 2,2. # # Missing values are automatically removed. # JK<-J*K xcen<-list() if(is.matrix(x)) x <- listm(x) if(!is.list(x)) stop("Data must be stored in list mode or a matrix.") if(!is.na(grp[1])) { yy <- x for(j in 1:length(grp)) x[[j]] <- yy[[grp[j]]] } for(j in 1:JK){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. x[[j]]<-temp } xx<-list() mloc<-NA for(i in 1:JK){ xx[[i]]<-x[[i]] mloc[i]<-est(xx[[i]],...) 
xcen[[i]]<-xx[[i]]-mloc[i] } x<-xx mat<-matrix(mloc,nrow=J,ncol=K,byrow=T) leva<-apply(mat,1,mean) # J averages over columns levb<-apply(mat,2,mean) gm<-mean(levb) testa<-sum((leva-mean(leva))^2) testb<-sum((levb-mean(levb))^2) testab<-NA tempab<-matrix(NA,nrow=J,ncol=K) for(j in 1:J){ for(k in 1:K){ tempab[j,k]<-mat[j,k]-leva[j]-levb[k]+gm }} testab<-sum(tempab^2) bvec<-matrix(NA,nrow=JK,ncol=nboot) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. if(pr)print("Taking bootstrap samples. Please wait.") for(j in 1:JK){ if(pr)print(paste("Working on group ",j)) data<-matrix(sample(xcen[[j]],size=length(xcen[[j]])*nboot,replace=T), nrow=nboot) bvec[j,]<-apply(data,1,est,...) # JK by nboot matrix, jth row contains # bootstrapped estimates for jth group } boota<-NA bootb<-NA bootab<-NA for(i in 1:nboot){ mat<-matrix(bvec[,i],nrow=J,ncol=K,byrow=T) leva<-apply(mat,1,mean) # J averages over columns levb<-apply(mat,2,mean) gm<-mean(mat) boota[i]<-sum((leva-mean(leva))^2) bootb[i]<-sum((levb-mean(levb))^2) for(j in 1:J){ for(k in 1:K){ tempab[j,k]<-mat[j,k]-leva[j]-levb[k]+gm }} bootab[i]<-sum(tempab^2)} pvala<-1-sum(testa>=boota)/nboot pvalb<-1-sum(testb>=bootb)/nboot pvalab<-1-sum(testab>=bootab)/nboot list(p.value.A=pvala,p.value.B=pvalb,p.value.AB=pvalab, test.A=testa,test.B=testb, test.AB=testab,est.loc=matrix(mloc,nrow=J,ncol=K,byrow=T)) } b1way<-function(x,est=mest,nboot=599,SEED=T,...){ # # Test the hypothesis that J measures of location are equal # using the percentile bootstrap method. # By default, M-estimators are compared using 599 bootstrap samples. # # The data are assumed to be stored in x in list mode. Thus, # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J, say. # # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in list mode or a matrix.") J<-length(x) for(j in 1:J)x[[j]]=elimna(x[[j]]) nval<-vector("numeric",length(x)) gest<-vector("numeric",length(x)) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. bvec<-matrix(0,J,nboot) print("Taking bootstrap samples. Please wait.") for(j in 1:J){ print(paste("Working on group ",j)) nval[j]<-length(x[[j]]) gest[j]<-est(x[[j]]) xcen<-x[[j]]-est(x[[j]],...) data<-matrix(sample(xcen,size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,est,...) # A J by nboot matrix # containing the bootstrap values of est. } teststat<-wsumsq(gest,nval) testb<-apply(bvec,2,wsumsq,nval) p.value<-1 - sum(teststat >= testb)/nboot teststat<-wsumsq(gest,nval) if(teststat == 0)p.value <- 1 list(teststat=teststat,p.value=p.value) } selbybw<-function(m,grpc,coln){ # # For a between by within design, # a commmon situation is to have data stored in an n by p matrix where # a column is a group identification number # and the remaining columns are the within group results. # # m is a matrix containing the data. One column contains group # identification values # and two or more other columns contain repeated measures. # # This function groups all values in the columns # indicated by coln according to the # group numbers in column grpc and stores the results in list mode. # # So if grpc has J values, and coln indicates K columns, # this function returns the data stored in list mode have length JK # # Example: y<-selbybw(blob,3,c(4,6,7))$x # will look for group numbers in col 3 of the matrix blob, # and it assumes within # group data are stored in col 4, 6 and 7. 
# Result: the data will now be stored in y having list mode # m<-elimna(m) if(!is.matrix(m))stop("Data must be stored in a matrix") if(is.na(grpc[1]))stop("The argument grpc is not specified") if(is.na(coln[1]))stop("The argument coln is not specified") if(length(grpc)!=1)stop("The argument grpc must have length 1") x<-list() grpn<-unique(m[,grpc]) J<-length(grpn) K<-length(coln) JK<-J*K it<-0 for (ig in 1:length(grpn)){ for (ic in 1:length(coln)){ it<-it+1 flag<-(m[,grpc]==grpn[ig]) x[[it]]<-m[flag,coln[ic]] }} list(x=x,grpn=grpn) } lintest<-function(x,y,regfun=tsreg,nboot=500,alpha=.05,xout=F,SEED=T, outfun=out,...){ # # Test the hypothesis that the regression surface is a plane. # Stute et al. (1998, JASA, 93, 141-149). # if(SEED)set.seed(2) x<-as.matrix(x) d<-ncol(x) temp<-elimna(cbind(x,y)) x<-temp[,1:d] x<-as.matrix(x) y<-temp[,d+1] if(xout){ flag<-outfun(x)$keep x<-x[flag,] x<-as.matrix(x) y<-y[flag] } mflag<-matrix(NA,nrow=length(y),ncol=length(y)) for (j in 1:length(y)){ for (k in 1:length(y)){ mflag[j,k]<-(sum(x[j,]<=x[k,])==ncol(x)) } } reg<-regfun(x,y,...) yhat<-y-reg$residuals print("Taking bootstrap sample, please wait.") data<-matrix(runif(length(y)*nboot),nrow=nboot) data<-sqrt(12)*(data-.5) # standardize the random numbers. rvalb<-apply(data,1,lintests1,yhat,reg$residuals,mflag,x,regfun,...) # An n x nboot matrix of R values rvalb<-rvalb/sqrt(length(y)) dstatb<-apply(abs(rvalb),2,max) wstatb<-apply(rvalb^2,2,mean) #dstatb<-sort(dstatb) #wstatb<-sort(wstatb) # compute test statistic v<-c(rep(1,length(y))) rval<-lintests1(v,yhat,reg$residuals,mflag,x,regfun,...) rval<-rval/sqrt(length(y)) dstat<-max(abs(rval)) wstat<-mean(rval^2) ib<-round(nboot*(1-alpha)) p.value.d<-1-sum(dstat>=dstatb)/nboot p.value.w<-1-sum(wstat>=wstatb)/nboot #critw<-wstatb[ib] list(dstat=dstat,wstat=wstat,p.value.d=p.value.d,p.value.w=p.value.w) } t1waybt<-function(x,tr=.2,grp=NA,nboot=599,SEED=T){ # # Test the hypothesis of equal trimmed, corresponding to J independent # groups, using a percentile t bootstrap method. # # The data are assumed to be stored in x in list mode # or in a matrix. In the first case # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J. # If stored in a matrix, columns correspond to groups. # # grp is used to specify some subset of the groups, if desired. # By default, all J groups are used. # # The default number of bootstrap samples is nboot=599 # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in a matrix or in list mode.") J<-length(x) if(is.na(grp[1]))grp<-c(1:J) for(j in 1:J){ temp<-x[[j]] x[[j]]<-temp[!is.na(temp)] # Remove any missing values. } bvec<-array(0,c(J,2,nboot)) hval<-vector("numeric",J) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") for(j in 1:J){ hval[j]<-length(x[[grp[j]]])-2*floor(tr*length(x[[grp[j]]])) # hval is the number of observations in the jth group after trimming. print(paste("Working on group ",grp[j])) xcen<-x[[grp[j]]]-mean(x[[grp[j]]],tr) data<-matrix(sample(xcen,size=length(x[[grp[j]]])*nboot,replace=T),nrow=nboot) bvec[j,,]<-apply(data,1,trimparts,tr) # A 2 by nboot matrix. The first row # contains the bootstrap trimmed means, the second row # contains the bootstrap squared standard errors. 
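# After this loop, the Welch-type test statistic is recomputed from the
# bootstrap trimmed means and squared standard errors for every bootstrap
# sample (testb below); the reported p-value is the proportion of bootstrap
# statistics at least as large as the statistic for the original data.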
}
m1<-bvec[,1,]  # J by nboot matrix containing the bootstrap trimmed means
m2<-bvec[,2,]  # J by nboot matrix containing the bootstrap sq standard errors
wvec<-1/m2  # J by nboot matrix of w values
uval<-apply(wvec,2,sum)  # Vector having length nboot
blob<-wvec*m1
xtil<-apply(blob,2,sum)/uval  # nboot vector of xtil values
blob1<-matrix(0,J,nboot)
for (j in 1:J)blob1[j,]<-wvec[j,]*(m1[j,]-xtil)^2
avec<-apply(blob1,2,sum)/(length(x)-1)
blob2<-(1-wvec/uval)^2/(hval-1)
cvec<-apply(blob2,2,sum)
cvec<-2*(length(x)-2)*cvec/(length(x)^2-1)
testb<-avec/(cvec+1)  # A vector of length nboot containing bootstrap test values
ct<-sum(is.na(testb))
if(ct>0)print("Some bootstrap estimates of the test statistic could not be computed")
test<-t1way(x,tr=tr,grp=grp)
pval<-sum(test$TEST<=testb)/nboot
list(test=test$TEST,p.value=pval)
}
tauloc<-function(x,cval=4.5){
#
# Compute the tau measure of location as described in
# Yohai and Zamar (JASA, 1988, 83, 406-413).
#
x<-elimna(x)
s<-qnorm(.75)*mad(x)
y<-(x-median(x))/s
W<-(1-(y/cval)^2)^2
flag<-(abs(y)>cval)  # observations more than cval standardized units from the median get zero weight
W[flag]<-0
val<-sum(W*x)/sum(W)
val
}
tauvar<-function(x,cval=3){
#
# Compute the tau measure of scale as described in
# Yohai and Zamar (JASA, 1988, 83, 406-413).
# The computational method is described in Maronna and Zamar
# (Technometrics, 2002, 44, 307-317)
# see p. 310
#
x<-elimna(x)
s<-qnorm(.75)*mad(x)
y<-(x-tauloc(x))/s
cvec<-rep(cval,length(x))
W<-apply(cbind(y^2,cvec^2),1,FUN="min")
val<-s^2*sum(W)/length(x)
val
}
gkcor<-function(x,y,varfun=tauvar,ccov=F,...){
#
# Compute a correlation coefficient using the Gnanadesikan-Kettenring
# estimator.
# ccov=T, computes covariance instead.
# (cf. Maronna & Zamar, 2002, Technometrics)
#
val<-.25*(varfun(x+y,...)-varfun(x-y,...))
if(!ccov)val<-val/(sqrt(varfun(x,...))*sqrt(varfun(y,...)))
val
}
covroc<-function(x){
#
# compute Rocke's TBS covariance matrix
#
library(robust,first=T)
temp<-covRob(x,estim="M")
val<-temp[2]$cov
val
}
indt<-function(x,y,tr=.0,nboot=500,flag=1,SEED=T){
#
# Test the hypothesis of independence between x and y by
# testing the hypothesis that the regression surface is a horizontal plane.
# Stute et al. (1998, JASA, 93, 141-149).
#
# flag=1 gives Kolmogorov-Smirnov test statistic
# flag=2 gives the Cramer-von Mises test statistic
# flag=3 causes both test statistics to be reported.
#
# tr=0 results in the Cramer-von Mises test statistic when flag=2
# With tr>0, a trimmed version of the test statistic is used.
#
# Modified Dec 2005.
#
if(tr<0)stop("Amount trimmed must be >= 0")
if(tr>.5)stop("Amount trimmed must be <=.5")
if(SEED)set.seed(2)
x<-as.matrix(x)
# First, eliminate any rows of data with missing values.
temp <- cbind(x, y)
temp <- elimna(temp)
pval<-ncol(temp)-1
x <- temp[,1:pval]
y <- temp[, pval+1]
x<-as.matrix(x)
mflag<-matrix(NA,nrow=length(y),ncol=length(y))
for (j in 1:length(y)){
for (k in 1:length(y)){
mflag[j,k]<-(sum(x[j,]<=x[k,])==ncol(x))
}
}
# ith row of mflag indicates which rows of the matrix x are less
# than or equal to ith row of x
#
yhat<-mean(y)
res<-y-yhat
print("Taking bootstrap sample, please wait.")
data<-matrix(runif(length(y)*nboot),nrow=nboot)
#data<-(data-.5)*sqrt(12) # standardize the random numbers.
rvalb<-apply(data,1,regts1,yhat,res,mflag,x,tr) # An n x nboot matrix of R values rvalb<-rvalb/sqrt(length(y)) dstatb<-apply(abs(rvalb),2,max) wstatb<-apply(rvalb^2,2,mean,tr=tr) v<-c(rep(1,length(y))) rval<-regts1(v,yhat,res,mflag,x,tr=0) rval<-rval/sqrt(length(y)) dstat<-NA wstat<-NA critd<-NA critw<-NA p.vald<-NA p.valw<-NA if(flag==1 || flag==3){ dstat<-max(abs(rval)) p.vald<-1-sum(dstat>=dstatb)/nboot } if(flag==2 || flag==3){ wstat<-mean(rval^2,tr=tr) p.valw<-1-sum(wstat>=wstatb)/nboot } list(dstat=dstat,wstat=wstat,p.value.d=p.vald,p.value.w=p.valw) } taulc<-function(x,mu.too=F){ # val<-tauvar(x) if(mu.too){ val[2]<-val val[1]<-tauloc(x) } val } trimww.sub<-function(cmat,vmean,vsqse,h,J,K){ # # This function is used by trimww # # The function performs a variation of Johansen's test of C mu = 0 for # a within by within design # C is a k by p matrix of rank k and mu is a p by 1 matrix of # of unknown medians. # The argument cmat contains the matrix C. # vmean is a vector of length p containing the p medians # vsqe is matrix containing the # estimated covariances among the medians # h is the sample size # p<-J*K yvec<-matrix(vmean,length(vmean),1) test<-cmat%*%vsqse%*%t(cmat) invc<-solve(test) test<-t(yvec)%*%t(cmat)%*%invc%*%cmat%*%yvec temp<-0 mtem<-vsqse%*%t(cmat)%*%invc%*%cmat temp<-sum(diag(mtem%*%mtem))+(sum(diag(mtem)))^2/(h-1) A<-.5*sum(temp) cval<-nrow(cmat)+2*A-6*A/(nrow(cmat)+2) test<-test/cval test } trimww<-function(J,K,x,grp=c(1:p),p=J*K,tr=.2){ # # Perform a J by K anova using trimmed means with # repeated measures on both factors. # # tr=.2 is default trimming # # The s-plus variable data is assumed to contain the raw # data stored in list mode. data[[1]] contains the data # for the first level of both factors: level 1,1. # data[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # data[[K]] is the data for level 1,K # data[[K+1]] is the data for level 2,1, data[2K] is level 2,K, etc. 
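# For example, with J=2 and K=3 the six groups are assumed to be ordered
# 1,1  1,2  1,3  2,1  2,2  2,3 in x (or in the columns of a matrix).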
# # It is assumed that data has length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # if(is.list(x))x<-elimna(matl(x)) if(is.matrix(x))x<-elimna(x) data<-x if(is.matrix(data))data<-listm(data) if(!is.list(data))stop("Data are not stored in list mode or a matrix") if(p!=length(data)){ print("The total number of groups, based on the specified levels, is") print(p) print("The number of groups stored in x is") print(length(data)) print("Warning: These two values are not equal") } if(p!=length(grp))stop("Apparently a subset of the groups was specified that does not match the total number of groups indicated by the values for J and K.") tmeans<-0 h<-length(data[[grp[1]]]) v<-matrix(0,p,p) for (i in 1:p)tmeans[i]<-mean(data[[grp[i]]],tr=tr,na.rm=T) v<-covmtrim(data,tr=tr) ij<-matrix(c(rep(1,J)),1,J) ik<-matrix(c(rep(1,K)),1,K) jm1<-J-1 cj<-diag(1,jm1,J) for (i in 1:jm1)cj[i,i+1]<-0-1 km1<-K-1 ck<-diag(1,km1,K) for (i in 1:km1)ck[i,i+1]<-0-1 # Do test for factor A cmat<-kron(cj,ik) # Contrast matrix for factor A #Qa<-johansp(cmat,tmeans,v,h,J,K) Qa<-trimww.sub(cmat,tmeans,v,h,J,K) #Qa.siglevel<-1-pf(Qa$teststat,J-1,999) Qa.siglevel<-1-pf(Qa,J-1,999) # Do test for factor B cmat<-kron(ij,ck) # Contrast matrix for factor B #Qb<-johansp(cmat,tmeans,v,h,J,K) Qb<-trimww.sub(cmat,tmeans,v,h,J,K) Qb.siglevel<-1-pf(Qb,K-1,999) # Do test for factor A by B interaction cmat<-kron(cj,ck) # Contrast matrix for factor A by B #Qab<-johansp(cmat,tmeans,v,h,J,K) Qab<-trimww.sub(cmat,tmeans,v,h,J,K) Qab.siglevel<-1-pf(Qab,(J-1)*(K-1),999) list(Qa=Qa,Qa.siglevel=Qa.siglevel, Qb=Qb,Qb.siglevel=Qb.siglevel, Qab=Qab,Qab.siglevel=Qab.siglevel) } msmedci<-function(x,alpha=.05,nullval=0){ # # Confidence interval for the median # se<-msmedse(x) est<-median(x) ci.low<-est-qnorm(1-alpha/2)*se ci.hi<-est+qnorm(1-alpha/2)*se test<-(est-nullval)/se p.value<-2*(1-pnorm(abs(test))) list(test=test,ci.low=ci.low,ci.hi=ci.hi,p.value=p.value) } medcipb<-function(x,alpha=.05,null.val=NA,nboot=500,SEED=T,...){ # # Bootstrap confidence interval for the median of single variable. # # Missing values are allowed. # x<-elimna(x) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. #print("Taking bootstrap samples. Please wait.") data<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,median) # Bootstrapped values test<-NULL if(!is.na(null.val)){ tv<-sum(bvec==null.val)/nboot test<-sum(bvec>null.val)/nboot+.5*tv if(test> .5)test<-1-test test<-2*test } bvec<-sort(bvec) icl<-round(alpha*nboot/2)+1 icu<-nboot-icl-1 cilow<-bvec[icl] ciup<-bvec[icu] list(ci.low=cilow,ci.up=ciup,p.value=test) } regtest<-function(x,y,regfun=tsreg,nboot=600,alpha=.05,plotit=T, grp=c(1:ncol(x)),nullvec=c(rep(0,length(grp))),xout=F,outfun=out,SEED=T){ # # Test the hypothesis that q of the p predictors are equal to # some specified constants. By default, the hypothesis is that all # p predictors have a coefficient equal to zero. # The method is based on a confidence ellipsoid. # The critical value is determined with the percentile bootstrap method # in conjunction with Mahalanobis distance. 
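# A hedged usage sketch (hypothetical data; the support functions regboot()
# and tsreg() from this collection are assumed to be sourced):
#   x<-matrix(rnorm(100),ncol=2); y<-x[,1]+rnorm(50)
#   regtest(x,y)                  # H0: both slopes are zero
#   regtest(x,y,grp=2,nullvec=0)  # H0: the second slope is zero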
# x<-as.matrix(x) p1<-ncol(x)+1 p<-ncol(x) xy<-cbind(x,y) xy<-elimna(xy) x<-xy[,1:p] y<-xy[,p1] if(xout){ m<-cbind(x,y) flag<-outfun(x,plotit=F)$keep m<-m[flag,] x<-m[,1:p] y<-m[,p1] } x<-as.matrix(x) if(length(grp)!=length(nullvec))stop("The arguments grp and nullvec must have the same length.") if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,regboot,x,y,regfun) # A p+1 by nboot matrix. The first row # contains the bootstrap intercepts, the second row # contains the bootstrap values for first predictor, etc. grp<-grp+1 est<-regfun(x,y)$coef estsub<-est[grp] bsub<-t(bvec[grp,]) if(length(grp)==1){ m1<-sum((bvec[grp,]-est)^2)/(length(y)-1) dis<-(bsub-estsub)^2/m1 } if(length(grp)>1){ mvec<-apply(bsub,2,FUN=mean) m1<-var(t(t(bsub)-mvec+estsub)) dis<-mahalanobis(bsub,estsub,m1) } dis2<-order(dis) dis<-sort(dis) critn<-floor((1-alpha)*nboot) crit<-dis[critn] test<-mahalanobis(t(estsub),nullvec,m1) sig.level<-1-sum(test>dis)/nboot if(length(grp)==2 && plotit){ plot(bsub,xlab="Parameter 1",ylab="Parameter 2") points(nullvec[1],nullvec[2],pch=0) xx<-bsub[dis2[1:critn],] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) } list(test=test,crit=crit,p.value=sig.level,nullvec=nullvec,est=estsub) } reg2ci<-function(x,y,x1,y1,regfun=tsreg,nboot=599,alpha=.05,plotit=T,SEED=T){ # # Compute a .95 confidence interval for the difference between the # the intercepts and slopes corresponding to two independent groups. # The default regression method is Theil-Sen. # # The predictor values for the first group are # assumed to be in the n by p matrix x. # The predictors for the second group are in x1 # # The default number of bootstrap samples is nboot=599 # # regfun can be any s-plus function that returns the coefficients in # the vector regfun$coef, the first element of which contains the # estimated intercept, the second element contains the estimate of # the first predictor, etc. # x<-as.matrix(x) x1<-as.matrix(x1) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,regboot,x,y,regfun) # A p+1 by nboot matrix. The first row # contains the bootstrap intercepts, the second row # contains the bootstrap values for first predictor, etc. 
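# The same resampling scheme is applied to the second group next; the two
# sets of bootstrap coefficients are then differenced, so each confidence
# interval reported below is for the difference (group 1 minus group 2)
# between the corresponding intercepts or slopes.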
data<-matrix(sample(length(y1),size=length(y1)*nboot,replace=T),nrow=nboot) bvec1<-apply(data,1,regboot,x1,y1,regfun) bvec<-bvec-bvec1 p1<-ncol(x)+1 regci<-matrix(0,p1,4) dimnames(regci)<-list(NULL, c("Parameter","ci.lower","ci.upper","sig.level")) ilow<-round((alpha/2)*nboot)+1 ihi<-nboot-(ilow-1) for(i in 1:p1){ temp<-sum(bvec[i,]<0)/nboot+sum(bvec[i,]==0)/(2*nboot) regci[i,4]<-2*min(temp,1-temp) bsort<-sort(bvec[i,]) regci[i,2]<-bsort[ilow] regci[i,3]<-bsort[ihi] regci[,1]<-c(0:ncol(x)) } if(ncol(x)==1 && plotit){ plot(c(x,x1),c(y,y1),type="n",xlab="X",ylab="Y") points(x,y) points(x1,y1,pch="+") abline(regfun(x,y)$coef) abline(regfun(x1,y1)$coef,lty=2) } regci } cidv2<-function(x,y,alpha=.05,plotit=F,pop=1,fr=.8,rval=15){ # # p-value for Cliff's analog of WMW test # nullval<-0 ci<-cid(x,y,alpha=alpha,plotit=plotit,pop=pop,fr=fr,rval=rval) alph<-c(1:99)/100 for(i in 1:99){ irem<-i chkit<-cid(x,y,alpha=alph[i],plotit=F) if(chkit[[1]]>nullval || chkit[[2]]nullval || chkit[[2]]nullval || chkit[[2]]0 & !is.na(l)])+length(u[u<0 & !is.na(u)]) qhat<-c(1:length(x))/length(x) m<-matrix(c(qhat,l,u),length(x),3) dimnames(m)<-list(NULL,c("qhat","lower","upper")) if(plotit){ xsort<-sort(x) ysort<-sort(y) del<-0 for (i in 1:length(x)){ ival<-round(length(y)*i/length(x)) if(ival<=0)ival<-1 if(ival>length(y))ival<-length(y) #del[i]<-ysort[round(length(y)*i/length(x))]-xsort[i] del[i]<-ysort[ival]-xsort[i] } xaxis<-c(xsort,xsort) yaxis<-c(m[,1],m[,2]) allx<-c(xsort,xsort,xsort) ally<-c(del,m[,2],m[,3]) temp2<-m[,2] temp2<-temp2[!is.na(temp2)] plot(allx,ally,type="n",ylab=ylab,xlab=xlab) ik<-rep(F,length(xsort)) if(sm){ if(op==1){ ik<-duplicated(xsort) del<-lowess(xsort,del)$y } if(op!=1)del<-runmean(xsort,del,pyhat=T) } lines(xsort[!ik],del[!ik]) lines(xsort,m[,2],lty=2) lines(xsort,m[,3],lty=2) temp<-summary(x) text(temp[3],min(temp2),"+") text(temp[2],min(temp2),"o") text(temp[5],min(temp2),"o") } list(m=m,crit=crit,numsig=num,pc=pc) } med2mcp<-function(J,K,x,alpha=.05,nboot=NA,grp=NA,est=median,bhop=F,SEED=T, ...){ # # Multiple comparisons for J by K designs using percentile # bootstrap and medians (independent groups). # # A percentile bootstrap method with Rom's method is used. # # The data are assumed to be stored as done in the function t2way # # est is the measure of location and defaults to the median # ... can be used to set optional arguments associated with est # # The argument grp can be used to analyze a subset of the groups # Example: grp=c(1,3,5) would compare groups 1, 3 and 5. # # Missing values are allowed. # chk<-con2way(J,K) test1<-medpb(x,alpha=alpha,nboot=nboot,grp=grp,est=est,con=chk$conA,bhop=F,SEED=T,...) test2<-medpb(x,alpha=alpha,nboot=nboot,grp=grp,est=est,con=chk$conB,bhop=F,SEED=T,...) test3<-medpb(x,alpha=alpha,nboot=nboot,grp=grp,est=est,con=chk$conAB,bhop=F,SEED=T,...) list(Factor.A=test1,Factor.B=test2,Factor.AB=test3) } anova1<-function(x){ # # conventional one-way anova # if(is.matrix(x))x<-listm(x) A<-0 B<-0 C<-0 N<-0 for(j in 1:length(x)){ N<-N+length(x[[j]]) A<-A+sum(x[[j]]^2) B<-B+sum(x[[j]]) C<-C+(sum(x[[j]]))^2/length(x[[j]]) } SST<-A-B^2/N SSBG<-C-B^2/N SSWG<-A-C nu1<-length(x)-1 nu2<-N-length(x) MSBG<-SSBG/nu1 MSWG<-SSWG/nu2 FVAL<-MSBG/MSWG pvalue<-1-pf(FVAL,nu1,nu2) list(F.test=FVAL,p.value=pvalue,df1=nu1,df2=nu2,MSBG=MSBG,MSWG=MSWG) } twodcor8<-function(x,y){ # # Compute a .95 confidence interval for # the difference between two dependent # correlations corresponding to two independent # goups. 
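# The test standardizes the two predictors, forms the difference Z1-Z2, and
# applies cor.test() to Z1-Z2 and y; when the two correlations are equal,
# this difference has zero covariance with y.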
# # # x is a matrix with two columns, # y is a vector # Goal: test equality of Pearson correlation for x1, y versus x2, y. # # For general use, twodcor10 is probably better, # which calls this function and estimates an adjusted p-value. # X<-elimna(cbind(x,y)) Z1<-(X[,1]-mean(X[,1]))/sqrt(var(X[,1])) Z2<-(X[,2]-mean(X[,2]))/sqrt(var(X[,2])) temp<-cor.test(Z1-Z2,X[,3]) temp<-temp[3]$p.value list(p.value=temp) } twodcor10<-function(x,y,nboot=500,SEED=T,alpha=.05){ # # Compute a .95 confidence interval for # the difference between two dependent # correlations corresponding to two independent # goups. # # x is a matrix with two columns, # y is a vector # Goal: test equality of Pearson correlation for x1, y versus x2, y. # # This function uses an adjusted p-value, the adjustment # being made assuming normality. # # nboot indicates how many samples from a normal distribution # are used to approximate the adjustment. # # Simulations suggest that this fucntion # continues to work well under non-normality. # if(SEED)set.seed(2) X<-elimna(cbind(x,y)) if(ncol(X)!=3)stop("x should be a matrix with two columns") n<-nrow(X) cval<-cor(X) nval<-(cval[1,3]+cval[2,3])/2 cmat<-bdiag(1,3,nval) cmat[1,2]<-nval cmat[2,1]<-nval pval<-NA for(i in 1:nboot){ d<-rmul(n,p=3,cmat=cmat) pval[i]<-twodcor8(d[,1:2],d[,3])$p.value } pval<-sort(pval) iv<-round(alpha*nboot) est.p<-pval[iv] adp<-alpha/est.p test<-twodcor8(X[,1:2],X[,3])$p.value p.value<-test*adp if(p.value>1)p.value<-1 list(p.value=p.value) } matsplit<-function(m,coln=NULL){ # # Column coln of matrix m is assumed to have a binary variable # This functions removes rows with missing values # and then splits m into two matrices based on the values # in column coln # m<-elimna(m) if(is.null(coln))stop("specify coln") x<-m[,coln] val<-unique(x) if(length(val)>2)stop("More than two values detected in specified column") flag<-(x==val[1]) m1<-m[flag,] m2<-m[!flag,] list(m1=m1,m2=m2) } tkmcp<-function(x,alpha=.05,ind.pval=T){ # # conventional Tukey-Kramer multiple comparison procedure # for all pairiwise comparisons. # # ind.pval=T, computes p-value for each individual test # ind.pval=F computes p-value based on controlling the # familywise error rate. (The alpha level at which the # Tukey-Kramer test would reject.) 
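# A minimal usage sketch (hypothetical data):
#   g1<-rnorm(15); g2<-rnorm(15,.5); g3<-rnorm(15,1)
#   tkmcp(list(g1,g2,g3))              # p-value for each individual test
#   tkmcp(list(g1,g2,g3),ind.pval=F)   # familywise p-values via ptukey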
# if(is.matrix(x))x<-listm(x) J<-length(x) A<-0 B<-0 C<-0 N<-0 for(j in 1:J){ N<-N+length(x[[j]]) A<-A+sum(x[[j]]^2) B<-B+sum(x[[j]]) C<-C+(sum(x[[j]]))^2/length(x[[j]]) } SST<-A-B^2/N SSBG<-C-B^2/N SSWG<-A-C nu1<-length(x)-1 nu2<-N-length(x) MSBG<-SSBG/nu1 MSWG<-SSWG/nu2 numcom<-length(x)*(length(x)-1)/2 output<-matrix(nrow=numcom,ncol=7) dimnames(output)<-list(NULL,c("Group","Group","t.test","est.difference", "ci.lower","ci.upper","p.value")) ic<-0 for (j in 1:J){ for (k in 1:J){ if (j < k){ ic<-ic+1 output[ic,1]<-j output[ic,2]<-k dif<-mean(x[[j]])-mean(x[[k]]) output[ic,3]<-abs(dif)/sqrt(MSWG*(1/length(x[[j]])+1/length(x[[k]]))/2) output[ic,4]<-dif crit<-qtukey(1-alpha,length(x),nu2) output[ic,5]<-dif-crit*sqrt(MSWG*(1/length(x[[j]])+1/length(x[[k]]))/2) output[ic,6]<-dif+crit*sqrt(MSWG*(1/length(x[[j]])+1/length(x[[k]]))/2) if(!ind.pval)output[ic,7]<-1-ptukey(output[ic,3],length(x),nu2) if(ind.pval)output[ic,7]<-2*(1-pt(output[ic,3],nu2)) }}} output } lstest4<-function(vstar,yhat,res,x){ ystar <- yhat + res * vstar p<-ncol(x) pp<-p+1 vals<-t(as.matrix(lsfit(x,ystar)$coef[2:pp])) sa<-lsfitNci4(x, ystar)$cov[-1, -1] sai<-solve(sa) test<-(vals)%*%sai%*%t(vals) test<-test[1,1] test } lsfitNci4<-function(x,y,alpha=.05){ # # Compute confidence for least squares # regression using heteroscedastic method # recommended by Cribari-Neto (2004).
# x<-as.matrix(x) if(nrow(x) != length(y))stop("Length of y does not match number of x values") m<-cbind(x,y) m<-elimna(m) y<-m[,ncol(x)+1] temp<-lsfit(x,y) x<-cbind(rep(1,nrow(x)),m[,1:ncol(x)]) xtx<-solve(t(x)%*%x) h<-diag(x%*%xtx%*%t(x)) n<-length(h) d<-(n*h)/sum(h) for(i in 1:length(d)){ d[i]<-min(4, d[i]) } hc4<-xtx%*%t(x)%*%diag(temp$res^2/(1-h)^d)%*%x%*%xtx df<-nrow(x)-ncol(x) crit<-qt(1-alpha/2,df) al<-ncol(x) ci<-matrix(NA,nrow=al,ncol=3) for(j in 1:al){ ci[j,1]<-j ci[j,2]<-temp$coef[j]-crit*sqrt(hc4[j,j]) ci[j,3]<-temp$coef[j]+crit*sqrt(hc4[j,j]) } list(ci=ci,stand.errors=sqrt(diag(hc4)), cov=hc4) } hc4qtest<-function(x,y,k,nboot=500,SEED=T){ # # Test the hypothesis that a OLS slope is zero using HC4 wild bootstrap using quasi-t test. # k is the index of coefficient being tested # if(SEED)set.seed(2) x<-as.matrix(x) # First, eliminate any rows of data with missing values. temp <- cbind(x, y) temp <- elimna(temp) pval<-ncol(temp)-1 x <- temp[,1:pval] y <- temp[, pval+1] x<-as.matrix(x) p<-ncol(x) pp<-p+1 temp<-lsfit(x,y) yhat<-mean(y) res<-y-yhat s<-lsfitNci4(x, y)$cov[-1, -1] s<-as.matrix(s) si<-s[k,k] b<-temp$coef[2:pp] qtest<-b[k]/sqrt(si) data<-matrix(runif(length(y)*nboot),nrow=nboot) data<-(data-.5)*sqrt(12) # standardize the random numbers. rvalb<-apply(data,1,lsqtest4,yhat,res,x, k) sum<-sum(abs(rvalb)>= abs(qtest[1])) p.val<-sum/nboot list(p.value=p.val) } lsqtest4<-function(vstar,yhat,res,x, k){ ystar <- yhat + res * vstar p<-ncol(x) pp<-p+1 vals<-lsfit(x,ystar)$coef[2:pp] sa<-lsfitNci4(x, ystar)$cov[-1, -1] sa<-as.matrix(sa) sai<-sa[k,k] test<-vals[k]/sqrt(sai) test } mrm1way<-function(x,q=.5,grp=NA,bop=F,SEED=T,mop=F){ # Perform a within groups one-way ANOVA using medians # # If grp specified, do analysis on only the groups in grp. # Example: grp=(c(1,4)), compare groups 1 and 4 only. # # bop=F, use non-bootstrap estimate of covariance matrix # bop=T, use bootstrap # # mop=T, use usual median, otherwise use single order statistic # if(SEED)set.seed(2) if(is.matrix(x))x<-listm(x) K<-length(x) # Number of groups p<-K if(is.na(grp[1]))grp<-c(1:p) x<-x[grp] if(!is.list(x))stop("Data are not stored in list mode or a matrix") tmeans<-0 n<-length(x[[1]]) v<-matrix(0,p,p) if(!mop){ for (i in 1:p)tmeans[i]<-qest(x[[i]],q=q) if(!bop)v<-covmmed(x,q=q) if(bop)v<-bootcov(x,pr=F,est=qest,q=q) } if(mop){ tmeans[i]<-median(x[[i]]) v<-bootcov(x,pr=F) } km1<-K-1 ck<-diag(1,km1,K) for (i in 1:km1)ck[i,i+1]<-0-1 Qb<-johansp(ck,tmeans,v,n,1,K) #print(Qb) #p.value<-1-pf(Qb$teststat,K-1,999) p.value<-Qb$siglevel if(n>=20)p.value<-1-pf(Qb$teststat,K-1,999) list(test.stat=Qb$teststat,p.value=p.value) } rmul<-function(n,p=2,cmat=diag(rep(1,p)),rho=NA, mar.fun=rnorm,...){ # # generage n observations from a p-variate dist # Be default, use normal distributions. # #To get a g-and-h distribution # for the marginals, use mar.fun=ghdist. # Example rmul(30,p=4,rho=.3,mar.fun=ghdist,g=.5,h=.2) will # generate 30 vectors from a 4-variate distribution where the marginals # have a g-and-h distribution with g=.5 and h=.2. # # This function is similar to ghmul, only here, generate the marginal values # and then transform the data to have correlation matrix cmat # # cmat is the correlation matrix # if argument # rho is specified, the correlations are taken to # have a this common value. # # Method (e.g. Browne, M. W. (1968) A comparison of factor analytic # techniques. Psychometrika, 33, 267-334. # Let U'U=R be the Cholesky decomposition of R. Generate independent data # from some dist yielding X. 
Then XU has population correlation matrix R # if(!is.na(rho)){ if(abs(rho)>1)stop("rho must be between -1 and 1") cmat<-matrix(rho,p,p) diag(cmat)<-1 } np<-n*p x<-matrix(mar.fun(np,...),nrow=n,ncol=p) rmat<-matsqrt(cmat) x<-x%*%rmat x } tsp1reg<-function(x,y,plotit=F){ # # Compute the Theil-Sen regression estimator. # Only a single predictor is allowed in this version # temp<-matrix(c(x,y),ncol=2) temp<-elimna(temp) # Remove any pairs with missing values x<-temp[,1] y<-temp[,2] ord<-order(x) xs<-x[ord] ys<-y[ord] vec1<-outer(ys,ys,"-") vec2<-outer(xs,xs,"-") v1<-vec1[vec2>0] v2<-vec2[vec2>0] slope<-median(v1/v2) coef<-median(y)-slope*median(x) names(coef)<-"Intercept" coef<-c(coef,slope) if(plotit){ plot(x,y,xlab="X",ylab="Y") abline(coef) } res<-y-slope*x-coef[1] list(coef=coef,residuals=res) } L1medcen <- function(X, tol = 1e-08, maxit = 200, m.init = apply(X, 2, median), trace = FALSE) { ## L1MEDIAN calculates the multivariate L1 median ## I/O: mX=L1median(X,tol); ## ## X : the data matrix ## tol: the convergence criterium: ## the iterative process stops when ||m_k - m_{k+1}|| < tol. ## maxit: maximum number of iterations ## init.m: starting value for m; typically coordinatewise median ## ## Ref: Hossjer and Croux (1995) ## "Generalizing Univariate Signed Rank Statistics for Testing ## and Estimating a Multivariate Location Parameter"; ## Non-parametric Statistics, 4, 293-308. ## ## Implemented by Kristel Joossens ## Many thanks to Martin Maechler for improving the program! ## slightly faster version of 'sweep(x, 2, m)': centr <- function(X,m) X - rep(m, each = n) ## computes objective function in m based on X and a: mrobj <- function(X,m) sum(sqrt(rowSums(centr(X,m)^2))) d <- dim(X); n <- d[1]; p <- d[2] m <- m.init if(!is.numeric(m) || length(m) != p) stop("'m.init' must be numeric of length p =", p) k <- 1 if(trace) nstps <- 0 while (k <= maxit) { mold <- m obj.old <- if(k == 1) mrobj(X,mold) else obj X. <- centr(X, m) Xnorms <- sqrt(rowSums(X. ^ 2)) inorms <- order(Xnorms) dx <- Xnorms[inorms] # smallest first, i.e., 0's if there are X <- X [inorms,] X. <- X.[inorms,] ## using 1/x weighting {MM: should this be generalized?} w <- ## (0 norm -> 0 weight) : if (all(dn0 <- dx != 0)) 1/dx else c(rep.int(0, length(dx)- sum(dn0)), 1/dx[dn0]) delta <- colSums(X. * rep(w,p)) / sum(w) nd <- sqrt(sum(delta^2)) maxhalf <- if (nd < tol) 0 else ceiling(log2(nd/tol)) m <- mold + delta # computation of a new estimate ## If step 'delta' is too far, we try halving the stepsize nstep <- 0 while ((obj <- mrobj(X, m)) >= obj.old && nstep <= maxhalf) { nstep <- nstep+1 m <- mold + delta/(2^nstep) } if(trace) { if(trace >= 2) cat(sprintf("k=%3d obj=%19.12g m=(",k,obj), paste(formatC(m),collapse=","), ")", if(nstep) sprintf(" nstep=%2d halvings",nstep) else "", "\n", sep="") nstps[k] <- nstep } if (nstep > maxhalf) { ## step halving failed; keep old m <- mold ## warning("step halving failed in ", maxhalf, " steps") break } k <- k+1 } if (k > maxit) warning("iterations did not converge in ", maxit, " steps") if(trace == 1) cat("needed", k, "iterations with a total of", sum(nstps), "stepsize halvings\n") # return(m) list(center=m) } spatcen<-function(x){ # # compute spatial median # x is an n by p matrix # if(!is.matrix(x))stop("x must be a matrix") x<-elimna(x) START<-apply(x,2,median) val<-nelder(x,ncol(x),spat.sub,START=START) list(center=val) } olswbtest<-function(x,y,nboot=500,SEED=T,RAD=T,alpha=.05){ # # Compute confidence intervals for all OLS slopes # using HC4 wild bootstrap and Wald test. 
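#
# Illustrative example (a sketch; made-up data, with olshc4 and the other
# functions in this collection assumed to have been sourced):
#   set.seed(1)
#   x<-matrix(rnorm(60),ncol=2)
#   y<-x[,1]-x[,2]+rnorm(30)
#   olswbtest(x,y,nboot=500)   # wild bootstrap CIs and p-values for both slopes
#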
# # This function calls the functions # olshc4 and # lstest4 # if(SEED)set.seed(2) x<-as.matrix(x) # First, eliminate any rows of data with missing values. temp <- cbind(x, y) temp <- elimna(temp) pval<-ncol(temp)-1 x <- temp[,1:pval] y <- temp[, pval+1] x<-as.matrix(x) p<-ncol(x) pp<-p+1 temp<-lsfit(x,y) yhat<-mean(y) res<-y-yhat #s<-lsfitNci4(x, y)$cov[-1, -1] s<-olshc4(x, y)$cov[-1, -1] si<-solve(s) b<-temp$coef[2:pp] test=abs(b)*sqrt(diag(si)) #wtest<-t(b)%*%si%*%b print("Taking bootstrap samples. Please wait.") if(RAD)data<-matrix(ifelse(rbinom(length(y)*nboot,1,0.5)==1,-1,1),nrow=nboot) if(!RAD){ data<-matrix(runif(length(y)*nboot),nrow=nboot) data<-(data-.5)*sqrt(12) # standardize the random numbers. } rvalb<-apply(data,1,olswbtest.sub,yhat,res,x) #a p by nboot matrix rvalb=abs(rvalb) ic=round((1-alpha)*nboot) if(p==1)rvalb=t(as.matrix(rvalb)) temp=apply(rvalb,1,sort) # nboot by p matrix pvals=NA for(j in 1:p)pvals[j]=mean((rvalb[j,]>=test[j])) cr=temp[ic,] ci=b-cr/diag(sqrt(si)) #dividing because si is reciprocal of sq se ci=cbind(ci,b+cr/diag(sqrt(si))) ci=cbind(b,ci) ci=cbind(c(1:nrow(ci)),ci,test,pvals) dimnames(ci)<- list(NULL,c("Slope_No.","Slope_est","Lower.ci","Upper.ci","Test.Stat","p-value")) ci } olswbtest.sub<-function(vstar,yhat,res,x){ ystar <- yhat + res * vstar p<-ncol(x) pp<-p+1 vals<-t(as.matrix(lsfit(x,ystar)$coef[2:pp])) sa<-olshc4(x, ystar)$cov[-1, -1] sai<-solve(sa) test<-vals*sqrt(diag(sai)) test } medind<-function(x,y,qval=.5,nboot=1000,com.pval=F,SEED=T,alpha=.05,pr=T, xout=F,outfun=out,chk.table=F,make.table=F,...){ # # Test the hypothesis that the regression surface is a flat # horizontal plane. # The method is based on a modification of a method derived by # He and Zhu 2003, JASA, 98, 1013-1022. # Here, resampling is avoided using approximate critical values if # com.pval=F # # critical values are available for 10<=n<=400, p=1,...,8 and # q=.25,.5, .75. # # To get a p-value, via simulations, set com.pval=T # nboot is number of simulations used to determine the p-value. # # Note: the arguments chk.table and make.table should not be used # as yet. Some bugs need be work out. # if(pr){ if(!com.pval)print("To get a p-value, set com.pval=T") print("Reject if the test statistic exceeds the critical value") } store.it=F x<-as.matrix(x) p<-ncol(x) pp1<-p+1 p.val<-NULL crit.val<-NULL yx<-elimna(cbind(y,x)) #Eliminate missing values. 
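# Illustrative call (a sketch; made-up data): with x<-rnorm(50) and y<-rnorm(50),
# medind(x,y) returns the test statistic and the approximate .05 critical value;
# reject the hypothesis of a flat (horizontal) regression surface if the
# statistic exceeds the critical value.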
y<-yx[,1] x<-yx[,2:pp1] x<-as.matrix(x) if(xout){ flag<-outfun(x,...)$keep x<-x[flag,] y<-y[flag] } n<-length(y) if(qval==.5){ resmat1=matrix(c( 0.0339384580, 0.044080032, 0.050923441, 0.064172557, 0.0153224731, 0.021007108, 0.027687963, 0.032785044, 0.0106482053, 0.014777728, 0.018249546, 0.023638611, 0.0066190573, 0.009078091, 0.011690825, 0.014543009, 0.0031558563, 0.004374515, 0.005519069, 0.007212951, 0.0015448987, 0.002231473, 0.002748314, 0.003725916, 0.0007724197, 0.001021767, 0.001370776, 0.001818037),ncol=4,nrow=7,byrow=T) if(make.table) write(c(10,1,.5,resmat1[1,],20,1,.5,resmat1[2,]),"medind.crit",ncolumns=7) resmat2=matrix(c( 0.052847794, 0.061918744, 0.071346969, 0.079163419, 0.021103277, 0.027198076, 0.031926052, 0.035083610, 0.013720585, 0.018454145, 0.022177381, 0.026051716, 0.008389969, 0.010590374, 0.012169233, 0.015346065, 0.004261627, 0.005514060, 0.007132021, 0.008416836, 0.001894753, 0.002416311, 0.003085230, 0.003924706, 0.001045346, 0.001347837, 0.001579373, 0.001864344),ncol=4,nrow=7,byrow=T) resmat3=matrix(c( 0.071555715, 0.082937665, 0.089554679, 0.097538044, 0.031060795, 0.035798539, 0.043862556, 0.053712151, 0.019503635, 0.023776479, 0.027180121, 0.030991367, 0.011030001, 0.013419347, 0.015557409, 0.017979524, 0.005634478, 0.006804788, 0.007878358, 0.008807657, 0.002552182, 0.003603778, 0.004275965, 0.005021989, 0.001251044, 0.001531919, 0.001800608, 0.002037870),ncol=4,nrow=7,byrow=T) resmat4=matrix(c( 0.093267532, 0.101584002, 0.108733965, 0.118340448, 0.038677863, 0.045519806, 0.051402903, 0.060097046, 0.024205231, 0.029360145, 0.034267265, 0.039381482, 0.013739157, 0.015856343, 0.018065898, 0.019956084, 0.006467562, 0.007781030, 0.009037972, 0.010127143, 0.003197162, 0.003933525, 0.004656625, 0.005929469, 0.001652690, 0.001926060, 0.002363874, 0.002657071),ncol=4,nrow=7,byrow=T) resmat5=matrix(c( 0.117216934, 0.124714114, 0.129458602, 0.136456163, 0.048838630, 0.055608712, 0.060580045, 0.067943676, 0.030594644, 0.035003872, 0.040433885, 0.047648696, 0.016940240, 0.019527491, 0.022047442, 0.025313443, 0.008053039, 0.009778574, 0.011490394, 0.013383628, 0.003760567, 0.004376294, 0.005097890, 0.005866240, 0.001894616, 0.002253522, 0.002612405, 0.002938808),ncol=4,nrow=7,byrow=T) resmat6=matrix(c( 0.136961531, 0.144120225, 0.149003907, 0.152667432, 0.055909481, 0.062627211, 0.069978086, 0.081189957, 0.034634825, 0.040740587, 0.044161376, 0.047722045, 0.020165417, 0.023074738, 0.025881208, 0.028479913, 0.009436297, 0.011246968, 0.013220963, 0.015100546, 0.004644596, 0.005334418, 0.006040595, 0.007237195, 0.002277590, 0.002635712, 0.002997398, 0.003669488),ncol=4,nrow=7,byrow=T) resmat7=matrix(c( 0.156184672, 0.163226643, 0.171754686, 0.177142753, 0.070117003, 0.077052773, 0.082728047, 0.090410797, 0.041774517, 0.047379662, 0.053101833, 0.057674454, 0.023384451, 0.026014421, 0.029609042, 0.032619018, 0.010856382, 0.012567043, 0.013747870, 0.016257014, 0.005164004, 0.006131755, 0.006868101, 0.008351046, 0.002537642, 0.003044154, 0.003623654, 0.003974469),ncol=4,nrow=7,byrow=T) resmat8=matrix(c( 0.178399742, 0.180006714, 0.193799396, 0.199585892, 0.078032767, 0.085624186, 0.091511226, 0.102491785, 0.045997886, 0.052181615, 0.057362163, 0.062630424, 0.025895739, 0.029733034, 0.033764463, 0.037873655, 0.012195876, 0.013663248, 0.015487587, 0.017717864, 0.005892418, 0.006876488, 0.007893475, 0.008520783, 0.002839731, 0.003243909, 0.003738571, 0.004124057),ncol=4,nrow=7,byrow=T) crit5=array(cbind(resmat1,resmat2,resmat3,resmat4,resmat5,resmat6,resmat7, 
resmat8),c(7,4,8)) flag=T crit.val=NULL if(p > 8)flag=F if(n<10 || n>=400)flag=F aval<-c(.1,.05,.025,.01) aokay<-duplicated(c(alpha,aval)) if(sum(aokay)==0)flag=F if(flag){ nalpha=c(0:4) asel=c(0,aval) ialpha=nalpha[aokay] critit=crit5[,ialpha,p] nvec<-c(10,20,30,50,100,200,400) nval<-duplicated(c(n,nvec)) nval<-nval[2:8] if(sum(nval)>0)crit.val<-critit[nval] loc<-rank(c(n,nvec)) xx<-c(1/nvec[loc[1]-1]^1.5,1/nvec[loc[1]]^1.5) yy<-c(critit[loc[1]-1],critit[loc[1]]) icoef<-tsp1reg(xx,yy)$coef crit.val<-icoef[1]+icoef[2]/n^1.5 }} mqval<-min(c(qval,1-qval)) if(mqval==.25){ resmat1=matrix(c( 0.029933486, 0.0395983678, 0.054087714, 0.062961453, 0.011122294, 0.0149893431, 0.018154062, 0.022685244, 0.009207200, 0.0113020766, 0.014872309, 0.019930730, 0.004824185, 0.0070402246, 0.010356886, 0.013176896, 0.002370379, 0.0033146605, 0.004428004, 0.005122988, 0.001106460, 0.0016110185, 0.001984450, 0.002650256, 0.000516646, 0.0006796144, 0.000868751, 0.001202042),ncol=4,nrow=7,byrow=T) resmat2=matrix(c( 0.0448417783, 0.0602598211, 0.066001091, 0.087040667, 0.0173410522, 0.0224713157, 0.027370822, 0.033435727, 0.0121205549, 0.0150409465, 0.018938516, 0.022643559, 0.0064894201, 0.0084611518, 0.010700320, 0.013232000, 0.0029734778, 0.0040641310, 0.004911086, 0.005769038, 0.0015149104, 0.0020584993, 0.002582982, 0.003114029, 0.0007984207, 0.0009929547, 0.001182739, 0.001398774),ncol=4,nrow=7,byrow=T) resmat3=matrix(c( 0.0636530860, 0.072974943, 0.083840562, 0.097222407, 0.0216586978, 0.027436566, 0.031875356, 0.036830302, 0.0152898678, 0.018964066, 0.021728817, 0.028959751, 0.0083568493, 0.010071525, 0.012712862, 0.015254576, 0.0039033578, 0.004764140, 0.005577071, 0.006660322, 0.0019139215, 0.002343152, 0.002833612, 0.003465269, 0.0009598105, 0.001146689, 0.001355930, 0.001547572),ncol=4,nrow=7,byrow=T) resmat4=matrix(c( 0.085071252, 0.095947936, 0.104197413, 0.118449765, 0.029503024, 0.034198704, 0.039543410, 0.045043759, 0.019203266, 0.022768842, 0.026886843, 0.033481535, 0.011440493, 0.013555017, 0.016138970, 0.018297815, 0.004863139, 0.005756305, 0.007385239, 0.009114958, 0.002635144, 0.003111160, 0.003769051, 0.004215897, 0.001188837, 0.001435179, 0.001727871, 0.001956372),ncol=4,nrow=7,byrow=T) resmat5=matrix(c( 0.102893512, 0.114258558, 0.122545016, 0.130222265, 0.036733497, 0.042504996, 0.048663576, 0.055456582, 0.024192946, 0.028805967, 0.032924489, 0.038209545, 0.012663224, 0.014635216, 0.017275594, 0.019736410, 0.006105572, 0.007310803, 0.008960242, 0.009745320, 0.003067163, 0.003614637, 0.003997615, 0.004812373, 0.001441008, 0.001732819, 0.002078651, 0.002307551),ncol=4,nrow=7,byrow=T) resmat6=matrix(c( 0.117642769, 0.126566104, 0.133106804, 0.142280074, 0.044309420, 0.049731991, 0.053912739, 0.060512997, 0.028607224, 0.033826020, 0.038616476, 0.043546500, 0.015445120, 0.017557181, 0.020040720, 0.022747707, 0.007334749, 0.008406468, 0.009392098, 0.010919651, 0.003352200, 0.003814582, 0.004380562, 0.005252154, 0.001703698, 0.002001713, 0.002338651, 0.002772864),ncol=4,nrow=7,byrow=T) resmat7=matrix(c( 0.106573121, 0.113058950, 0.117388191, 0.121286795, 0.052170054, 0.058363322, 0.064733684, 0.069749344, 0.030696897, 0.035506926, 0.039265698, 0.044437674, 0.016737307, 0.019605734, 0.021253610, 0.022922988, 0.007767232, 0.009231789, 0.010340874, 0.011471110, 0.003998261, 0.004590177, 0.005506926, 0.006217415, 0.001903372, 0.002174748, 0.002519055, 0.002858655),ncol=4,nrow=7,byrow=T) resmat8=matrix(c( 0.119571179, 0.126977461, 0.130120853, 0.133258294, 0.059499563, 0.067185338, 0.071283297, 
0.079430577, 0.034310968, 0.039827130, 0.044451690, 0.048512464, 0.018599530, 0.021093909, 0.023273085, 0.027471116, 0.009135712, 0.010901687, 0.012288682, 0.013729545, 0.004382249, 0.005191810, 0.005598429, 0.006484433, 0.002196973, 0.002525918, 0.002818550, 0.003242426),ncol=4,nrow=7,byrow=T) crit5=array(cbind(resmat1,resmat2,resmat3,resmat4,resmat5,resmat6,resmat7, resmat8),c(7,4,8)) flag=T crit.val=NULL if(p > 8)flag=F if(n<10 || n>=400)flag=F aval<-c(.1,.05,.025,.01) aokay<-duplicated(c(alpha,aval)) if(sum(aokay)==0)flag=F if(flag){ nalpha=c(0:4) asel=c(0,aval) ialpha=nalpha[aokay] critit=crit5[,ialpha,p] nvec<-c(10,20,30,50,100,200,400) nval<-duplicated(c(n,nvec)) nval<-nval[2:8] if(sum(nval)>0)crit.val<-critit[nval,p] loc<-rank(c(n,nvec)) xx<-c(1/nvec[loc[1]-1]^1.5,1/nvec[loc[1]]^1.5) yy<-c(critit[loc[1]-1],critit[loc[1]]) icoef<-tsp1reg(xx,yy)$coef crit.val<-icoef[1]+icoef[2]/n^1.5 }} if(is.null(crit.val))com.pval=T # no critical value found, so a p-value will be computed # the code for checking the file medind.crit, which appears # next, is not working yet. if(is.null(crit.val)){ # no critical value found yet, check file medind.crit if(chk.table){ z<-read.table("medind.crit") nz1<-nrow(z)+1 flag1<-as.matrix(duplicated(c(n,z[,1]))) flag2<-as.matrix(duplicated(c(p,z[,2]))) flag3<-as.matrix(duplicated(c(qval,z[,3]))) zz<-cbind(flag1,flag2,flag3) zz<-zz[2:nz1,] find.row<-apply(zz,1,sum) if(max(find.row)==3){ ir<-order(find.row) nir<-length(ir) ir<-ir[nir] critvals<-z[ir,4:7] if(pr){print("The .1, .05, .025 and .01 critical values are:") print(critvals) } crit.val<-critvals[2] } if(max(find.row)!=3){ store.it=T if(!com.pval){ print("Critical values not available, will set com.pval=T") print("and compute them") com.pval<-T }} } } gdot<-cbind(rep(1,n),x) gdot<-ortho(gdot) x<-gdot[,2:pp1] x<-as.matrix(x) coef<-NULL if(qval==.5)coef<-median(y) if(qval==.25)coef<-idealf(y)$ql if(qval==.75)coef<-idealf(y)$qu if(is.null(coef))coef<-qest(y,q=qval) res<-y-coef psi<-NA psi<-ifelse(res>0,qval,qval-1) rnmat<-matrix(0,nrow=n,ncol=pp1) ran.mat<-apply(x,2,rank) flagvec<-apply(ran.mat,1,max) for(j in 1:n){ flag<-ifelse(flagvec<=flagvec[j],T,F) flag<-as.numeric(flag) rnmat[j,]<-apply(flag*psi*gdot,2,sum) } rnmat<-rnmat/sqrt(n) temp<-matrix(0,pp1,pp1) for(i in 1:n)temp<-temp+rnmat[i,]%*%t(rnmat[i,]) temp<-temp/n test<-max(eigen(temp)$values) if(com.pval){ if(SEED)set.seed(2) p.val<-0 rem<-0 for(i in 1:nboot){ yboot<-rnorm(n) if(p==1)xboot<-rnorm(n) if(p>1)xboot<-rmul(n,p=p) temp3<-medindsub(x,yboot,qval=qval) if(test>=temp3)p.val<-p.val+1 rem[i]<-temp3 } ic10<-round(.9*nboot) ic05<-round(.95*nboot) ic025<-round(.975*nboot) ic001<-round(.99*nboot) rem<-sort(rem) p.val<-1-p.val/nboot # now remember the critical values by storing them in "medind.crit" if(store.it) write(c(n,p,qval,rem[ic10],rem[ic05],rem[ic025],rem[ic001]),"medind.crit", append=T,ncolumns=7) print("The .1, .05, .025 and .001 critical values are:") print(c(rem[ic10],rem[ic05],rem[ic025],rem[ic001])) crit.val<-rem[ic05] } names(crit.val)="" list(test.stat=test,crit.value=crit.val,p.value=p.val) } medindsub<-function(x,y,qval=.5){ # x<-as.matrix(x) n<-length(y) p<-ncol(x) pp1<-p+1 tvec<-c(qval,0-qval,1-qval,qval-1) pval<-c((1-qval)/2,(1-qval)/2,qval/2,qval/2) gdot<-cbind(rep(1,n),x) gdot<-ortho(gdot) x<-gdot[,2:pp1] x<-as.matrix(x) if(qval==.5)coef<-median(y) if(qval!=.5)coef<-qest(y) res<-y-coef psi<-NA psi<-ifelse(res>0,qval,qval-1) rnmat<-matrix(0,nrow=n,ncol=pp1) ran.mat<-apply(x,2,rank) flagvec<-apply(ran.mat,1,max) for(j in 1:n){ 
#flag<-ifelse(flagvec<=flagvec[j],T,F) flag<-ifelse(flagvec>=flagvec[j],T,F) rnmat[j,]<-apply(flag*psi*gdot,2,sum) } rnmat<-rnmat/sqrt(n) temp<-matrix(0,pp1,pp1) for(i in 1:n)temp<-temp+rnmat[i,]%*%t(rnmat[i,]) temp<-temp/n test<-max(eigen(temp)$values) test } regpre<-function(x,y,regfun=lsfit,error=absfun,nboot=100,adz=T, mval=round(5*log(length(y))),model=NULL,locfun=mean,pr=T, xout=F,outfun=out, plotit=T,xlab="Model Number",ylab="Prediction Error",SEED=T,...){ # # Estimate the prediction error using the regression method # regfun. The .632 method is used. # (See Efron and Tibshirani, 1993, pp. 252--254) # # The predictor values are assumed to be in the n by p matrix x. # The default number of bootstrap samples is nboot=100 # # Prediction error is the expected value of the function error. # The argument error defaults to squared error. # # regfun can be any s-plus function that returns the coefficients in # the vector regfun$coef, the first element of which contains the # estimated intercept, the second element contains the estimate of # the first predictor, etc. # # The default value for mval, the number of observations to resample # for each of the B bootstrap samples is based on results by # Shao (JASA, 1996, 655-665). (Resampling n vectors of observations # model selection may not lead to the correct model as n->infinity. # # The argument model should have list mode, model[[1]] indicates # which predictors are used in the first model. For example, storing # 1,4 in model[[1]] means predictors 1 and 4 are being considered. # If model is not specified, and number of predictors is at most 5, # then all models are considered. # # If adz=T, added to the models to be considered is where # all regression slopes are zero. That is, use measure of location only # corresponding to # locfun. # if(pr){ print("By default, least squares regression is used, ") print("But from Wilcox, R. R. 2008, Journal of Applied Statistics, 35, 1-8") print("Setting regfun=tsreg appears to be a better choice for general use.") print("That is, replace least squares with the Theil-Sen estimator") print("Note: Default for the argument error is now absfun") print(" meaning absolute error is used") print("To use squared error, set error=sqfun") } x<-as.matrix(x) d<-ncol(x) p1<-d+1 temp<-elimna(cbind(x,y)) x<-temp[,1:d] y<-temp[,d+1] x<-as.matrix(x) if(xout){ x<-as.matrix(x) flag<-outfun(x,...)$keep x<-x[flag,] y<-y[flag] x<-as.matrix(x) } if(is.null(model)){ if(d<=5)model<-modgen(d,adz=adz) if(d>5)model[[1]]<-c(1:ncol(x)) } mout<-matrix(NA,length(model),5,dimnames=list(NULL,c("apparent.error", "boot.est","err.632","var.used","rank"))) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. if(pr)print("Taking bootstrap samples. Please wait.") data<-matrix(sample(length(y),size=mval*nboot,replace=T),nrow=nboot) bid<-apply(data,1,idb,length(y)) # bid is an n by nboot matrix. If the jth bootstrap sample from # 1, ..., mval contains the value i, bid[i,j]=0; otherwise bid[i,j]=1 for (imod in 1:length(model)){ nmod=length(model[[imod]])-1 temp=c(nmod:0) mout[imod,4]=sum(model[[imod]]*10^temp) if(sum(model[[imod]]==0)!=1){ xx<-x[,model[[imod]]] xx<-as.matrix(xx) if(sum(model[[imod]]==0)!=1)bvec<-apply(data,1,regpres1,xx,y,regfun,mval,...) # bvec is a p+1 by nboot matrix. The first row # contains the bootstrap intercepts, the second row # contains the bootstrap values for first predictor, etc. 
if(sum(model[[imod]]==0)!=1)yhat<-cbind(1,xx)%*%bvec if(sum(model[[imod]]==0)==1){ bvec0<-matrix(0,nrow=p1,ncol=nboot) for(it in 1:nboot){ bvec0[1,it]<-locfun(y[data[it,]]) } yhat<-cbind(1,x)%*%bvec0 } # yhat is n by nboot matrix of predicted values based on # bootstrap regressions. bi<-apply(bid,1,sum) # B sub i in notation of Efron and Tibshirani, p. 253 temp<-(bid*(yhat-y)) diff<-apply(temp,1,error) ep0<-sum(diff/bi)/length(y) aperror<-error(regfun(xx,y,...)$resid)/length(y) # apparent error regpre<-.368*aperror+.632*ep0 mout[imod,1]<-aperror mout[imod,3]<-regpre temp<-yhat-y diff<-apply(temp,1,error) mout[imod,2]<-sum(diff)/(nboot*length(y)) } if(sum(model[[imod]]==0)==1){ mout[imod,3]<-locpre(y,error=error,est=locfun,SEED=SEED,mval=mval) }} mout[,5]=rank(mout[,3]) if(plotit)plot(c(1:nrow(mout)),mout[,3],xlab=xlab,ylab=ylab) list(estimates=mout) } push<-function(mat){ # # For every column of mat, move entry down 1 # matn<-matrix(NA,nrow=nrow(mat),ncol=ncol(mat)) Jm<-nrow(mat)-1 for (k in 1:ncol(mat)){ temp<-mat[,k] vec<-0 vec[2:nrow(mat)]<-temp[1:Jm] matn[,k]<-vec } matn } ancova<-function(x1,y1,x2,y2,fr1=1,fr2=1,tr=.2,alpha=.05,plotit=T,pts=NA,sm=F, pr=T){ # # Compare two independent groups using the ancova method # No parametric assumption is made about the form of # the regression lines--a running interval smoother is used. # # Assume data are in x1 y1 x2 and y2 # # sm=T will create smooths using bootstrap bagging. # pts can be used to specify the design points where the regression lines # are to be compared. # xy=elimna(cbind(x1,y1)) x1=xy[,1] y1=xy[,2] xy=elimna(cbind(x2,y2)) x2=xy[,1] y2=xy[,2] if(pr){ print("NOTE: Confidence intervals are adjusted to control the probability") print("of at least one Type I error.") print("But p-values are not") } if(is.na(pts[1])){ npt<-5 isub<-c(1:5) # Initialize isub test<-c(1:5) xorder<-order(x1) y1<-y1[xorder] x1<-x1[xorder] xorder<-order(x2) y2<-y2[xorder] x2<-x2[xorder] n1<-1 n2<-1 vecn<-1 for(i in 1:length(x1))n1[i]<-length(y1[near(x1,x1[i],fr1)]) for(i in 1:length(x1))n2[i]<-length(y2[near(x2,x1[i],fr2)]) for(i in 1:length(x1))vecn[i]<-min(n1[i],n2[i]) sub<-c(1:length(x1)) isub[1]<-min(sub[vecn>=12]) isub[5]<-max(sub[vecn>=12]) isub[3]<-floor((isub[1]+isub[5])/2) isub[2]<-floor((isub[1]+isub[3])/2) isub[4]<-floor((isub[3]+isub[5])/2) mat<-matrix(NA,5,10) dimnames(mat)<-list(NULL,c("X","n1","n2","DIF","TEST","se","ci.low","ci.hi","p.value","crit.val")) for (i in 1:5){ g1<-y1[near(x1,x1[isub[i]],fr1)] g2<-y2[near(x2,x1[isub[i]],fr2)] g1<-g1[!is.na(g1)] g2<-g2[!is.na(g2)] test<-yuen(g1,g2,tr=tr) mat[i,1]<-x1[isub[i]] mat[i,2]<-length(g1) mat[i,3]<-length(g2) mat[i,4]<-test$dif mat[i,5]<-test$teststat mat[i,6]<-test$se critv<-NA if(alpha==.05)critv<-smmcrit(test$df,5) if(alpha==.01)critv<-smmcrit01(test$df,5) if(is.na(critv))critv<-smmval(test$df,5,alpha=alpha) mat[i,10]<-critv cilow<-test$dif-critv*test$se cihi<-test$dif+critv*test$se mat[i,7]<-cilow mat[i,8]<-cihi mat[i,9]<-test$p.value }} if(!is.na(pts[1])){ if(length(pts)>=29)stop("At most 28 points can be compared") n1<-1 n2<-1 vecn<-1 for(i in 1:length(pts)){ n1[i]<-length(y1[near(x1,pts[i],fr1)]) n2[i]<-length(y2[near(x2,pts[i],fr2)]) } mat<-matrix(NA,length(pts),10) dimnames(mat)<-list(NULL,c("X","n1","n2","DIF","TEST","se","ci.low","ci.hi", "p.value","crit.val")) for (i in 1:length(pts)){ g1<-y1[near(x1,pts[i],fr1)] g2<-y2[near(x2,pts[i],fr2)] g1<-g1[!is.na(g1)] g2<-g2[!is.na(g2)] test<-yuen(g1,g2,tr=tr) mat[i,1]<-pts[i] mat[i,2]<-length(g1) mat[i,3]<-length(g2) 
if(length(g1)<=5)print(paste("Warning, there are",length(g1)," points corresponding to the design point X=",pts[i])) if(length(g2)<=5)print(paste("Warning, there are",length(g2)," points corresponding to the design point X=",pts[i])) mat[i,4]<-test$dif mat[i,5]<-test$teststat mat[i,6]<-test$se if(length(pts)>=2)critv<-smmcrit(test$df,length(pts)) if(length(pts)==1)critv<-qt(.975,test$df) cilow<-test$dif-critv*test$se cihi<-test$dif+critv*test$se mat[i,7]<-cilow mat[i,8]<-cihi print(test) mat[i,9]<-test$p.value mat[i,10]<-critv }} if(plotit) runmean2g(x1,y1,x2,y2,fr=fr1,est=mean,tr=tr,sm=sm) list(output=mat) } miss2na<-function(m,na.val=NULL){ # # Convert any missing value, indicatd by na.val, # to NA. # # Example, if 999 is missing value, use miss2na(m,999) # if(is.null(na.val))stop("Specify a missing value") if(is.vector(m)){ if(!is.list(m)){ flag=(m==na.val) m[flag]=NA }} if(is.matrix(m)){ for(j in 1:ncol(m)){ x=m[,j] flag=(x==na.val) x[flag]=NA m[,j]=x }} if(is.list(m)){ for(j in 1:length(m)){ x=m[[j]] flag=(x==na.val) x[flag]=NA m[[j]]=x }} m } plotCI <- function (x, y = NULL, uiw, liw = uiw, aui=NULL, ali=aui, err="y", ylim=NULL, sfrac = 0.01, gap=0, add=FALSE, col=par("col"), lwd=par("lwd"), slty=par("lty"), xlab=NULL, ylab=NULL, ...) { ## originally from Bill Venables, R-list if (is.list(x)) { y <- x$y x <- x$x } if (is.null(y)) { if (is.null(x)) stop("both x and y NULL") y <- as.numeric(x) x <- seq(along = x) } if (missing(xlab)) xlab <- deparse(substitute(x)) if (missing(ylab)) ylab <- deparse(substitute(y)) if (missing(uiw)) { ## absolute limits ui <- aui li <- ali } else { ## relative limits if (err=="y") z <- y else z <- x ui <- z + uiw li <- z - liw } if (is.null(ylim)) ylim <- range(c(y, ui, li), na.rm=TRUE) if (add) { points(x, y, col=col, lwd=lwd, ...) } else { plot(x, y, ylim = ylim, col=col, lwd=lwd, xlab=xlab, ylab=ylab, ...) 
} if (gap==TRUE) gap <- 0.01 ## default gap size ul <- c(li, ui) if (err=="y") { gap <- rep(gap,length(x))*diff(par("usr")[3:4]) # smidge <- diff(par("usr")[1:2]) * sfrac smidge <- par("fin")[1] * sfrac # segments(x , li, x, pmax(y-gap,li), col=col, lwd=lwd, lty=slty) # segments(x , ui, x, pmin(y+gap,ui), col=col, lwd=lwd, lty=slty) arrows(x , li, x, pmax(y-gap,li), col=col, lwd=lwd, lty=slty, angle=90, length=smidge, code=1) arrows(x , ui, x, pmin(y+gap,ui), col=col, lwd=lwd, lty=slty, angle=90, length=smidge, code=1) ## horizontal segments # x2 <- c(x, x) # segments(x2 - smidge, ul, x2 + smidge, ul, col=col, lwd=lwd) } else if (err=="x") { gap <- rep(gap,length(x))*diff(par("usr")[1:2]) smidge <- par("fin")[2] * sfrac # smidge <- diff(par("usr")[3:4]) * sfrac arrows(li, y, pmax(x-gap,li), y, col=col, lwd=lwd, lty=slty, angle=90, length=smidge, code=1) arrows(ui, y, pmin(x+gap,ui), y, col=col, lwd=lwd, lty=slty, angle=90, length=smidge, code=1) ## vertical segments # y2 <- c(y, y) # segments(ul, y2 - smidge, ul, y2 + smidge, col=col, lwd=lwd) } invisible(list(x = x, y = y)) } bdanova2<-function(x1,x2=NULL,alpha=.05,power=.9,delta){ # # Do the second stage of the Bishop-Duewicz ANOVA # if(is.null(x2[1])){ stage1=bdanova1(x1,alpha=alpha,power=power,delta=delta) return(list(N=stage1$N,d=stage1$d,crit=stage1$crit)) } if(!is.null(x2[1])){ if(is.na(delta))stop("A value for delta was not specified") if(!is.list(x1)){ if(!is.matrix(x1))stop("Data must be stored in a matrix or in list mode") y<-x1 x1<-list() for(j in 1:ncol(y))x1[[j]]<-y[,j] } if(is.na(delta))stop("A value for delta was not specified") if(!is.list(x2)){ if(!is.matrix(x2))stop("Data must be stored in matrix or in list mode") y<-x2 x2<-list() for(j in 1:ncol(y))x2[[j]]<-y[,j] } if(length(x1)!=length(x2))stop("Length of x1 does not match the length of x2") TT<-NA U<-NA J<-length(x1) nvec<-NA nvec2<-NA svec<-NA for(j in 1:length(x1)){ nvec[j]<-length(x1[[j]]) nvec2[j]<-length(x2[[j]]) svec[j]<-var(x1[[j]]) TT[j]<-sum(x1[[j]]) U[j]<-sum(x2[[j]]) } temp<-bdanova1(x1,alpha=alpha,power=power,delta=delta) need<-temp$N-nvec #for(j in 1:length(x1))print(c(nvec2[j],need[j])) for(j in 1:length(x1))if(nvec2[j]=dv[1:nboot])/nboot if(op==4)print(sig.level) list(p.value=sig.level,output=output) } rm2mcp<-function(J,K,x,est=tmean,alpha=.05,grp=NA,dif=T,nboot=NA, plotit=F,BA=F,hoch=F,...){ # # This function performs multiple comparisons for # dependent groups in a within by within designs. # It creates the linear contrasts and calls rmmcppb # assuming that main effects and interactions for a # two-way design are to be tested. # # The data are assumed to be stored in x in list mode or in a matrix. # If grp is unspecified, it is assumed x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second factor: level 1,2 # x[[j+1]] is the data for level 2,1, etc. # If the data are in wrong order, grp can be used to rearrange the # groups. For example, for a two by two design, grp<-c(2,4,3,1) # indicates that the second group corresponds to level 1,1; # group 4 corresponds to level 1,2; group 3 is level 2,1; # and group 1 is level 2,2. # # Missing values are automatically removed. 
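#
# Illustrative example (a sketch; made-up data for a 2-by-2 within-by-within
# design, with con2way and rmmcppb from this collection assumed sourced):
#   set.seed(1)
#   dat<-matrix(rnorm(80),ncol=4)  # columns ordered as levels 1,1  1,2  2,1  2,2
#   rm2mcp(2,2,dat)                # contrasts for Factor A, Factor B and the A-by-B interaction
#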
# JK <- J * K if(is.matrix(x)) x <- listm(x) if(!is.na(grp[1])) { yy <- x for(j in 1:length(grp)) x[[j]] <- yy[[grp[j]]] } if(!is.list(x)) stop("Data must be stored in list mode or a matrix.") for(j in 1:JK) { xx <- x[[j]] # xx[[j]] <- xx[!is.na(xx)] x[[j]] <- xx[!is.na(xx)] } # # Create the three contrast matrices # temp<-con2way(J,K) conA<-temp$conA conB<-temp$conB conAB<-temp$conAB ncon <- max(nrow(conA), nrow(conB), nrow(conAB)) FacA<-rmmcppb(x,con=conA,est=est,plotit=plotit,dif=dif,grp=grp, nboot=nboot,BA=T,hoch=F,...) FacB<-rmmcppb(x,con=conB,est=est,plotit=plotit,dif=dif,grp=grp, nboot=nboot,BA=T,hoch=F,...) FacAB<-rmmcppb(x,con=conAB,est=est,plotit=plotit,dif=dif,grp=grp, nboot=nboot,BA=T,hoch=F,...) list(Factor.A=FacA,Factor.B=FacB,Factor.AB=FacAB) } akp.effect<-function(x,y){ # # Computes the robust effect size suggested by #Algina, Keselman, Penfield Pcyh Methods, 2005, 317-328 x<-elimna(x) y<-elimna(y) n1<-length(x) n2<-length(y) spsq<-(n1-1)*winvar(x)+(n2-1)*winvar(y) sp<-sqrt(spsq/(n1+n2-2)) dval<-.642*(tmean(x)-tmean(y))/sp dval } acbinomci<-function(x=sum(y),nn=length(y),y=NA,n=NA,alpha=.05){ # # Compute a 1-alpha confidence interval for p, the probability of # success for a binomial distribution, using a generalization of the # Agresti-Coull method that was studied by Brown, Cai DasGupta # (Annals of Statistics, 2002, 30, 160-201.) # # y is a vector of 1s and 0s. # x is number of successes. # if(nn==1)stop("Something is wrong: number of observations is only 1") n<-nn if(x!=n && x!=0){ cr=qnorm(1-alpha/2) ntil=n+cr^2 ptil=(x+cr^2/2)/ntil lower=ptil-cr*sqrt(ptil*(1-ptil)/ntil) upper=ptil+cr*sqrt(ptil*(1-ptil)/ntil) } if(x==0){ lower<-0 upper<-1-alpha^(1/n) } if(x==1){ upper<-1-(alpha/2)^(1/n) lower<-1-(1-alpha/2)^(1/n) } if(x==n-1){ lower<-(alpha/2)^(1/n) upper<-(1-alpha/2)^(1/n) } if(x==n){ lower<-alpha^(1/n) upper<-1 } phat<-x/n list(phat=phat,ci=c(lower,upper)) } covmtrim<-function(x,tr=.2,p=length(x),grp=c(1:p)){ # # Estimate the covariance matrix for the sample trimmed means corresponding # to the data in the S-PLUS variable x, # which is assumed to be stored in list mode or a matrix. # (x[[1]] contains the data for group 1, x[[2]] the data for group 2, etc.) # The function returns a p by p matrix of covariances, the diagonal # elements being equal to the squared standard error of the sample # trimmed means, where p is the number of groups to be included. # By default, all the groups in x are used, but a subset of # the groups can be used via grp. For example, if # the goal is to estimate the covariances between the sample trimmed # means for groups 1, 2, and 5, use the command grp<-c(1,2,5) # before calling this function. # # The default amount of trimming is 20% # # Missing values (values stored as NA) are not allowed. # # This function uses winvar from chapter 2. 
# x=matl(x) x=elimna(x) x=listm(x) if(!is.list(x))stop("The data are not stored in list mode or a matrix.") p<-length(grp) pm1<-p-1 for (i in 1:pm1){ ip<-i+1 if(length(x[[grp[ip]]])!=length(x[[grp[i]]]))stop("The number of observations in each group must be equal") } n<-length(x[[grp[1]]]) h<-length(x[[grp[1]]])-2*floor(tr*length(x[[grp[1]]])) covest<-matrix(0,p,p) covest[1,1]<-(n-1)*winvar(x[[grp[1]]],tr)/(h*(h-1)) for (j in 2:p){ jk<-j-1 covest[j,j]<-(n-1)*winvar(x[[grp[j]]],tr)/(h*(h-1)) for (k in 1:jk){ covest[j,k]<-(n-1)*wincor(x[[grp[j]]],x[[grp[k]]],tr)$cov/(h*(h-1)) covest[k,j]<-covest[j,k] } } covmtrim<-covest covmtrim } bwwcovm<-function(J,K,L,x,tr=.2){ # # compute covariance matrix for a between by within by within design # p=J*K*L idep=K*L mat=matrix(0,nrow=p,ncol=p) id=c(1:idep) for(j in 1:J){ mat[id,id]=covmtrim(x[id],tr=tr) id=id+idep } mat } bwwmatna<-function(J,K,L,x){ # # data are assumed to be stored in a matrix # for a between by within by within (three-way) anova, # for the last two factors, eliminate any missing values # and then store the data in list mode. # y=list() ad=K*L ilow=1 iup=ad ic=0 for(j in 1:J){ z=x[,ilow:iup] d=elimna(z) im=0 for(k in 1:K){ for(l in 1:L){ ic=ic+1 im=im+1 y[[ic]]=d[,im] }} ilow=ilow+ad iup=iup+ad } y } bwwna<-function(J,K,L,x){ # # data are assumed to be stored in list mode # for a between by within by within (three-way) anova, # for the last two factors, eliminate any missing values. # y=list() ad=K*L ilow=1 iup=ad ic=0 for(j in 1:J){ z=x[ilow:iup] d=elimna(matl(z)) #print(d) im=0 for(k in 1:K){ for(l in 1:L){ ic=ic+1 im=im+1 y[[ic]]=d[,im] }} ilow=ilow+ad iup=iup+ad } y } bwwtrim<-function(J,K,L,data,tr=.2,grp=c(1:p),alpha=.05,p=J*K*L){ # Perform a between by within by within (three-way) anova # on trimmed means where # # J independent groups, KL dependent groups # # The variable data is assumed to contain the raw # data stored in list mode. data[[1]] contains the data # for the first level of all three factors: level 1,1,1. # data][2]] is assumed to contain the data for level 1 of the # first two factors and level 2 of the third factor: level 1,1,2 # data[[L]] is the data for level 1,1,L # data[[L+1]] is the data for level 1,2,1. data[[2L]] is level 1,2,L. # data[[KL+1]] is level 2,1,1, etc. # # The default amount of trimming is tr=.2 # # It is assumed that data has length JKL, the total number of # groups being tested. 
# if(is.list(data))data=bwwna(J,K,L,data) # remove missing values if(is.matrix(data))data=bwwmatna(J,K,L,data) #remove missing values # and convert to list mode if(!is.list(data))stop("The data are not stored in list mode or a matrix") if(p!=length(data)){ print("The total number of groups, based on the specified levels, is") print(p) print("The number of groups in data is") print(length(data)) print("Warning: These two values are not equal") } tmeans<-0 h<-0 v<-0 for (i in 1:p){ tmeans[i]<-mean(data[[grp[i]]],tr) h[i]<-length(data[[grp[i]]])-2*floor(tr*length(data[[grp[i]]])) # h is the effective sample size } v=bwwcovm(J,K,L,data,tr=tr) ij<-matrix(c(rep(1,J)),1,J) ik<-matrix(c(rep(1,K)),1,K) il<-matrix(c(rep(1,L)),1,L) jm1<-J-1 cj<-diag(1,jm1,J) cj<-diag(1,jm1,J) for (i in 1:jm1)cj[i,i+1]<-0-1 km1<-K-1 ck<-diag(1,km1,K) for (i in 1:km1)ck[i,i+1]<-0-1 lm1<-L-1 cl<-diag(1,lm1,L) for (i in 1:lm1)cl[i,i+1]<-0-1 # Do test for factor A cmat<-kron(cj,kron(ik,il)) # Contrast matrix for factor A Qa=bwwtrim.sub(cmat, tmeans, v, h,p) Qa.siglevel <- 1 - pf(Qa, J - 1, 999) # Do test for factor B cmat<-kron(ij,kron(ck,il)) # Contrast matrix for factor B Qb=bwwtrim.sub(cmat, tmeans, v, h,p) Qb.siglevel <- 1 - pf(Qb, K - 1, 999) # Do test for factor C cmat<-kron(ij,kron(ik,cl)) # Contrast matrix for factor C Qc<-bwwtrim.sub(cmat, tmeans, v, h,p) Qc.siglevel <- 1 - pf(Qc, L - 1, 999) # Do test for factor A by B interaction cmat<-kron(cj,kron(ck,il)) # Contrast matrix for factor A by B Qab<-bwwtrim.sub(cmat, tmeans, v, h,p) Qab.siglevel <- 1 - pf(Qab, (J - 1) * (K - 1), 999) # Do test for factor A by C interaction cmat<-kron(cj,kron(ik,cl)) # Contrast matrix for factor A by C Qac<-bwwtrim.sub(cmat, tmeans, v, h,p) Qac.siglevel <- 1 - pf(Qac, (J - 1) * (L - 1), 999) # Do test for factor B by C interaction cmat<-kron(ij,kron(ck,cl)) # Contrast matrix for factor B by C Qbc<-bwwtrim.sub(cmat, tmeans, v, h,p) Qbc.siglevel <- 1 - pf(Qbc, (K - 1) * (L - 1), 999) # Do test for factor A by B by C interaction cmat<-kron(cj,kron(ck,cl)) # Contrast matrix for factor A by B by C Qabc<-bwwtrim.sub(cmat, tmeans, v, h,p) Qabc.siglevel <-1-pf(Qabc,(J-1)*(K-1)*(L-1), 999) list(Qa=Qa,Qa.p.value=Qa.siglevel,Qb=Qb,Qb.crit=Qb.siglevel, Qc=Qc,Qc.p.value=Qc.siglevel,Qab=Qab,Qab.p.value=Qab.siglevel, Qac=Qac,Qac.p.value=Qac.siglevel,Qbc=Qbc,Qbc.p.value=Qbc.siglevel, Qabc=Qabc,Qabc.p.value=Qabc.siglevel) } bbwcovm<-function(J,K,L,x,tr=.2){ # # compute covariance matrix for a between by within by within design # p=J*K*L idep=L mat=matrix(0,nrow=p,ncol=p) id=c(1:idep) for(j in 1:J){ for(k in 1:K){ mat[id,id]=covmtrim(x[id],tr=tr) id=id+idep }} mat } bbwmatna<-function(J,K,L,x){ # # data are assumed to be stored in a matrix # for a between by within by within (three-way) anova. # For the last factor, eliminate any missing values # and then store the data in list mode. # y=list() ad=L ilow=1 iup=ad ic=0 for(j in 1:J){ for(k in 1:K){ z=x[,ilow:iup] d=elimna(z) im=0 for(l in 1:L){ ic=ic+1 im=im+1 y[[ic]]=d[,im] } ilow=ilow+ad iup=iup+ad }} y } bbwna<-function(J,K,L,x){ # # data are assumed to be stored in list mode # for a between by within by within (three-way) anova. # For the last factor, eliminate any missing values. 
# y=list() ad=L ilow=1 iup=ad ic=0 for(j in 1:J){ for(k in 1:K){ z=x[ilow:iup] d=as.matrix(elimna(matl(z))) im=0 ilow=ilow+ad iup=iup+ad for(l in 1:L){ ic=ic+1 im=im+1 y[[ic]]=d[,im] }} } y } bbwtrim<-function(J,K,L,data,tr=.2,alpha=.05,p=J*K*L){ # Perform a between-within-within (three-way) anova on trimmed means where # # J independent groups, KL dependent groups # # The variable data is assumed to contain the raw # data stored in list mode. data[[1]] contains the data # for the first level of all three factors: level 1,1,1. # data][2]] is assumed to contain the data for level 1 of the # first two factors and level 2 of the third factor: level 1,1,2 # data[[L]] is the data for level 1,1,L # data[[L+1]] is the data for level 1,2,1. data[[2L]] is level 1,2,L. # data[[KL+1]] is level 2,1,1, etc. # # The default amount of trimming is tr=.2 # # It is assumed that data has length JKL, the total number of # groups being tested. # if(is.list(data))data=bbwna(J,K,L,data) if(is.matrix(data))data=bbwmatna(J,K,L,data) grp=c(1:p) data=bbwna(J,K,L,data) if(!is.list(data))stop("Data are not stored in list mode") if(p!=length(data)){ print("The total number of groups, based on the specified levels, is") print(p) print("The number of groups is") print(length(data)) print("Warning: These two values are not equal") } tmeans<-0 h<-0 v<-0 for (i in 1:p){ tmeans[i]<-mean(data[[grp[i]]],tr) h[i]<-length(data[[grp[i]]])-2*floor(tr*length(data[[grp[i]]])) # h is the effective sample size } v=bbwcovm(J,K,L,data,tr=tr) ij<-matrix(c(rep(1,J)),1,J) ik<-matrix(c(rep(1,K)),1,K) il<-matrix(c(rep(1,L)),1,L) jm1<-J-1 cj<-diag(1,jm1,J) cj<-diag(1,jm1,J) for (i in 1:jm1)cj[i,i+1]<-0-1 km1<-K-1 ck<-diag(1,km1,K) for (i in 1:km1)ck[i,i+1]<-0-1 lm1<-L-1 cl<-diag(1,lm1,L) for (i in 1:lm1)cl[i,i+1]<-0-1 # Do test for factor A cmat<-kron(cj,kron(ik,il)) # Contrast matrix for factor A Qa=bwwtrim.sub(cmat, tmeans, v, h,p) Qa.siglevel <- 1 - pf(Qa, J - 1, 999) # Do test for factor B cmat<-kron(ij,kron(ck,il)) # Contrast matrix for factor B Qb=bwwtrim.sub(cmat, tmeans, v, h,p) Qb.siglevel <- 1 - pf(Qb, K - 1, 999) # Do test for factor C cmat<-kron(ij,kron(ik,cl)) # Contrast matrix for factor C Qc<-bwwtrim.sub(cmat, tmeans, v, h,p) Qc.siglevel <- 1 - pf(Qc, L - 1, 999) # Do test for factor A by B interaction cmat<-kron(cj,kron(ck,il)) # Contrast matrix for factor A by B Qab<-bwwtrim.sub(cmat, tmeans, v, h,p) Qab.siglevel <- 1 - pf(Qab, (J - 1) * (K - 1), 999) # Do test for factor A by C interaction cmat<-kron(cj,kron(ik,cl)) # Contrast matrix for factor A by C Qac<-bwwtrim.sub(cmat, tmeans, v, h,p) Qac.siglevel <- 1 - pf(Qac, (J - 1) * (L - 1), 999) # Do test for factor B by C interaction cmat<-kron(ij,kron(ck,cl)) # Contrast matrix for factor B by C Qbc<-bwwtrim.sub(cmat, tmeans, v, h,p) Qbc.siglevel <- 1 - pf(Qbc, (K - 1) * (L - 1), 999) # Do test for factor A by B by C interaction cmat<-kron(cj,kron(ck,cl)) # Contrast matrix for factor A by B by C Qabc<-bwwtrim.sub(cmat, tmeans, v, h,p) Qabc.siglevel <-1-pf(Qabc,(J-1)*(K-1)*(L-1), 999) list(Qa=Qa,Qa.p.value=Qa.siglevel,Qb=Qb,Qb.crit=Qb.siglevel, Qc=Qc,Qc.p.value=Qc.siglevel,Qab=Qab,Qab.p.value=Qab.siglevel, Qac=Qac,Qac.p.value=Qac.siglevel,Qbc=Qbc,Qbc.p.value=Qbc.siglevel, Qabc=Qabc,Qabc.p.value=Qabc.siglevel) } bwwtrim.sub<-function(cmat,vmean,vsqse,h,p){ # # The function performs a variation of Johansen's test of C mu = 0 for # a within by within design # C is a k by p matrix of rank k and mu is a p by 1 matrix of # of unknown medians. # The argument cmat contains the matrix C. 
# vmean is a vector of length p containing the p trimmed means # vsqe is matrix containing the # estimated covariances among the trimmed means # h is the sample size # yvec<-matrix(vmean,length(vmean),1) test<-cmat%*%vsqse%*%t(cmat) invc<-solve(test) test<-t(yvec)%*%t(cmat)%*%invc%*%cmat%*%yvec temp<-0 mtem<-vsqse%*%t(cmat)%*%invc%*%cmat temp<-sum(diag(mtem%*%mtem))+(sum(diag(mtem)))^2/(h-1) A<-.5*sum(temp) cval<-nrow(cmat)+2*A-6*A/(nrow(cmat)+2) test<-test/cval test } ghmean<-function(g,h){ # #Compute the mean and variance of a g-and-h distribution # if(h<0)stop("h must be > 0") val=NULL val2=NULL if(h<1) val=(exp(g^2/(2*(1-h)))-1)/(g*sqrt(1-h)) if(h<.5) val2=(exp(2*g^2/(1-2*h))-2*exp(g^2/(2*(1-2*h)))+1)/(g^2*sqrt(1-2*h))- (exp(g^2/(2*(1-h)))-1)^2/(g^2*(1-h)) list(mean=val,variance=val2) } skew<-function(x){ # # Compute skew and kurtosis # x=elimna(x) m1<-mean(x) m2<-var(x) m3<-sum((x-m1)^3)/length(x) m4<-sum((x-m1)^4)/length(x) sk<-m3/m2^1.5 ku<-m4/m2^2 list(skew=sk,kurtosis=ku) } t3wayv2<-function(J,K,L,data,tr=.2,grp=c(1:p),alpha=.05,p=J*K*L,MAT=F, lev.col=c(1:3),var.col=4){ # Perform a J by K by L (three-way) anova on trimmed means where # all JKL groups are independent. # # Same as t3way, only computes p-values # # if MAT=F (default) # The s-plus variable data is assumed to contain the raw # data stored in list mode. data[[1]] contains the data # for the first level of all three factors: level 1,1,1. # data][2]] is assumed to contain the data for level 1 of the # first two factors and level 2 of the third factor: level 1,1,2 # data[[L]] is the data for level 1,1,L # data[[L+1]] is the data for level 1,2,1. data[[2L]] is level 1,2,L. # data[[KL+1]] is level 2,1,1, etc. # # MAT=T, assumes data are stored in matrix with 3 columns indicating # levels of the three factors. # That is, this function calls selby2 for you. # # The default amount of trimming is tr=.2 # # It is assumed that data has length JKL, the total number of # groups being tested. # if(MAT){ if(!is.matrix(data))stop("With MAT=T, data must be a matrix") if(length(lev.col)!=3)stop("Argument lev.col should have 3 values") data=selby2(data,lev.col,var.col)$x } if(is.matrix(data))data=listm(data) if(!is.list(data))stop("Data is not stored in list mode") if(p!=length(data)){ print("The total number of groups, based on the specified levels, is") print(p) print("The number of groups in data is") print(length(data)) print("Warning: These two values are not equal") } tmeans<-0 h<-0 v<-0 for (i in 1:p){ tmeans[i]<-mean(data[[grp[i]]],tr) h[i]<-length(data[[grp[i]]])-2*floor(tr*length(data[[grp[i]]])) # h is the effective sample size v[i]<-(length(data[[grp[i]]])-1)*winvar(data[[grp[i]]],tr)/(h[i]*(h[i]-1)) # v contains the squared standard errors } v<-diag(v,p,p) # Put squared standard errors in a diag matrix. 
ij<-matrix(c(rep(1,J)),1,J) ik<-matrix(c(rep(1,K)),1,K) il<-matrix(c(rep(1,L)),1,L) jm1<-J-1 cj<-diag(1,jm1,J) for (i in 1:jm1)cj[i,i+1]<-0-1 km1<-K-1 ck<-diag(1,km1,K) for (i in 1:km1)ck[i,i+1]<-0-1 lm1<-L-1 cl<-diag(1,lm1,L) for (i in 1:lm1)cl[i,i+1]<-0-1 # Do test for factor A cmat<-kron(cj,kron(ik,il)) # Contrast matrix for factor A Qa <- johan(cmat, tmeans, v, h, alpha) Qa.pv=t3pval(cmat, tmeans, v, h) # Do test for factor B cmat<-kron(ij,kron(ck,il)) # Contrast matrix for factor B Qb<-johan(cmat,tmeans,v,h,alpha) Qb.pv=t3pval(cmat, tmeans, v, h) # Do test for factor C cmat<-kron(ij,kron(ik,cl)) # Contrast matrix for factor C Qc<-johan(cmat,tmeans,v,h,alpha) Qc.pv=t3pval(cmat, tmeans, v, h) # Do test for factor A by B interaction cmat<-kron(cj,kron(ck,il)) # Contrast matrix for factor A by B Qab<-johan(cmat,tmeans,v,h,alpha) Qab.pv=t3pval(cmat, tmeans, v, h) # Do test for factor A by C interaction cmat<-kron(cj,kron(ik,cl)) # Contrast matrix for factor A by C Qac<-johan(cmat,tmeans,v,h,alpha) Qac.pv=t3pval(cmat, tmeans, v, h) # Do test for factor B by C interaction cmat<-kron(ij,kron(ck,cl)) # Contrast matrix for factor B by C Qbc<-johan(cmat,tmeans,v,h,alpha) Qbc.pv=t3pval(cmat, tmeans, v, h) # Do test for factor A by B by C interaction cmat<-kron(cj,kron(ck,cl)) # Contrast matrix for factor A by B by C Qabc<-johan(cmat,tmeans,v,h,alpha) Qabc.pv=t3pval(cmat, tmeans, v, h) list(Qa=Qa$teststat,Qa.crit=Qa$crit,Qa.p.value=Qa.pv, Qb=Qb$teststat,Qb.crit=Qb$crit,Qb.p.value=Qb.pv, Qc=Qc$teststat,Qc.crit=Qc$crit,Qc.p.value=Qc.pv, Qab=Qab$teststat,Qab.crit=Qab$crit,Qab.p.value=Qab.pv, Qac=Qac$teststat,Qac.crit=Qac$crit,Qac.p.value=Qac.pv, Qbc=Qbc$teststat,Qbc.crit=Qbc$crit,Qbc.p.value=Qbc.pv, Qabc=Qabc$teststat,Qabc.crit=Qabc$crit,Qabc.p.value=Qabc.pv) } t3pval<-function(cmat,tmeans,v,h){ alph<-c(1:99)/100 for(i in 1:99){ irem<-i chkit<-johan(cmat,tmeans,v,h,alph[i]) if(chkit$teststat>chkit$crit)break } p.value <- irem/100 if(p.value <= 0.1) { iup <- (irem + 1)/100 alph <- seq(0.001, iup, 0.001) for(i in 1:length(alph)) { p.value <- alph[i] chkit<-johan(cmat,tmeans,v,h,alph[i]) if(chkit$teststat>chkit$crit)break } } if(p.value <= 0.001) { alph <- seq(0.0001, 0.001, 0.0001) for(i in 1:length(alph)) { p.value <- alph[i] chkit<-johan(cmat,tmeans,v,h,alph[i]) if(chkit$teststat>chkit$crit)break } } p.value } t1way<-function(x,tr=.2,grp=NA,MAT=F,lev.col=1,var.col=2){ # # A heteroscedastic one-way ANOVA for trimmed means # using a generalization of Welch's method. # # The data are assumed to be stored in $x$ in a matrix or in list mode. # # MAT=F, if x is a matrix, columns correspond to groups. # if MAT=T, assumes argument # lev.col # indicates which column of x denotes the groups. And # var.col indicates the column where the data are stored. # # if x has list mode: # length(x) is assumed to correspond to the total number of groups. # By default, the null hypothesis is that all groups have a common mean. # To compare a subset of the groups, use grp to indicate which # groups are to be compared. For example, if you type the # command grp<-c(1,3,4), and then execute this function, groups # 1, 3, and 4 will be compared with the remaining groups ignored. # # Missing values are automatically removed. 
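#
# Illustrative example (a sketch; made-up data, with winvar from this
# collection assumed sourced):
#   set.seed(1)
#   dat<-list(rnorm(20),rnorm(20,.5),rnorm(25,1))
#   t1way(dat)   # heteroscedastic Welch-type ANOVA on the 20% trimmed means
#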
# if(MAT){ if(!is.matrix(x))stop("With MAT=T, data must be stored in a matrix") if(length(lev.col)!=1)stop("Argument lev.col should have 1 value") temp=selby(x,lev.col,var.col) x=temp$x grp2=rank(temp$grpn) x=x[grp2] } if(is.matrix(x))x<-listm(x) if(is.na(sum(grp[1])))grp<-c(1:length(x)) if(!is.list(x))stop("Data are not stored in a matrix or in list mode.") J<-length(grp) h<-vector("numeric",J) w<-vector("numeric",J) xbar<-vector("numeric",J) for(j in 1:J){ xx<-!is.na(x[[j]]) val<-x[[j]] x[[j]]<-val[xx] # Remove missing values h[j]<-length(x[[grp[j]]])-2*floor(tr*length(x[[grp[j]]])) # h is the number of observations in the jth group after trimming. w[j]<-h[j]*(h[j]-1)/((length(x[[grp[j]]])-1)*winvar(x[[grp[j]]],tr)) xbar[j]<-mean(x[[grp[j]]],tr) } u<-sum(w) xtil<-sum(w*xbar)/u A<-sum(w*(xbar-xtil)^2)/(J-1) B<-2*(J-2)*sum((1-w/u)^2/(h-1))/(J^2-1) TEST<-A/(B+1) nu1<-J-1 nu2<-1./(3*sum((1-w/u)^2/(h-1))/(J^2-1)) sig<-1-pf(TEST,nu1,nu2) list(TEST=TEST,nu1=nu1,nu2=nu2,siglevel=sig) } t3wayv2<-function(J,K,L,data,tr=.2,grp=c(1:p),alpha=.05,p=J*K*L,MAT=F, lev.col=c(1:3),var.col=4,pr=T){ # Perform a J by K by L (three-way) anova on trimmed means where # all JKL groups are independent. # # Same as t3way, only computes p-values # # if MAT=F (default) # The s-plus variable data is assumed to contain the raw # data stored in list mode. data[[1]] contains the data # for the first level of all three factors: level 1,1,1. # data][2]] is assumed to contain the data for level 1 of the # first two factors and level 2 of the third factor: level 1,1,2 # data[[L]] is the data for level 1,1,L # data[[L+1]] is the data for level 1,2,1. data[[2L]] is level 1,2,L. # data[[KL+1]] is level 2,1,1, etc. # # MAT=T, assumes data are stored in matrix with 3 columns indicating # levels of the three factors. # That is, this function calls selby2 for you. # # The default amount of trimming is tr=.2 # # It is assumed that data has length JKL, the total number of # groups being tested. # if(MAT){ if(!is.matrix(data))stop("With MAT=T, data must be a matrix") if(length(lev.col)!=3)stop("Argument lev.col should have 3 values") temp=selby2(data,lev.col,var.col) lev1=length(unique(temp$grpn[,1])) lev2=length(unique(temp$grpn[,2])) lev3=length(unique(temp$grpn[,3])) gv=apply(temp$grpn,2,rank) gvad=100*gv[,1]+10*gv[,2]+gv[,3] grp=rank(gvad) if(pr){ print(paste("Factor 1 has", lev1, "levels")) print(paste("Factor 2 has", lev2, "levels")) print(paste("Factor 3 has", lev3, "levels")) } if(J!=lev1)warning("J is being reset to the number of levels found") if(K!=lev2)warning("K is being reset to the number of levels found") if(L!=lev3)warning("K is being reset to the number of levels found") J=lev1 K=lev2 L=lev2 data=temp$x } if(is.matrix(data))data=listm(data) if(!is.list(data))stop("Data is not stored in list mode") if(p!=length(data)){ print("The total number of groups, based on the specified levels, is") print(p) print("The number of groups in data is") print(length(data)) print("Warning: These two values are not equal") } tmeans<-0 h<-0 v<-0 for (i in 1:p){ tmeans[i]<-mean(data[[grp[i]]],tr) h[i]<-length(data[[grp[i]]])-2*floor(tr*length(data[[grp[i]]])) # h is the effective sample size v[i]<-(length(data[[grp[i]]])-1)*winvar(data[[grp[i]]],tr)/(h[i]*(h[i]-1)) # v contains the squared standard errors } v<-diag(v,p,p) # Put squared standard errors in a diag matrix. 
ij<-matrix(c(rep(1,J)),1,J) ik<-matrix(c(rep(1,K)),1,K) il<-matrix(c(rep(1,L)),1,L) jm1<-J-1 cj<-diag(1,jm1,J) for (i in 1:jm1)cj[i,i+1]<-0-1 km1<-K-1 ck<-diag(1,km1,K) for (i in 1:km1)ck[i,i+1]<-0-1 lm1<-L-1 cl<-diag(1,lm1,L) for (i in 1:lm1)cl[i,i+1]<-0-1 # Do test for factor A cmat<-kron(cj,kron(ik,il)) # Contrast matrix for factor A Qa <- johan(cmat, tmeans, v, h, alpha) Qa.pv=t3pval(cmat, tmeans, v, h) # Do test for factor B cmat<-kron(ij,kron(ck,il)) # Contrast matrix for factor B Qb<-johan(cmat,tmeans,v,h,alpha) Qb.pv=t3pval(cmat, tmeans, v, h) # Do test for factor C cmat<-kron(ij,kron(ik,cl)) # Contrast matrix for factor C Qc<-johan(cmat,tmeans,v,h,alpha) Qc.pv=t3pval(cmat, tmeans, v, h) # Do test for factor A by B interaction cmat<-kron(cj,kron(ck,il)) # Contrast matrix for factor A by B Qab<-johan(cmat,tmeans,v,h,alpha) Qab.pv=t3pval(cmat, tmeans, v, h) # Do test for factor A by C interaction cmat<-kron(cj,kron(ik,cl)) # Contrast matrix for factor A by C Qac<-johan(cmat,tmeans,v,h,alpha) Qac.pv=t3pval(cmat, tmeans, v, h) # Do test for factor B by C interaction cmat<-kron(ij,kron(ck,cl)) # Contrast matrix for factor B by C Qbc<-johan(cmat,tmeans,v,h,alpha) Qbc.pv=t3pval(cmat, tmeans, v, h) # Do test for factor A by B by C interaction cmat<-kron(cj,kron(ck,cl)) # Contrast matrix for factor A by B by C Qabc<-johan(cmat,tmeans,v,h,alpha) Qabc.pv=t3pval(cmat, tmeans, v, h) list(Qa=Qa$teststat,Qa.crit=Qa$crit,Qa.p.value=Qa.pv, Qb=Qb$teststat,Qb.crit=Qb$crit,Qb.p.value=Qb.pv, Qc=Qc$teststat,Qc.crit=Qc$crit,Qc.p.value=Qc.pv, Qab=Qab$teststat,Qab.crit=Qab$crit,Qab.p.value=Qab.pv, Qac=Qac$teststat,Qac.crit=Qac$crit,Qac.p.value=Qac.pv, Qbc=Qbc$teststat,Qbc.crit=Qbc$crit,Qbc.p.value=Qbc.pv, Qabc=Qabc$teststat,Qabc.crit=Qabc$crit,Qabc.p.value=Qabc.pv) } t3way<-function(J,K,L,data,tr=.2,grp=c(1:p),alpha=.05,p=J*K*L,MAT=F, lev.col=c(1:3),var.col=4,pr=T){ # Perform a J by K by L (three-way) anova on trimmed means where # all JKL groups are independent. # # The s-plus variable data is assumed to contain the raw # data stored in list mode. data[[1]] contains the data # for the first level of all three factors: level 1,1,1. # data][2]] is assumed to contain the data for level 1 of the # first two factors and level 2 of the third factor: level 1,1,2 # data[[L]] is the data for level 1,1,L # data[[L+1]] is the data for level 1,2,1. data[[2L]] is level 1,2,L. # data[[KL+1]] is level 2,1,1, etc. # # The default amount of trimming is tr=.2 # # It is assumed that data has length JKL, the total number of # groups being tested. # # MAT=T, assumes data are stored in matrix with 3 columns indicating # levels of the three factors. # That is, this function calls selby2 for you. 
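#
# Illustrative example (a sketch; made-up data for a 2-by-2-by-2 design, with
# kron, johan and winvar from this collection assumed sourced):
#   set.seed(1)
#   dat<-list()
#   for(i in 1:8)dat[[i]]<-rnorm(20)
#   t3way(2,2,2,dat)   # test statistics and critical values for all main effects and interactions
#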
# if(MAT){ if(!is.matrix(data))stop("With MAT=T, data must be a matrix") if(length(lev.col)!=3)stop("Argument lev.col should have 3 values") temp=selby2(data,lev.col,var.col) lev1=length(unique(temp$grpn[,1])) lev2=length(unique(temp$grpn[,2])) lev3=length(unique(temp$grpn[,3])) gv=apply(temp$grpn,2,rank) gvad=100*gv[,1]+10*gv[,2]+gv[,3] grp=rank(gvad) if(pr){ print(paste("Factor 1 has", lev1, "levels")) print(paste("Factor 2 has", lev2, "levels")) print(paste("Factor 3 has", lev3, "levels")) } if(J!=lev1)warning("J is being reset to the number of levels found") if(K!=lev2)warning("K is being reset to the number of levels found") if(L!=lev3)warning("K is being reset to the number of levels found") J=lev1 K=lev2 L=lev3 data=temp$x } if(is.matrix(data))data=listm(data) if(!is.list(data))stop("Data are not stored in list mode") if(p!=length(data)){ print("The total number of groups, based on the specified levels, is") print(p) print("The number of groups in data is") print(length(data)) print("Warning: These two values are not equal") } tmeans<-0 h<-0 v<-0 for (i in 1:p){ tmeans[i]<-mean(data[[grp[i]]],tr) h[i]<-length(data[[grp[i]]])-2*floor(tr*length(data[[grp[i]]])) # h is the effective sample size v[i]<-(length(data[[grp[i]]])-1)*winvar(data[[grp[i]]],tr)/(h[i]*(h[i]-1)) # v contains the squared standard errors } v<-diag(v,p,p) # Put squared standard errors in a diag matrix. ij<-matrix(c(rep(1,J)),1,J) ik<-matrix(c(rep(1,K)),1,K) il<-matrix(c(rep(1,L)),1,L) jm1<-J-1 cj<-diag(1,jm1,J) for (i in 1:jm1)cj[i,i+1]<-0-1 km1<-K-1 ck<-diag(1,km1,K) for (i in 1:km1)ck[i,i+1]<-0-1 lm1<-L-1 cl<-diag(1,lm1,L) for (i in 1:lm1)cl[i,i+1]<-0-1 # Do test for factor A cmat<-kron(cj,kron(ik,il)) # Contrast matrix for factor A Qa<-johan(cmat,tmeans,v,h,alpha) # Do test for factor B cmat<-kron(ij,kron(ck,il)) # Contrast matrix for factor B Qb<-johan(cmat,tmeans,v,h,alpha) # Do test for factor C cmat<-kron(ij,kron(ik,cl)) # Contrast matrix for factor C Qc<-johan(cmat,tmeans,v,h,alpha) # Do test for factor A by B interaction cmat<-kron(cj,kron(ck,il)) # Contrast matrix for factor A by B Qab<-johan(cmat,tmeans,v,h,alpha) # Do test for factor A by C interaction cmat<-kron(cj,kron(ik,cl)) # Contrast matrix for factor A by C Qac<-johan(cmat,tmeans,v,h,alpha) # Do test for factor B by C interaction cmat<-kron(ij,kron(ck,cl)) # Contrast matrix for factor B by C Qbc<-johan(cmat,tmeans,v,h,alpha) # Do test for factor A by B by C interaction cmat<-kron(cj,kron(ck,cl)) # Contrast matrix for factor A by B by C Qabc<-johan(cmat,tmeans,v,h,alpha) list(Qa=Qa$teststat,Qa.crit=Qa$crit,Qb=Qb$teststat,Qb.crit=Qb$crit, Qc=Qc$teststat,Qc.crit=Qc$crit,Qab=Qab$teststat,Qab.crit=Qab$crit, Qac=Qac$teststat,Qac.crit=Qac$crit,Qbc=Qbc$teststat,Qbc.crit=Qbc$crit, Qabc=Qabc$teststat,Qabc.crit=Qabc$crit) } olshc4<-function(x,y,alpha=.05,CN=F,xout=F,outfun=out,...){ # # Compute confidence for least squares # regression using heteroscedastic method # recommended by Cribari-Neto (2004). 
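#
# Illustrative example (a sketch; made-up data, with elimna from this
# collection assumed sourced):
#   set.seed(1)
#   x<-rnorm(40)
#   y<-1+.5*x+rnorm(40)
#   olshc4(x,y)$ci   # HC4 confidence intervals, p-values and standard errors
#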
# CN=F, degrees of freedom are n-p # CN=F degrees of freedom are infinte, as done by Cribari-Neto (2004) # x<-as.matrix(x) if(nrow(x) != length(y))stop("Length of y does not match number of x values") m<-cbind(x,y) m<-elimna(m) y<-m[,ncol(x)+1] x=m[,1:ncol(x)] x<-as.matrix(x) if(xout){ flag<-outfun(x,...)$keep x<-as.matrix(x) x<-x[flag,] y<-y[flag] x<-as.matrix(x) } temp<-lsfit(x,y) #x<-cbind(rep(1,nrow(x)),m[,1:ncol(x)]) x<-cbind(rep(1,nrow(x)),x) xtx<-solve(t(x)%*%x) h<-diag(x%*%xtx%*%t(x)) n<-length(h) d<-(n*h)/sum(h) for(i in 1:length(d)){ d[i]<-min(4, d[i]) } hc4<-xtx%*%t(x)%*%diag(temp$res^2/(1-h)^d)%*%x%*%xtx df<-nrow(x)-ncol(x) crit<-qt(1-alpha/2,df) if(CN)crit=qnorm(1-alpha/2) al<-ncol(x) p=al-1 ci<-matrix(NA,nrow=al,ncol=6) lab.out=rep("Slope",p) dimnames(ci)<-list(c("(Intercept)",lab.out),c("Coef.","Estimates", "ci.lower","ci.upper","p-value","Std.Error")) for(j in 1:al){ ci[j,1]<-j-1 ci[j,2]<-temp$coef[j] ci[j,3]<-temp$coef[j]-crit*sqrt(hc4[j,j]) ci[j,4]<-temp$coef[j]+crit*sqrt(hc4[j,j]) test<-temp$coef[j]/sqrt(hc4[j,j]) ci[j,5]<-2*(1-pt(abs(test),df)) if(CN)ci[j,5]<-2*(1-pnorm(abs(test),df)) } ci[,6]=sqrt(diag(hc4)) list(ci=ci, cov=hc4) } hc4test<-function(x,y,pval=c(1:ncol(x)),xout=F,outfun=out,...){ # # Perform omnibus test using OLS and HC4 estimator # That is, test the hypothesis that all of the slope parameters # are equal to 0 in a manner that allows heteroscedasticity. # # recommended by Cribari-Neto (2004). # x<-as.matrix(x) if(ncol(x)>1)print("WARNING: more than 1 predictor, olstest might be better") if(nrow(x) != length(y))stop("Length of y does not match number of x values") m<-cbind(x,y) m<-elimna(m) y<-m[,ncol(x)+1] x=m[,1:ncol(x)] x<-as.matrix(x) if(xout){ flag<-outfun(x,...)$keep x<-as.matrix(x) x<-x[flag,] y<-y[flag] x<-as.matrix(x) } n<-length(y) pvalp1<-pval+1 temp<-lsfit(x,y) # unrestricted #x<-cbind(rep(1,nrow(x)),m[,1:ncol(x)]) x<-cbind(rep(1,nrow(x)),x) hval<-x%*%solve(t(x)%*%x)%*%t(x) hval<-diag(hval) hbar<-mean(hval) delt<-cbind(rep(4,n),hval/hbar) delt<-apply(delt,1,min) aval<-(1-hval)^(0-delt) x2<-x[,pvalp1] pval<-0-pvalp1 x1<-x[,pval] df<-length(pval) x1<-as.matrix(x1) imat<-diag(1,n) M1<-imat-x1%*%solve(t(x1)%*%x1)%*%t(x1) M<-imat-x%*%solve(t(x)%*%x)%*%t(x) uval<-as.vector(M%*%y) R2<-M1%*%x2 rtr<-solve(t(R2)%*%R2) temp2<-aval*uval^2 S<-diag(aval*uval^2) V<-n*rtr%*%t(R2)%*%S%*%R2%*%rtr nvec<-as.matrix(temp$coef[pvalp1]) test<-n*t(nvec)%*%solve(V)%*%nvec test<-test[1,1] p.value<-1-pchisq(test,df) list(test=test,p.value=p.value) } trimpb<-function(x,y,tr=.2,alpha=.05,nboot=2000,WIN=F,win=.1, plotit=F,pop=1,null.value=0,pr=T,xlab="X"){ # # Compute a 1-alpha confidence interval for # a trimmed mean. # # The default number of bootstrap samples is nboot=2000 # # win is the amount of Winsorizing before bootstrapping # when WIN=T. # # Missing values are automatically removed. # # nv is null value. That test hypothesis trimmed mean equals nv # # plotit=T gives a plot of the bootstrap values # pop=1 results in the expected frequency curve. # pop=2 kernel density estimate # pop=3 boxplot # pop=4 stem-and-leaf # pop=5 histogram # pop=6 adaptive kernel density estimate. 
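# Example (hypothetical): trimpb(x,tr=.2,null.value=0) returns a percentile
# bootstrap confidence interval for the 20% trimmed mean of x and a p-value
# for the hypothesis that the trimmed mean equals null.value.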
# if(pr){ print("The p-value returned by the this function is based on the") print("null value specified by the argument null.value, which defaults to 0") } x<-x[!is.na(x)] if(WIN){ if(win > tr)stop("The amount of Winsorizing must be <= to the amount of trimming") x<-winval(x,win) } crit<-alpha/2 icl<-round(crit*nboot)+1 icu<-nboot-icl bvec<-NA set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,mean,tr) # Bootstrapped trimmed means bvec<-sort(bvec) p.value<-sum(bvecQa$crit)break } A.p.value=irem/1000 # Do test for factor B cmat<-kron(ij,ck) # Contrast matrix for factor B for(i in 1:999){ irem<-i Qb<-johan(cmat,tmeans,v,h,alval[i]) if(Qb$teststat>Qb$crit)break } B.p.value=irem/1000 # Do test for factor A by B interaction cmat<-kron(cj,ck) # Contrast matrix for factor A by B for(i in 1:999){ irem<-i Qab<-johan(cmat,tmeans,v,h,alval[i]) if(Qab$teststat>Qab$crit)break } AB.p.value=irem/1000 tmeans=matrix(tmeans,J,K,byrow=T) list(Qa=Qa$teststat,A.p.value=A.p.value, Qb=Qb$teststat,B.p.value=B.p.value, Qab=Qab$teststat,AB.p.value=AB.p.value,means=tmeans) } mcskew <- function(z) { n=length(z) y1=0 y2=0 left=0 right=0 q=0 p=0 eps=0.0000000000001 z=-z xmed=pull(z,n,floor(n/2)+1) if (n%%2 == 0) { xmed=(xmed+pull(z,n,floor(n/2)))/2 } z=z-xmed y=-sort(z) y1=y[y>-eps] y2=y[y<=eps] h1=length(y1) h2=length(y2) left[1:h2]=1 right[1:h2]=h1 nl=0 nr=h1*h2 knew=floor(nr/2)+1 IsFound=0 while ((nr-nl>n) & (IsFound==0)) { weight=0 work=0 j=1 for (i in 1:h2) { if (left[i]<=right[i]) { weight[j]=right[i]-left[i]+1 k=left[i]+floor(weight[j]/2) work[j]=calwork(y1[k],y2[i],k,i,h1+1,eps) j=j+1 } } trial=whimed(work,weight,j-1) j=1 for (i in h2:1) { while ((j<=h1)&(calwork(y1[min(j,h1)],y2[i],j,i,h1+1,eps)>trial)) { j=j+1 } p[i]=j-1 } j=h1 for (i in 1:h2) { while ((j>=1)&(calwork(y1[max(j,1)],y2[i],j,i,h1+1,eps)sumq) { left[1:h2]=q[1:h2] nl=sumq } else { medc=trial IsFound=1 } } } if (IsFound==0) {work=0 j=1 for (i in 1:h2) { if (left[i]<=right[i]) { for (jj in left[i]:right[i]) { work[j]=0-calwork(y1[jj],y2[i],jj,i,h1+1,eps) j=j+1 } } } medc=0-pull(work,j-1,knew-nl) } medc } pull <- function(a,n,k) { b=0 b=a l=1 lr=n while (lax) { j=j-1 } if (jnc<=j) { buffer=b[jnc] b[jnc]=b[j] b[j]=buffer jnc=jnc+1 j=j-1 } } if (jtrial,rep(F,n-nn))]) wmid=sum(iw[c(a[1:nn]==trial,rep(F,n-nn))]) if ((2*wrest+2*wleft)>wtotal) { i=c(a[1:nn]wtotal) { whmed=trial IsFound=1 } else { i=c(a[1:nn]>trial,rep(F,n-nn)) acand=a[i] iwcand=iw[i] nn=length(acand) # nn_kcand_length(acand) wrest=wrest+wleft+wmid } } a[1:nn]=acand[1:nn] iw[1:nn]=iwcand[1:nn] } whmed } calwork <- function(a,b,ai,bi,ab,eps) { if (abs(a-b) < 2.0*eps) { if (ai+bi==ab) { cwork=0 } else { if (ai+bi (n-p)/(2*n) ) r <- (n-p)/(2*n)} # maximum achievable breakdown # # if rejection is not achievable, use c1=0 and best rejection # limvec <- rejpt.bt.lim(p,r) if (1-limvec[2] <= alpha) { c1 <- 0 M <- sqrt(qchisq(1-alpha,p)) } else { c1.plus.M <- sqrt(qchisq(1-alpha,p)) M <- sqrt(p) c1 <- c1.plus.M - M iter <- 1 crit <- 100 eps <- 1e-5 while ((crit > eps)&(iter<100)) { deps <- 1e-4 M.old <- M c1.old <- c1 er <- erho.bt(p,c1,M) fc <- er - r*(M^2/2+c1*(5*c1+16*M)/30) fcc1 <- (erho.bt(p,c1+deps,M)-er)/deps fcM <- (erho.bt(p,c1,M+deps)-er)/deps fcp <- fcM - fcc1 - r*(M-(5*c1+16*M)/30+c1*9/30) M <- M - fc/fcp if (M >= c1.plus.M ){M <- (M.old + c1.plus.M)/2} c1 <- c1.plus.M - M # if (M-c1 < 0) M <- c1.old+(M.old-c1.old)/2 
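# Roughly, the loop here is a Newton-Raphson iteration: M (and hence
# c1 = c1.plus.M - M) is adjusted until fc, the gap between the expected
# value of rho and the breakdown constraint, is within eps, with at most
# 100 iterations.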
crit <- abs(fc) iter <- iter+1 } } list(c1=c1,M=M,r1=r) } erho.bt.lim <- function(p,c1) # expectation of rho(d) under chi-squared p return(chi.int(p,2,c1)+c1^2*chi.int2(p,0,c1)) erho.bt.lim.p <- function(p,c1) # derivative of erho.bt.lim wrt c1 return(chi.int.p(p,2,c1)+c1^2*chi.int2.p(p,0,c1)+2*c1*chi.int2(p,0,c1)) rejpt.bt.lim <- function(p,r){ # find p-value of translated biweight limit c # that gives a specified breakdown c1 <- 2*p iter <- 1 crit <- 100 eps <- 1e-5 while ((crit > eps)&(iter<100)) { c1.old <- c1 fc <- erho.bt.lim(p,c1) - c1^2*r fcp <- erho.bt.lim.p(p,c1) - 2*c1*r c1 <- c1 - fc/fcp if (c1 < 0) c1 <- c1.old/2 crit <- abs(fc) iter <- iter+1 } return(c(c1,pchisq(c1^2,p),log10(1-pchisq(c1^2,p)))) } chi.int.p <- function(p,a,c1) return( exp(lgamma((p+a)/2)-lgamma(p/2))*2^{a/2}*dchisq(c1^2,p+a)*2*c1 ) chi.int2.p <- function(p,a,c1) return( -exp(lgamma((p+a)/2)-lgamma(p/2))*2^{a/2}*dchisq(c1^2,p+a)*2*c1 ) ksolve.bt <- function(d,p,c1,M,b0){ # find a constant k which satisfies the s-estimation constraint # for modified biweight k <- 1 iter <- 1 crit <- 100 eps <- 1e-5 while ((crit > eps)&(iter<100)) { k.old <- k fk <- mean(rho.bt(d/k,c1,M))-b0 fkp <- -mean(psi.bt(d/k,c1,M)*d/k^2) k <- k - fk/fkp if (k < k.old/2) k <- k.old/2 if (k > k.old*1.5) k <- k.old*1.5 crit <- abs(fk) # print(c(iter,k.old,crit)) iter <- iter+1 } # print(c(iter,k,crit)) return(k) } rho.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1*(x^2/2) +ivec2*(M^2/2+c1*(5*c1+16*M)/30) +(1-ivec1-ivec2)*(M^2/2-M^2*(M^4-5*M^2*c1^2+15*c1^4)/(30*c1^4) +(1/2+M^4/(2*c1^4)-M^2/c1^2)*x^2 +(4*M/(3*c1^2)-4*M^3/(3*c1^4))*x^3 +(3*M^2/(2*c1^4)-1/(2*c1^2))*x^4 -4*M*x^5/(5*c1^4)+x^6/(6*c1^4))) } psi.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1*x+(1-ivec1-ivec2)*x*(1-x1^2)^2) } psip.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1+(1-ivec1-ivec2)*((1-x1^2)^2+4*x*x1*(1-x1^2)/c1)) } wt.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1+(1-ivec1-ivec2)*(1-x1^2)^2) } v.bt <- function(x,c1,M) return(x*psi.bt(x,c1,M)) rung3dlchk<-function(x,y,est=mom,regfun=tsreg,beta=.2,plotit=F,nmin=0, fr=NA,...){ # # running mean using interval method # Same as runm3d, but empirically determine the span, f, # by maximizing the percentage bend correlation using the # leave-three-out method. # # x is an n by p matrix of predictors. # # fr controls amount of smoothing and is determined by this function. # If fr is missing, function first considers fr=.8(.05)1.2. If # measure of scale of residuals is mininmized for fr=.8, then consider # fr=.2(.05).75. # # if(!is.matrix(x))stop("Data are not stored in a matrix.") plotit<-as.logical(plotit) chkcor<-1 frtry<-c(.7,.75,.8,.85,.9,.95,1.,1.05,1.1,1.15,1.2) if(!is.na(fr[1]))frtry<-fr chkit<-0 for (it in 1:length(frtry)){ fr<-frtry[it] rmd<-runm3ds1(x,y,fr,tr,F,nmin) # Using leave-three-out method. 
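# For each candidate span fr, the leave-three-out fit rmd is computed and
# the percentage bend midvariance of the residuals y - rmd is stored in
# chkcor; the span minimizing this scale estimate is used below, and
# smaller spans are tried when the smallest candidate gives the minimum.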
xm<-y[!is.na(rmd)] rmd<-rmd[!is.na(rmd)] dif<-xm-rmd chkcor[it]<-pbvar(dif,beta) } if(sum(is.na(chkcor))== length(chkcor)) {stop("A value for the span cannot be determined with these data.")} tempc<-sort(chkcor) chkcor[is.na(chkcor)]<-tempc[length(tempc)] temp<-order(chkcor) fr1<-frtry[temp[1]] fr2<-fr1 val1<-min(chkcor) chkcor2<-0 if(is.na(fr)){ if(temp[1] == 1){ frtry<-c(.2,.25,.3,.35,.4,.45,.5,.55,.6,.65,.7,.75) for (it in 1:length(frtry)){ fr<-frtry[it] rmd<-runm3ds1(x,y,fr,tr,F,nmin) xm<-y[!is.na(rmd)] rmd<-rmd[!is.na(rmd)] dif<-xm-rmd chkcor2[it]<-pbvar(dif,beta) } tempc<-sort(chkcor2) chkcor2[is.na(chkcor2)]<-tempc[length(tempc)] print(chkcor2) temp2<-order(chkcor2) fr2<-frtry[temp2[1]] } } sortc<-sort(chkcor2) chkcor2[is.na(chkcor2)]<-sortc[length(sortc)] val2<-min(chkcor2) fr<-fr1 if(val2 < val1)fr<-fr2 rmd<-runm3d(x,y,fr=fr,tr,plotit=F,nmin,pyhat=T,pr=F) xm<-y[!is.na(rmd)] rmd<-rmd[!is.na(rmd)] etasq<-pbcor(rmd,xm)$cor^2 # Next, fit regression line temp<-y-regfun(x,y)$res pbc<-pbcor(temp,y)$cor^2 temp<-(etasq-pbc)/(1-pbc) list(gamma.L=temp,pbcorsq=pbc,etasq=etasq,fr=fr,rmd=rmd,yused=xm,varval=chkcor) } near3dl1<-function(x,pt,fr=1,m){ dis<-mahalanobis(x,pt,m$cov) sdis<-sqrt(sort(dis)) dflag<-(dis < fr & dis > sdis[3]) dflag } listm<-function(x){ # # Store the data in a matrix in a new # S+ variable having list mode. # Col 1 will be stored in y[[1]], col 2 in y[[2]], and so on. # if(!is.matrix(x))stop("The argument x is not a matrix") y<-list() for(j in 1:ncol(x))y[[j]]<-x[,j] y } pbanova<-function(x,tr=.2,alpha=.05,nboot=NA,grp=NA,WIN=F,win=.1){ # # Test the hypothesis that J independent groups have # equal trimmed means using the percentile bootstrap method. # # The data are assumed to be stored in x # which either has list mode or is a matrix. In the first case # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J. # If stored in a matrix, the columns of the matrix correspond # to groups. # # tr is the amount of trimming # # The argument grp can be used to analyze a subset of the groups # Example: grp=c(1,3,5) would compare groups 1, 3 and 5. # # WIN=T means data are Winsorized before taking bootstraps by the # amount win. # # Missing values are allowed. # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") if(!is.na(sum(grp))){ # Only analyze specified groups. xx<-list() for(i in 1:length(grp))xx[[i]]<-x[[grp[1]]] x<-xx } J<-length(x) tempn<-0 for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. 
tempn[j]<-length(temp) x[[j]]<-temp } Jm<-J-1 if(WIN){ if(tr < .2){print("Warning: When Winsorizing,") print("the amount of trimming should be at least.2") } if(win > tr)stop("Amount of Winsorizing must be <= amount of trimming") if(min(tempn) < 15){ print("Warning: Winsorizing with sample sizes less than 15") print("can result in poor control over the probability of a Type I error") } for (j in 1:J){ x[[j]]<-winval(x[[j]],win) } } con<-matrix(0,J,J-1) for (j in 1:Jm){ jp<-j+1 con[j,j]<-1 con[jp,j]<-0-1 } # Determine nboot if a value was not specified if(is.na(nboot)){ nboot<-5000 if(J <= 8)nboot<-4000 if(J <= 3)nboot<-2000 } # Determine critical values if(alpha==.05){ dvec<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(Jm > 10){ avec<-.05/c(11:Jm) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(Jm > 10){ avec<-.01/c(11:Jm) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01)dvec<-alpha/c(1:Jm) bvec<-matrix(NA,nrow=J,ncol=nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") for(j in 1:J){ paste("Working on group ",j) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,mean,tr) # Bootstrapped trimmed means for jth group } test<-NA for (d in 1:Jm){ dp<-d+1 test[d]<-sum(bvec[d,]>bvec[dp,])/nboot if(test[d]> .5)test[d]<-1-test[d] } test<-(0-1)*sort(-2*test) sig<-sum((test0)print("Significant result obtained: Reject") if(sig==0)print("No significant result obtained: Fail to reject") list(test.vec=test,crit.vec=dvec[1:Jm]) } pbanovag<-function(x,alpha=.05,nboot=NA,grp=NA,est=mom,...){ # # Test the hypothesis that J independent groups have # equal measures of location using the percentile bootstrap method. # (Robust measures of scale can be compared as well.) # # The data are assumed to be stored in x # which either has list mode or is a matrix. In the first case # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J. # If stored in a matrix, the columns of the matrix correspond # to groups. # # est is the measure of location and defaults to a MOM-estimator # ... can be used to set optional arguments associated with est # # The argument grp can be used to analyze a subset of the groups # Example: grp=c(1,3,5) would compare groups 1, 3 and 5. # # Missing values are allowed. # con<-as.matrix(con) if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") if(!is.na(sum(grp))){ # Only analyze specified groups. xx<-list() for(i in 1:length(grp))xx[[i]]<-x[[grp[1]]] x<-xx } J<-length(x) tempn<-0 for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. 
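# (As in pbanova, adjacent groups are compared further below: for each pair,
# twice the smaller bootstrap tail proportion is compared with the
# sequentially rejective critical levels stored in dvec.)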
tempn[j]<-length(temp) x[[j]]<-temp } Jm<-J-1 icl<-ceiling(crit*nboot) icu<-ceiling((1-crit)*nboot) con<-matrix(0,J,J-1) for (j in 1:Jm){ jp<-j+1 con[j,j]<-1 con[jp,j]<-0-1 } # Determine nboot if a value was not specified if(is.na(nboot)){ nboot<-5000 if(J <= 8)nboot<-4000 if(J <= 3)nboot<-2000 } # Determine critical values if(alpha==.05){ dvec<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(Jm > 10){ avec<-.05/c(11:Jm) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(Jm > 10){ avec<-.01/c(11:Jm) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01)dvec<-alpha/c(1:Jm) bvec<-matrix(NA,nrow=J,ncol=nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") for(j in 1:J){ paste("Working on group ",j) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,est,...) # Bootstrapped trimmed means for jth group } test<-NA for (d in 1:Jm){ dp<-d+1 test[d]<-sum(bvec[d,]>bvec[dp,])/nboot if(test[d]> .5)test[d]<-1-test[d] } test<-(0-1)*sort(-2*test) sig<-sum((test0)print("Significant result obtained: Reject") if(sig==0)print("No significant result obtained: Fail to reject") list(test.vec=test,crit.vec=dvec[1:Jm]) } bootse<-function(x,nboot=1000,est=median,SEED=T,...){ # # Compute bootstrap estimate of the standard error of the # estimator est # The default number of bootstrap samples is nboot=100 # if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. data<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,est,...) bootse<-sqrt(var(bvec)) bootse } rananova<-function(x,tr=.2,grp=NA){ # # A heteroscedastic one-way random effects ANOVA for trimmed means. # # The data are assumed to be stored in a matrix on in list mode. # If in list mode, # Length(x) is assumed to correspond to the total number of groups. # If the data are stored in a matrix, groups correspond to columns. # By default, the null hypothesis is that all group have a common mean. # To compare a subset of the groups, use grp to indicate which # groups are to be compared. For example, if you type the # command grp<-c(1,3,4), and then execute this function, groups # 1, 3, and 4 will be compared with the remaining groups ignored. # if(is.matrix(x))x<-listm(x) if(is.na(grp[1]))grp<-c(1:length(x)) if(!is.list(x))stop("Data are not stored in a matrix or in list mode") J<-length(grp) # The number of groups to be compared print("The number of groups to be compared is") print(J) h<-1 xbar<-1 ybar<-1 wvar<-1 ell<-0 for(j in 1:J){ ell[j]<-length(x[[grp[j]]])/(length(x[[grp[j]]])+1) h[j]<-length(x[[grp[j]]])-2*floor(tr*length(x[[grp[j]]])) # h is the number of observations in the jth group after trimming. 
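# ybar[j], xbar[j] and wvar[j] (computed next) are the Winsorized mean,
# trimmed mean and Winsorized variance of the jth group; the test statistic
# D is the variance of the trimmed means divided by the average of the
# squared standard error terms q.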
ybar[j]<-winmean(x[[grp[j]]],tr) xbar[j]<-mean(x[[grp[j]]],tr) wvar[j]<-winvar(x[[grp[j]]],tr) } q<-NA bsst<-var(xbar) for (j in 1:J)q[j]<-(length(x[[grp[j]]]-1)-1)*wvar[j]/(h[j]*(h[j]-1)) wssw<-mean(q) D<-bsst/wssw g<-q/J nu1<-((J-1)*sum(q))^2/((sum(q))^2+(J-2)*J*sum(q^2)) nu2<-(sum(J*q))^2/sum((J*q)^2/(h-1)) sig<-1-pf(D,nu1,nu2) # Next, estimate the Winsorized intraclass correlation sighat<-mean(ell*(ybar-(sum(ell*ybar)/sum(ell)))^2) rho<-sighat/(sighat+winmean(wvar,tr)) list(teststat=D,df=c(nu1,nu2),siglevel=sig,rho=rho) } linpbg<-function(x,con=0,alpha=.05,nboot=NA,est=mest,...){ # # Compute a 1-alpha confidence interval # for a set of d linear contrasts # involving trimmed means using the percentile bootstrap method. # Independent groups are assumed. # # The data are assumed to be stored in x in list mode or in a matrix. # Thus, # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. # If x has list mode, length(x)=the number of groups = J, say. # # Missing values are automatically removed. # # con is a J by d matrix containing the # contrast coefficents of interest. # If unspecified, all pairwise comparisons are performed. # For example, con[,1]=c(1,1,-1,-1,0,0) # and con[,2]=c(,1,-1,0,0,1,-1) # will test two contrasts: (1) the sum of the first # two trimmed means is # equal to the sum of the second two, # and (2) the difference between # the first two is equal to the difference # between the trimmed means of # groups 5 and 6. # # con<-as.matrix(con) if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in a matrix or in list mode.") J<-length(x) for(j in 1:J){ xx<-x[[j]] xx[[j]]<-xx[!is.na(xx)] # Remove any missing values. } Jm<-J-1 d<-(J^2-J)/2 if(sum(con^2)==0){ con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 #If con not specified do all pairwise comparisons con[k,id]<-0-1 }}} if(nrow(con)!=length(x)){ stop("The number of groups does not match the number of contrast coefficients.") } if(is.na(nboot)){ nboot<-5000 if(ncol(con)<=4)nboot<-2000 } m1<-matrix(0,nrow=J,ncol=nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") for(j in 1:J){ paste("Working on group ",j) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) m1[j,]<-apply(data,1,est,...) 
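# Row j of m1 now holds the nboot bootstrap values of est for group j;
# linear contrasts of these rows are formed below with trimpartt.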
} testb<-NA boot<-matrix(0,ncol(con),nboot) testvec<-NA for (d in 1:ncol(con)){ boot[d,]<-apply(m1,2,trimpartt,con[,d]) # A vector of length nboot containing psi hat values # and corresponding to the dth linear contrast testb[d]<-sum((boot[d,]>0))/nboot testvec[d]<-min(testb[d],1-testb[d]) } # # Determine critical value # dd<-ncol(con) if(alpha==.05){ if(dd==1)crit<-alpha/2 if(dd==2)crit<-.014 if(dd==3)crit<-.0085 if(dd==4)crit<-.007 if(dd==5)crit<-.006 if(dd==6)crit<-.0045 if(dd==10)crit<-.0023 if(dd==15)crit<-.0016 } else{ crit<-alpha/(2*dd) } icl<-round(crit*nboot) icu<-round((1-crit)*nboot) psihat<-matrix(0,ncol(con),4) test<-matrix(0,ncol(con),3) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper")) dimnames(test)<-list(NULL,c("con.num","test","crit.val")) for (d in 1:ncol(con)){ test[d,1]<-d psihat[d,1]<-d testit<-lincon(x,con[,d],tr) test[d,2]<-testvec[d] temp<-sort(boot[d,]) psihat[d,3]<-temp[icl] psihat[d,4]<-temp[icu] psihat[d,2]<-testit$psihat[1,2] test[d,3]<-crit } list(psihat=psihat,test=test,con=con) } lintpb<-function(x,con=0,tr=.2,alpha=.05,nboot=NA){ # # Compute a 1-alpha confidence interval # for a set of d linear contrasts # involving trimmed means using the percentile bootstrap method. # Independent groups are assumed. # # The data are assumed to be stored in x in list mode or in a matrix. # Thus, # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. # If x has list mode, length(x)=the number of groups = J, say. # # Missing values are automatically removed. # # con is a J by d matrix containing the # contrast coefficents of interest. # If unspecified, all pairwise comparisons are performed. # For example, con[,1]=c(1,1,-1,-1,0,0) # and con[,2]=c(,1,-1,0,0,1,-1) # will test two contrasts: (1) the sum of the first # two trimmed means is # equal to the sum of the second two, # and (2) the difference between # the first two is equal to the difference # between the trimmed means of # groups 5 and 6. # # con<-as.matrix(con) if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in a matrix or in list mode.") J<-length(x) for(j in 1:J){ xx<-x[[j]] xx[[j]]<-xx[!is.na(xx)] # Remove any missing values. } Jm<-J-1 d<-(J^2-J)/2 if(sum(con^2)==0){ con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 #If con not specified do all pairwise comparisons con[k,id]<-0-1 }}} if(nrow(con)!=length(x)){ stop("The number of groups does not match the number of contrast coefficients.") } if(is.na(nboot)){ nboot<-5000 if(ncol(con)<=4)nboot<-2000 } m1<-matrix(0,nrow=J,ncol=nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. 
Please wait.") for(j in 1:J){ paste("Working on group ",j) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) m1[j,]<-apply(data,1,mean,tr) } testb<-NA boot<-matrix(0,ncol(con),nboot) testvec<-NA for (d in 1:ncol(con)){ boot[d,]<-apply(m1,2,trimpartt,con[,d]) # A vector of length nboot containing psi hat values # and corresponding to the dth linear contrast testb[d]<-sum((boot[d,]>0))/nboot testvec[d]<-min(testb[d],1-testb[d]) } # # Determine critical value # dd<-ncol(con) if(alpha==.05){ if(dd==1)crit<-alpha/2 if(dd==2)crit<-.014 if(dd==3)crit<-.0085 if(dd==4)crit<-.007 if(dd==5)crit<-.006 if(dd==6)crit<-.0045 if(dd==10)crit<-.0023 if(dd==15)crit<-.0016 } else{ crit<-alpha/(2*dd) } icl<-round(crit*nboot) icu<-round((1-crit)*nboot) psihat<-matrix(0,ncol(con),4) test<-matrix(0,ncol(con),3) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper")) dimnames(test)<-list(NULL,c("con.num","test","crit.val")) for (d in 1:ncol(con)){ test[d,1]<-d psihat[d,1]<-d testit<-lincon(x,con[,d],tr) test[d,2]<-testvec[d] temp<-sort(boot[d,]) psihat[d,3]<-temp[icl] psihat[d,4]<-temp[icu] psihat[d,2]<-testit$psihat[1,2] test[d,3]<-crit } list(psihat=psihat,test=test,con=con) } t2waypb<-function(J,K,x,tr=.2,alpha=.05,nboot=NA,grp=NA){ # # Two-way ANOVA for independent groups based on trimmed # means and a percentile bootstrap method. # The data are assumed to be stored in x in list mode or in a matrix. # If grp is unspecified, it is assumed x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second factor: level 1,2 # x[[j+1]] is the data for level 2,1, etc. # If the data are in wrong order, grp can be used to rearrange the # groups. For example, for a two by two design, grp<-c(2,4,3,1) # indicates that the second group corresponds to level 1,1; # group 4 corresponds to level 1,2; group 3 is level 2,1; # and group 1 is level 2,2. # # Missing values are automatically removed. # JK<-J*K if(is.matrix(x))x<-listm(x) if(!is.na(grp)){ yy<-x for(j in 1:length(grp)) x[[j]]<-yy[[grp[j]]] } if(!is.list(x))stop("Data must be stored in list mode or a matrix.") for(j in 1:JK){ xx<-x[[j]] xx[[j]]<-xx[!is.na(xx)] # Remove any missing values. } # # Create the three contrast matrices # ij <- matrix(c(rep(1, J)), 1, J) ik <- matrix(c(rep(1, K)), 1, K) jm1 <- J - 1 cj <- diag(1, jm1, J) for(i in 1:jm1) cj[i, i + 1] <- 0 - 1 km1 <- K - 1 ck <- diag(1, km1, K) for(i in 1:km1) ck[i, i + 1] <- 0 - 1 conA<-t(kron(cj,ik)) conB<-t(kron(ij,ck)) conAB<-t(kron(cj,ck)) ncon<-max(nrow(conA),nrow(conB),nrow(conAB)) if(JK!=length(x)){ print("Warning: The number of groups does not match") print(" the number of contrast coefficients.") } if(is.na(nboot)){ nboot<-5000 if(ncon<=4)nboot<-2000 } m1<-matrix(0,nrow=JK,ncol=nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. 
Please wait.") for(j in 1:JK){ paste("Working on group ",j) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) m1[j,]<-apply(data,1,mean,tr) } bootA<-matrix(0,ncol(conA),nboot) bootB<-matrix(0,ncol(conB),nboot) bootAB<-matrix(0,ncol(conAB),nboot) testA<-NA testB<-NA testAB<-NA testvecA<-NA testvecB<-NA testvecAB<-NA for (d in 1:ncol(conA)){ bootA[d,]<-apply(m1,2,trimpartt,conA[,d]) # A vector of length nboot containing psi hat values # corresponding to the dth linear contrast testA[d]<-sum((bootA[d,]>0))/nboot testA[d]<-min(testA[d],1-testA[d]) } for (d in 1:ncol(conB)){ bootB[d,]<-apply(m1,2,trimpartt,conB[,d]) # A vector of length nboot containing psi hat values # corresponding to the dth linear contrast testB[d]<-sum((bootB[d,]>0))/nboot testB[d]<-min(testB[d],1-testB[d]) } for (d in 1:ncol(conAB)){ bootAB[d,]<-apply(m1,2,trimpartt,conAB[,d]) # A vector of length nboot containing psi hat values # corresponding to the dth linear contrast testAB[d]<-sum((bootAB[d,]>0))/nboot testAB[d]<-min(testAB[d],1-testAB[d]) } # # Determine critical value # Jm<-J-1 Km<-K-1 JKm<-(J-1)*(K-1) dvecA <- alpha/c(1:Jm) dvecB <- alpha/c(1:Km) dvecAB <- alpha/c(1:JKm) testA<-(0 - 1) * sort(-2 * testA) testB<-(0 - 1) * sort(-2 * testB) testAB<-(0 - 1) * sort(-2 * testAB) sig <- sum((testA < dvecA[1:Jm])) if(sig > 0) print("Significant result obtained for Factor A: Reject") if(sig == 0) print("No significant result Factor A: Fail to reject") sig <- sum((testB < dvecB[1:Km])) if(sig > 0) print("Significant result obtained for Factor B: Reject") if(sig == 0) print("No significant result Factor B: Fail to reject") sig <- sum((testAB < dvec[1:JKm])) if(sig > 0) print("Significant Interaction: Reject") if(sig == 0) print("No significant Interaction: Fail to reject") list(testA=testA,crit.vecA=dvecA,testB=testB,crit.vecB=dvecB,testAB=testAB,crit.vecAB=dvecAB) } t2waypbg<-function(J,K,x,alpha=.05,nboot=NA,grp=NA,est=mom,...){ # # Two-way ANOVA for independent groups based on # robust measures of location # and a percentile bootstrap method. # The data are assumed to be stored in x in list mode or in a matrix. # If grp is unspecified, it is assumed x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second factor: level 1,2 # x[[j+1]] is the data for level 2,1, etc. # If the data are in wrong order, grp can be used to rearrange the # groups. For example, for a two by two design, grp<-c(2,4,3,1) # indicates that the second group corresponds to level 1,1; # group 4 corresponds to level 1,2; group 3 is level 2,1; # and group 1 is level 2,2. # # Missing values are automatically removed. # JK<-J*K if(is.matrix(x))x<-listm(x) if(!is.na(grp)){ yy<-x for(j in 1:length(grp)) x[[j]]<-yy[[grp[j]]] } if(!is.list(x))stop("Data must be stored in list mode or a matrix.") for(j in 1:JK){ xx<-x[[j]] xx[[j]]<-xx[!is.na(xx)] # Remove any missing values. 
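# As in t2waypb, contrast matrices for the two main effects and the
# interaction are built below from Kronecker products, and each contrast is
# tested by comparing twice the smaller bootstrap tail proportion with the
# sequentially rejective levels in dvecA, dvecB and dvecAB.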
} # # Create the three contrast matrices # ij <- matrix(c(rep(1, J)), 1, J) ik <- matrix(c(rep(1, K)), 1, K) jm1 <- J - 1 cj <- diag(1, jm1, J) for(i in 1:jm1) cj[i, i + 1] <- 0 - 1 km1 <- K - 1 ck <- diag(1, km1, K) for(i in 1:km1) ck[i, i + 1] <- 0 - 1 conA<-t(kron(cj,ik)) conB<-t(kron(ij,ck)) conAB<-t(kron(cj,ck)) ncon<-max(nrow(conA),nrow(conB),nrow(conAB)) if(JK!=length(x)){ print("Warning: The number of groups does not match") print("the number of contrast coefficients.") } if(is.na(nboot)){ nboot<-5000 if(ncon<=4)nboot<-2000 } m1<-matrix(0,nrow=JK,ncol=nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") for(j in 1:JK){ paste("Working on group ",j) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) m1[j,]<-apply(data,1,est,...) } bootA<-matrix(0,ncol(conA),nboot) bootB<-matrix(0,ncol(conB),nboot) bootAB<-matrix(0,ncol(conAB),nboot) testA<-NA testB<-NA testAB<-NA testvecA<-NA testvecB<-NA testvecAB<-NA for (d in 1:ncol(conA)){ bootA[d,]<-apply(m1,2,trimpartt,conA[,d]) # A vector of length nboot containing psi hat values # corresponding to the dth linear contrast testA[d]<-sum((bootA[d,]>0))/nboot testA[d]<-min(testA[d],1-testA[d]) } for (d in 1:ncol(conB)){ bootB[d,]<-apply(m1,2,trimpartt,conB[,d]) # A vector of length nboot containing psi hat values # corresponding to the dth linear contrast testB[d]<-sum((bootB[d,]>0))/nboot testB[d]<-min(testB[d],1-testB[d]) } for (d in 1:ncol(conAB)){ bootAB[d,]<-apply(m1,2,trimpartt,conAB[,d]) # A vector of length nboot containing psi hat values # corresponding to the dth linear contrast testAB[d]<-sum((bootAB[d,]>0))/nboot testAB[d]<-min(testAB[d],1-testAB[d]) } # # Determine critical value # Jm<-J-1 Km<-K-1 JKm<-(J-1)*(K-1) dvecA <- alpha/c(1:Jm) dvecB <- alpha/c(1:Km) dvecAB <- alpha/c(1:JKm) testA<-(0 - 1) * sort(-2 * testA) testB<-(0 - 1) * sort(-2 * testB) testAB<-(0 - 1) * sort(-2 * testAB) sig <- sum((testA < dvecA[1:Jm])) if(sig > 0) print("Significant result obtained for Factor A: Reject") if(sig == 0) print("No significant result Factor A: Fail to reject") sig <- sum((testB < dvecB[1:Km])) if(sig > 0) print("Significant result obtained for Factor B: Reject") if(sig == 0) print("No significant result Factor B: Fail to reject") sig <- sum((testAB < dvec[1:JKm])) if(sig > 0) print("Significant Interaction: Reject") if(sig == 0) print("No significant Interaction: Fail to reject") list(testA=testA,crit.vecA=dvecA,testB=testB,crit.vecB=dvecB,testAB=testAB,crit.vecAB=dvecAB) } regout<-function(x,y,regest=stsreg,plotit=T,mbox=T){ # # Check for regression outliers by fitting a # a line to data using regest and then applying # a boxplot rule to the residuals. # mbox=T uses Carling's method # mbox=F uses ideal fourths with conventional boxplot rules. # chk<-regest(x,y) flag<-outbox(chk$residuals,mbox=mbox)$out.id if(plotit){ plot(x,y) points(x[flag],y[flag],pch="o") abline(chk$coef) } list(out.id=flag) } stsreg<-function(x,y,sc=pbvar,...){ # # Compute the S-type modification of # the Theil-Sen regression estimator. # Only a single predictor is allowed in this version # x<-as.matrix(x) ord<-order(x) xs<-x[ord] ys<-y[ord] vec1<-outer(ys,ys,"-") vec2<-outer(xs,xs,"-") v1<-vec1[vec2>0] v2<-vec2[vec2>0] slope<-v1/v2 allvar<-NA for(i in 1:length(slope))allvar[i]<-sc(y-slope[i]*x,...) 
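# Each candidate slope (formed from all pairs of points, as in the
# Theil-Sen estimator) is scored by sc applied to the residuals; the slope
# minimizing this robust scale is chosen below, with intercept
# median(y) - slope*median(x).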
temp<-order(allvar) coef<-0 coef[2]<-slope[temp[1]] coef[1]<-median(y)-coef[2]*median(x) res<-y-coef[2]*x-coef[1] list(coef=coef,residuals=res) } yuend<-function(x,y,tr=.2,alpha=.05){ # # Compare the trimmed means of two dependent random variables # using the data in x and y. # The default amount of trimming is 20% # # Missing values (values stored as NA) are not allowed. # # A confidence interval for the trimmed mean of x minus the # the trimmed mean of y is computed and returned in yuend$ci. # The significance level is returned in yuend$siglevel # # This function uses winvar from chapter 2. # if(length(x)!=length(y))stop("The number of observations must be equal") m<-cbind(x,y) m<-elimna(m) x<-m[,1] y<-m[,2] h1<-length(x)-2*floor(tr*length(x)) q1<-(length(x)-1)*winvar(x,tr) q2<-(length(y)-1)*winvar(y,tr) q3<-(length(x)-1)*wincor(x,y,tr)$cov df<-h1-1 se<-sqrt((q1+q2-2*q3)/(h1*(h1-1))) crit<-qt(1-alpha/2,df) dif<-mean(x,tr)-mean(y,tr) low<-dif-crit*se up<-dif+crit*se test<-dif/se yuend<-2*(1-pt(abs(test),df)) list(ci=c(low,up),siglevel=yuend,dif=dif,se=se,teststat=test,df=df) } rmmcppbtm<-function(x,alpha=.05,con=0,tr=.2,grp=NA,nboot=NA){ # # Using the percentile bootstrap method, # compute a .95 confidence interval for all linear contasts # specified by con, a J by C matrix, where C is the number of # contrasts to be tested, and the columns of con are the # contrast coefficients. # # The trimmed means of dependent groups are being compared. # By default, 20% trimming is used. # # nboot is the bootstrap sample size. If not specified, a value will # be chosen depending on the number of contrasts there are. # # x can be an n by J matrix or it can have list mode # # For alpha=.05, some critical values have been # determined via simulations and are used by this function; # otherwise an approximation is used. # if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ if(is.matrix(con)){ if(length(x)!=nrow(con))stop("The number of rows in con is not equal to the number of groups.") }} if(is.list(x)){ # put the data in an n by J matrix mat<-matrix(0,length(x[[1]]),length(x)) for (j in 1:length(x))mat[,j]<-x[[j]] } if(is.matrix(x) && is.matrix(con)){ if(ncol(x)!=nrow(con))stop("The number of rows in con is not equal to the number of groups.") mat<-x } if(is.matrix(x))mat<-x if(!is.na(sum(grp)))mat<-mat[,grp] mat<-elimna(mat) # Remove rows with missing values. 
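# If con is not supplied, all (J^2-J)/2 pairwise contrasts among the J
# dependent groups are generated below.
# Example (hypothetical): for an n by 4 matrix m, rmmcppbtm(m) tests all
# six pairwise comparisons of the 20% trimmed means.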
J<-ncol(mat) Jm<-J-1 if(sum(con^2)==0){ d<-(J^2-J)/2 con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} d<-ncol(con) if(is.na(crit) && tr != .2){ print("A critical value must be specified when") stop("the amount of trimming differs from .2") } if(is.na(nboot)){ if(d<=3)nboot<-1000 if(d==6)nboot<-2000 if(d==10)nboot<-4000 if(d==15)nboot<-8000 if(d==21)nboot<-8000 if(d==28)nboot<-10000 } n<-nrow(mat) crit<-NA if(alpha==.05){ if(d==1)crit<-alpha/2 if(d==3){ crit<-.004 if(n>=15)crit<-.006 if(n>=30)crit<-.007 if(n>=40)crit<-.008 if(n>=100)crit<-.009 } if(d==6){ crit<-.001 if(n>=15)crit<-.002 if(n>=20)crit<-.0025 if(n>=30)crit<-.0035 if(n>=40)crit<-.004 if(n>=60)crit<-.0045 } if(d==10){ crit<-.00025 if(n>=15)crit<-.00125 if(n>=20)crit<-.0025 } if(d==15){ crit<-.0005 if(n>=20)crit<-.0010 if(n>=30)crit<-.0011 if(n>=40)crit<-.0016 if(n>=100)crit<-.0019 } if(d==21){ crit<-.00025 if(n>=20)crit<-.00037 if(n>=30)crit<-.00075 if(n>=40)crit<-.00087 if(n>=60)crit<-.00115 if(n>=100)crit<-.00125 } if(d==28){ crit<-.0004 if(n>=30)crit<-.0006 if(n>=60)crit<-.0008 if(n>=100)crit<-.001 } } if(is.na(crit)){ crit<-alpha/(2*d) if(n<20)crit<-crit/2 if(n<=10)crit<-crit/2 } icl<-ceiling(crit*nboot)+1 icu<-ceiling((1-crit)*nboot) connum<-ncol(con) set.seed(2) # set seed of random number generator so that # results can be duplicated. # data is an nboot by n matrix xbars<-matrix(0,nboot,ncol(mat)) psihat<-matrix(0,connum,nboot) print("Taking bootstrap samples. Please wait.") bvec<-bootdep(mat,tr,nboot) # # Now have an nboot by J matrix of bootstrap values. # test<-1 for (ic in 1:connum){ psihat[ic,]<-apply(bvec,1,bptdpsi,con[,ic]) test[ic]<-sum((psihat[ic,]>0))/nboot test[ic]<-min(test[ic],1-test[ic]) } print("Reminder: Test statistic must be less than critical value in order to reject.") output<-matrix(0,connum,5) dimnames(output)<-list(NULL,c("con.num","psihat","test","ci.lower","ci.upper")) tmeans<-apply(mat,2,mean,trim=tr) psi<-1 for (ic in 1:ncol(con)){ output[ic,2]<-sum(con[,ic]*tmeans) output[ic,1]<-ic output[ic,3]<-test[ic] temp<-sort(psihat[ic,]) output[ic,4]<-temp[icl] output[ic,5]<-temp[icu] } list(output=output,crit=crit,con=con) } mcppb20<-function(x,crit=NA,con=0,tr=.2,alpha=.05,nboot=2000,grp=NA,WIN=F, win=.1){ # # Compute a 1-alpha confidence interval for a set of d linear contrasts # involving trimmed means using the percentile bootstrap method. # Independent groups are assumed. # # The data are assumed to be stored in x in list mode. Thus, # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J, say. # # By default, all pairwise comparisons are performed, but contrasts # can be specified with the argument con. # The columns of con indicate the contrast coefficients. # Con should have J rows, J=number of groups. # For example, con[,1]=c(1,1,-1,-1,0,0) and con[,2]=c(,1,-1,0,0,1,-1) # will test two contrasts: (1) the sum of the first two trimmed means is # equal to the sum of the second two, and (2) the difference between # the first two is equal to the difference between the trimmed means of # groups 5 and 6. # # The default number of bootstrap samples is nboot=2000 # # con<-as.matrix(con) if(is.matrix(x)){ xx<-list() for(i in 1:ncol(x)){ xx[[i]]<-x[,i] } x<-xx } if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") if(!is.na(sum(grp))){ # Only analyze specified groups. 
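# (e.g., grp=c(1,3,5) would restrict the analysis to groups 1, 3 and 5)
# Example (hypothetical): mcppb20(x) performs all pairwise comparisons of
# the 20% trimmed means of the groups stored in x.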
xx<-list() for(i in 1:length(grp))xx[[i]]<-x[[grp[1]]] x<-xx } J<-length(x) tempn<-0 for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. tempn[j]<-length(temp) x[[j]]<-temp } Jm<-J-1 d<-ifelse(sum(con^2)==0,(J^2-J)/2,ncol(con)) if(is.na(crit) && tr != .2){ print("A critical value must be specified when") stop("the amount of trimming differs from .2") } if(WIN){ if(tr < .2){ print("Warning: When Winsorizing, the amount") print("of trimming should be at least .2") } if(win > tr)stop("Amount of Winsorizing must <= amount of trimming") if(min(tempn) < 15){ print("Warning: Winsorizing with sample sizes") print("less than 15 can result in poor control") print("over the probability of a Type I error") } for (j in 1:J){ x[[j]]<-winval(x[[j]],win) } } if(is.na(crit)){ if(d==1)crit<-alpha/2 if(d==2 && alpha==.05 && nboot==1000)crit<-.014 if(d==2 && alpha==.05 && nboot==2000)crit<-.014 if(d==3 && alpha==.05 && nboot==1000)crit<-.009 if(d==3 && alpha==.05 && nboot==2000)crit<-.0085 if(d==3 && alpha==.025 && nboot==1000)crit<-.004 if(d==3 && alpha==.025 && nboot==2000)crit<-.004 if(d==3 && alpha==.01 && nboot==1000)crit<-.001 if(d==3 && alpha==.01 && nboot==2000)crit<-.001 if(d==4 && alpha==.05 && nboot==2000)crit<-.007 if(d==5 && alpha==.05 && nboot==2000)crit<-.006 if(d==6 && alpha==.05 && nboot==1000)crit<-.004 if(d==6 && alpha==.05 && nboot==2000)crit<-.0045 if(d==6 && alpha==.025 && nboot==1000)crit<-.002 if(d==6 && alpha==.025 && nboot==2000)crit<-.0015 if(d==6 && alpha==.01 && nboot==2000)crit<-.0005 if(d==10 && alpha==.05 && nboot<=2000)crit<-.002 if(d==10 && alpha==.05 && nboot==3000)crit<-.0023 if(d==10 && alpha==.025 && nboot<=2000)crit<-.0005 if(d==10 && alpha==.025 && nboot==3000)crit<-.001 if(d==15 && alpha==.05 && nboot==2000)crit<-.0016 if(d==15 && alpha==.025 && nboot==2000)crit<-.0005 if(d==15 && alpha==.05 && nboot==5000)crit<-.0026 if(d==15 && alpha==.025 && nboot==5000)crit<-.0006 } if(is.na(crit) && alpha==.05)crit<-0.0268660714*(1/d)-0.0003321429 if(is.na(crit))crit<-alpha/(2*d) if(d> 10 && nboot <5000){ print("Warning: Suggest using nboot=5000") print("when the number of contrasts exceeds 10.") } icl<-round(crit*nboot)+1 icu<-round((1-crit)*nboot) if(sum(con^2)==0){ con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} psihat<-matrix(0,ncol(con),6) dimnames(psihat)<-list(NULL,c("con.num","psihat","se","ci.lower", "ci.upper","p-value")) if(nrow(con)!=length(x)){ print("The number of groups does not match") stop("the number of contrast coefficients.") } bvec<-matrix(NA,nrow=J,ncol=nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") for(j in 1:J){ paste("Working on group ",j) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,mean,tr) # Bootstrapped trimmed means for jth group } test<-NA for (d in 1:ncol(con)){ top<-0 for (i in 1:J){ top<-top+con[i,d]*bvec[i,] } test[d]<-(sum(top>0)+.5*sum(top==0))/nboot test[d]<-min(test[d],1-test[d]) top<-sort(top) psihat[d,4]<-top[icl] psihat[d,5]<-top[icu] } for (d in 1:ncol(con)){ psihat[d,1]<-d testit<-lincon(x,con[,d],tr,pr=F) psihat[d,6]<-2*test[d] psihat[d,2]<-testit$psihat[1,2] psihat[d,3]<-testit$test[1,4] } list(psihat=psihat,crit.p.value=2*crit,con=con) } comvar2d<-function(x,y,SEED=T){ # # Compare the variances of two dependent groups. 
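# The test exploits the identity cov(X-Y, X+Y) = var(X) - var(Y): the
# variances of the two dependent groups are equal exactly when the
# difference scores U = X-Y and the sums V = X+Y are uncorrelated, so a
# percentile bootstrap confidence interval for cor(U,V) (via pcorb)
# provides the comparison.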
# nboot<-599 m<-cbind(x,y) m<-elimna(m) # Remove missing values U<-m[,1]-m[,2] V<-m[,1]+m[,2] ci<-pcorb(U,V,SEED=SEED)$ci list(ci=ci) } mom<-function(x,bend=2.24){ # # Compute MOM-estimator of location. # The default bending constant is 2.24 # x<-x[!is.na(x)] #Remove missing values flag1<-(x>median(x)+bend*mad(x)) flag2<-(x.5)pvec[i]<-1-pvec[i] regci[i,1]<-bsort[ilow] regci[i,2]<-bsort[ihi] se[i]<-sqrt(var(bvec[i,])) } pvec<-2*pvec if(pr){ print("First row of regci is the confidence interval for the intercept,") print("the second row is the confidence interval for the first slope, etc.") } list(regci=regci,p.value=pvec,se=se) } rregci<-function(x,y,regfun=chreg,nboot=599,alpha=.05){ # # Compute a .95 confidence interval for each of the parameters of # a linear regression equation. The default regression method is # a bounded influence M-regression with Schweppe weights # (the Coakley-Hettmansperger estimator). # # When using the least squares estimator, and when n<250, use # lsfitci instead. # # The predictor values are assumed to be in the n by p matrix x. # The default number of bootstrap samples is nboot=599 # # regfun can be any s-plus function that returns the coefficients in # the vector regfun$coef, the first element of which contains the # estimated intercept, the second element contains the estimated of # the first predictor, etc. # x<-as.matrix(x) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,regboot,x,y,regfun) # A p+1 by nboot matrix. The first row # contains the bootstrap intercepts, the second row # contains the bootstrap values for first predictor, etc. p1<-ncol(x)+1 regci<-matrix(0,p1,2) ilow<-round((alpha/2) * nboot) ihi<-nboot - ilow ilow<-ilow+1 se<-NA pvec<-NA for(i in 1:p1){ bsort<-sort(bvec[i,]) pvec[i]<-sum(bvec[i,]<0)/nboot if(pvec[i]>.5)pvec[i]<-1-pvec[i] regci[i,1]<-bsort[ilow] regci[i,2]<-bsort[ihi] se[i]<-sqrt(var(bvec[i,])) } pvec<-2*pvec list(regci=regci,p.value=pvec,se=se) } pbcan<-function(x,nboot=1000,grp=NA,est=mom,...){ # # Test the hypothesis that J independent groups have # equal measures of location using the percentile bootstrap method. # in conjunction with a partially centering technique. # # The data are assumed to be stored in x # which either has list mode or is a matrix. In the first case # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J. # If stored in a matrix, the columns of the matrix correspond # to groups. # # est is the measure of location and defaults to an M-estimator # ... can be used to set optional arguments associated with est # # The argument grp can be used to analyze a subset of the groups # Example: grp=c(1,3,5) would compare groups 1, 3 and 5. # # Missing values are allowed. # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") if(!is.na(sum(grp))){ # Only analyze specified groups. xx<-list() for(i in 1:length(grp))xx[[i]]<-x[[grp[1]]] x<-xx } J<-length(x) tempn<-0 vecm<-0 for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. tempn[j]<-length(temp) x[[j]]<-temp vecm[j]<-est(x[[j]],...) 
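# Partial centering (done next): each group is shifted by the average of
# the location estimates of the remaining J-1 groups before bootstrap
# samples are drawn.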
} xcen<-list() flag<-rep(T,J) for(j in 1:J){ flag[j]<-F temp<-mean(vecm[flag]) xcen[[j]]<-x[[j]]-temp flag[j]<-T } icrit<-round((1-alpha)*nboot) bvec<-matrix(NA,nrow=J,ncol=nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") for(j in 1:J){ paste("Working on group ",j) data<-matrix(sample(xcen[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,est,...) # Bootstrapped values for jth group } vvec<-NA for(j in 1:J){ vvec[j]<-sum((bvec[j,]-vecm[j])^2)/(nboot-1) } dis<-NA for(i in 1:nboot){ dis[i]<-sum((bvec[,i]-vecm)^2/vvec) } tvec<-sum((0-vecm)^2/vvec) dis<-sort(dis) print(tvec) print(dis[icrit]) print(vecm) sig<-1-sum((tvec>=dis))/nboot list(p.value=sig) } ddep<-function(x,est=mom,alpha=.05,grp=NA,nboot=500,plotit=T,SEED=T,...){ # # Do ANOVA on dependent groups # using the partially centered method plus # depth of zero among bootstrap values. # # The data are assumed to be stored in x in list mode # or in a matrix. In the first case # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J. # If stored in a matrix, columns correspond to groups. # # grp is used to specify some subset of the groups, if desired. # By default, all J groups are used. # # The default number of bootstrap samples is nboot=500 # if(is.list(x)){ nv<-NA for(j in 1:length(x))nv[j]<-length(x[[j]]) if(var(nv) !=0){ stop("The groups are stored in list mode and appear to have different sample sizes") } temp<-matrix(NA,ncol=length(x),nrow=nv[1]) for(j in 1:length(x))temp[,j]<-x[[j]] x<-temp } J<-ncol(x) if(!is.na(grp[1])){ #Select the groups of interest J<-length(grp) for(j in 1:J)temp[,j]<-x[,grp[j]] x<-temp } x<-elimna(x) # Remove any rows with missing values. bvec<-matrix(0,ncol=J,nrow=nboot) hval<-vector("numeric",J) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") n<-nrow(x) totv<-apply(x,2,est,...) data<-matrix(sample(n,size=n*nboot,replace=T),nrow=nboot) for(ib in 1:nboot)bvec[ib,]<-apply(x[data[ib,],],2,est,...) #nboot by J matrix gv<-rep(mean(totv),J) #Grand mean bplus<-nboot+1 m1<-rbind(bvec,gv) center<-totv cmat<-var(bvec) discen<-mahalanobis(m1,totv,cmat) print("Bootstrap complete; computing significance level") if(plotit && ncol(x)==2){ plot(bvec,xlab="Group 1",ylab="Group 2") temp.dis<-order(discen[1:nboot]) ic<-round((1-alpha)*nboot) xx<-bvec[temp.dis[1:ic],] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) abline(0,1) } sig.level<-sum(discen[bplus]<=discen)/bplus list(p.value=sig.level,center=totv,grand.mean=gv) } rmaseq<-function(x,est=mom,alpha=.05,grp=NA,nboot=NA,...){ # # Using the percentile bootstrap method, # test hypothesis that all marginal distributions # among J dependent groups # have a common measure of location. # This is done by using a sequentially rejective method # of J-1 pairs of groups. # That is, compare group 1 to group 2, group 2 to group 3, etc. # # By default, mom estimator is used. # # nboot is the bootstrap sample size. If not specified, a value will # be chosen depending on the number of groups # # x can be an n by J matrix or it can have list mode # grp can be used to specify a subset of the groups for analysis # # the argument ... can be used to specify options associated # with the argument est. 
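# Example (hypothetical): for an n by 4 matrix m of dependent groups,
# rmaseq(m) compares group 1 with group 2, group 2 with group 3, and
# group 3 with group 4 via rmmcp.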
# if(!is.list(x) && !is.matrix(x)){ stop("Data must be stored in a matrix or in list mode.") } if(is.list(x)){ # put the data in an n by J matrix mat<-matrix(0,length(x[[1]]),length(x)) for (j in 1:length(x))mat[,j]<-x[[j]] } if(is.matrix(x))mat<-x mat<-elimna(mat) # Remove rows with missing values. J<-ncol(mat) Jm<-J-1 con<-matrix(0,ncol=Jm,nrow=J) for(j in 1:Jm){ jp<-j+1 for(k in j:jp){ con[j,j]<-1 con[jp,j]<-0-1 }} rmmcp(x,est=est,alpha=alpha,con=con,nboot=nboot,...) } rmanog<-function(x,alpha=.05,est=mom,grp=NA,nboot=NA,...){ # # Using the percentile bootstrap method, # test the hypothesis that all differences among J # dependent groups have a # measure of location equal to zero. # That is, if # Dij is the difference between ith observations # in groups j and j+1, # and Dij has measure of location muj # the goal is to test # H0: mu1=mu2=...=0 # # By default, mom estimator is used. # # nboot is the bootstrap sample size. If not specified, a value will # be chosen depending on the number of groups # # x can be an n by J matrix or it can have list mode # grp can be used to specify a subset of the groups for analysis # # the argument ... can be used to specify options associated # with the argument est. # if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ # put the data in an n by J matrix mat<-matrix(0,length(x[[1]]),length(x)) for (j in 1:length(x))mat[,j]<-x[[j]] } if(is.matrix(x))mat<-x mat<-elimna(mat) # Remove rows with missing values. J<-ncol(mat) Jm<-J-1 jp<-0 dif<-matrix(NA,nrow=nrow(mat),ncol=Jm) for(j in 1:Jm){ jp<-j+1 dif[,j]<-mat[,j]-mat[,jp] } if(is.na(nboot)){ nboot<-5000 if(Jm <= 4)nboot<-1000 } print("Taking bootstrap samples. Please wait.") data <- matrix(sample(nrow(mat), size = nrow(mat) * nboot, replace = T), nrow = nboot) bvec <- matrix(NA, ncol = ncol(dif), nrow = nboot) for(j in 1:ncol(dif)) { temp <- dif[, j] bvec[, j] <- apply(data, 1., rmanogsub, temp, est) } #bvec is an nboot by Jm matrix testvec<-NA for(j in 1:Jm){ testvec[j]<-sum(bvec[,j]>0)/nboot if(testvec[j] > .5)testvec[j]<-1-testvec[j] } critvec<-alpha/c(1:Jm) #testvec<-2*testvec[order(-1*testvec)] test<-2*testvec test.sort<-order(-1*test) chk<-sum((test.sort <= critvec)) if(chk > 0)print("Significant difference found") output<-matrix(0,Jm,6) dimnames(output)<-list(NULL,c("con.num","psihat","sig","crit.sig","ci.lower","ci.upper")) tmeans<-apply(dif,2,est,...) psi<-1 output[,2]<-tmeans for (ic in 1:Jm){ output[ic,1]<-ic output[ic,3]<-test[ic] crit<-critvec[ic] output[test.sort[ic],4]<-crit } for(ic in 1:Jm){ icrit<-output[ic,4] icl<-round(icrit*nboot/2)+1 icu<-round((1-icrit/2)*nboot) temp<-sort(bvec[,ic]) output[ic,5]<-temp[icl] output[ic,6]<-temp[icu] } list(output=output) } ecor<-function(x,y,pcor=F,regfun=tsreg,corfun=pbcor,outkeep=F,outfun=outmgvf){ # # Estimate the explanatory correlation between x and y # # It is assumed that x is a vector or a matrix having one column only xx<-elimna(cbind(x,y)) # Remove rows with missing values x<-xx[,1] y<-xx[,2] x<-as.matrix(x) if(ncol(x) > 1)stop("x must be a vector or matrix with one column") flag<-rep(T,nrow(x)) if(!outkeep){ temp<-outfun(cbind(x,y))$out.id flag[temp]<-F } coef<-regfun(x,y)$coef ip<-ncol(x)+1 yhat<-x %*% coef[2:ip] + coef[1] if(pcor)epow2<-cor(yhat[flag],y[flag])^2 if(!pcor)epow2<-corfun(yhat[flag],y[flag])$cor^2 ecor<-sqrt(epow2)*sign(coef[2]) ecor } ocor<-function(x,y,corfun=pbcor,outfun=outmgvf,pcor=F,plotit=F){ # # Compute a correlation when outliers are ignored. 
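# Rows flagged as outliers by outfun (outmgvf by default) are removed and
# corfun (the percentage bend correlation by default) is computed from the
# remaining points; pcor=T returns Pearson's correlation instead.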
# xx<-elimna(cbind(x,y)) # Remove rows with missing values x<-xx[,1] y<-xx[,2] flag<-rep(T,length(x)) temp<-outfun(cbind(x,y),plotit=plotit)$out.id flag[temp]<-F if(pcor)ocor<-cor(x[flag],y[flag]) if(!pcor)ocor<-corfun(x[flag],y[flag])$cor list(cor=ocor) } rmdzero<-function(x,est=mom,grp=NA,nboot=500,...){ # # Do ANOVA on dependent groups # using # depth of zero among bootstrap values # based on difference scores. # # The data are assumed to be stored in x in list mode # or in a matrix. In the first case # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J. # If stored in a matrix, columns correspond to groups. # # grp is used to specify some subset of the groups, if desired. # By default, all J groups are used. # # The default number of bootstrap samples is nboot=500 # if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in lis\ t mode.") if(is.list(x)){ # put the data in an n by J matrix mat<-matrix(0,length(x[[1]]),length(x)) for (j in 1:length(x))mat[,j]<-x[[j]] } if(is.matrix(x))mat<-x if(!is.na(grp[1])){ mat<-mat[,grp] } mat<-elimna(mat) # Remove rows with missing values. J<-ncol(mat) jp<-0 Jall<-(J^2-J)/2 dif<-matrix(NA,nrow=nrow(mat),ncol=Jall) ic<-0 for(j in 1:J){ for(k in 1:J){ if(jcrit,1,0) id<-vec[chk==1] keep<-vec[chk==0] x<-as.matrix(x) if(plotit && ncol(x)==2){ plot(x[,1],x[,2],xlab="X",ylab="Y",type="n") flag<-rep(T,nrow(x)) flag[id]<-F points(x[flag,1],x[flag,2]) if(sum(chk)!=0)points(x[!flag,1],x[!flag,2],pch="o") } list(out.id=id,keep.id=keep,dis=dis,crit=crit) } runmean2g<-function(x1,y1,x2,y2,fr=.8,est=tmean,xlab="X",ylab="Y", sm=F,nboot=40,SEED=T,eout=F,xout=F,...){ # # Plot of running interval smoother for two groups # # fr controls amount of smoothing # tr is the amount of trimming # # Missing values are automatically removed. # # sm=T results in using bootstrap bagging when estimating the regression line # nboot controls number of bootstrap samples # m<-elimna(cbind(x1,y1)) x1<-m[,1] y1<-m[,2] m<-elimna(cbind(x2,y2)) x2<-m[,1] y2<-m[,2] if(!sm){ temp<-rungen(x1,y1,est=est,fr=fr,pyhat=T,plotit=F,...) rmd1<-temp[1]$output } if(sm){ temp<-runmbo(x1,y1,est=est,fr=fr,pyhat=T,plotit=F,SEED=SEED, nboot=nboot,eout=F,xout=F,...) rmd1<-temp } if(!sm){ temp<-rungen(x2,y2,fr=fr,est=est,pyhat=T,plotit=F,...) rmd2<-temp[1]$output } if(sm){ temp<-runmbo(x2,y2,est=est,fr=fr,pyhat=T,plotit=F,SEED=SEED, nboot=nboot,eout=F,xout=F,...) rmd2<-temp } plot(c(x1,x2),c(y1,y2),type="n",xlab=xlab,ylab=ylab) sx1<-sort(x1) sx2<-sort(x2) xorder1<-order(x1) xorder2<-order(x2) sysm1<-rmd1[xorder1] sysm2<-rmd2[xorder2] points(x1,y1) points(x2,y2,pch="+") lines(sx1,sysm1) lines(sx2,sysm2,lty=2) } rundis<-function(x,y,est=mom,plotit=T,pyhat=F,...){ # # Do a smooth where x is discrete with a # relatively small number of values. # temp<-sort(unique(x)) yhat<-NA for(i in 1:length(temp)){ flag<-(temp[i]==x) yhat[i]<-est(y[flag],...) } plot(x,y) lines(temp,yhat) output<-"Done" if(pyhat)output<-yhat output } bdm<-function(x,grp=NA){ # # Perform the Brunner, Dette, Munk rank-based ANOVA # (JASA, 1997, 92, 1494--1502) # # x can be a matrix with columns corresponding to groups # or it can have list mode. 
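# Example (hypothetical): bdm(x), with x a list of J independent groups or
# a matrix with J columns, performs the rank-based one-way test; the
# argument grp can be used to rearrange the order of the groups.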
# if(is.matrix(x))x<-listm(x) J<-length(x) xx<-list() if(is.na(grp[1]))grp<-c(1:J) for(j in 1:J)xx[[j]]<-x[[grp[j]]] Ja<-matrix(1,J,J) Ia<-diag(1,J) Pa<-Ia-Ja/J cona<-Pa outA<-bdms1(xx,cona) list(output=outA) } bdm2way<-function(J,K,x,grp=c(1:p),p=J*K){ # # Perform the Brunner, Dette, Munk rank-based ANOVA # (JASA, 1997, 92, 1494--1502) # for a J by K independent groups design. # # x can be a matrix with columns corresponding to groups # or it can have list mode. # if(is.matrix(x))x<-listm(x) xx<-list() for(j in 1:p)xx[[j]]<-x[[grp[j]]] Ja<-matrix(1,J,J) Ia<-diag(1,J) Pa<-Ia-Ja/J Jb<-matrix(1,K,K) Ib<-diag(1,K) Pb<-Ib-Jb/K cona<-kron(Pa,Jb/K) conb<-kron(Ja/J,Pb) conab<-kron(Pa,Pb) outA<-bdms1(xx,cona) outB<-bdms1(xx,conb) outAB<-bdms1(xx,conab) list(outputA=outA,outputB=outB,outputAB=outAB) } cori<-function(x,y,z,pt=median(z),fr=.8,est=mom,corfun=pbcor,testit=F, nboot=599,sm=F,xlab="X",ylab="Y",...){ # # Split the data according to whether z is < or > pt, then # use runmean2g to plot a smooth of the regression # lines corresponding to these two groups. # # If testit=T, the hypothesis of equal correlations is tested using the # the s-plus function twocor # m<-cbind(x,y,z) m<-elimna(m) x<-m[,1] y<-m[,2] z<-m[,3] flag<-(z 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.005,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01)dvec<-alpha/c(1:ncon) test<-NA bcon<-t(con)%*%bvec #ncon by nboot matrix tvec<-t(con)%*%mvec for (d in 1:ncon){ test[d]<-sum(bcon[d,]>0)/nboot if(test[d]> .5)test[d]<-1-test[d] } output<-matrix(0,ncon,6) dimnames(output)<-list(NULL,c("con.num","psihat","p.value","p.crit","ci.lower","ci.upper")) temp2<-order(0-test) zvec<-dvec[1:ncon] sigvec<-(test[temp2]>=zvec) if(sum(sigvec)0] v2<-vec2[vec2>0] slope<-v1/v2 tmin<-wrregfun(slope[1],x,y) ikeep<-1 for(i in 2:length(slope)){ tryit<-wrregfun(slope[i],x,y) if(tryit1){ for(p in 1:ncol(x)){ temp[p]<-wsp1reg(x[,p],y)$coef[2] } res<-y-x%*%temp alpha<-median(res) r<-matrix(NA,ncol=ncol(x),nrow=nrow(x)) tempold<-temp for(it in 1:iter){ for(p in 1:ncol(x)){ r[,p]<-y-x%*%temp-alpha+temp[p]*x[,p] temp[p]<-wsp1reg(x[,p],r[,p],plotit=F)$coef[2] } alpha<-median(y-x%*%temp) if(max(abs(tempold-temp))<.0001)break tempold<-temp } coef<-c(alpha,temp) res<-y-x%*%temp-alpha } list(coef=coef,residuals=res) } mgvar<-function(m,se=F,op=0,cov.fun=covmve,SEED=T){ # # Find the center of a scatterplot, add point that # increases the generalized variance by smallest amount # continue for all points # return the generalized variance # values corresponding to each point. 
# The central values and point(s) closest to it get NA # # op=0 find central points using pairwise differences # op!=0 find central points using measure of location # used by cov.fun # # choices for cov.fun include # covmve # covmcd # tbs (Rocke's measures of location # rmba (Olive's median ball algorithm) # if(op==0)temp<-apgdis(m,se=se)$distance if(op!=0)temp<-out(m,cov.fun=cov.fun,plotit=F,SEED=SEED)$dis #if(op==2)temp<-out(m,mcd=T,plotit=F)$dis flag<-(temp!=min(temp)) temp2<-temp temp2[!flag]<-max(temp) flag2<-(temp2!=min(temp2)) flag[!flag2]<-F varvec<-NA while(sum(flag)>0){ ic<-0 chk<-NA remi<-NA for(i in 1:nrow(m)){ if(flag[i]){ ic<-ic+1 chk[ic]<-gvar(rbind(m[!flag,],m[i,])) remi[ic]<-i }} sor<-order(chk) k<-remi[sor[1]] varvec[k]<-chk[sor[1]] flag[k]<-F } varvec } outmgv<-function(x,y=NA,plotit=T,outfun=outbox,se=T,op=1,fast=F, cov.fun=rmba,xlab="X",ylab="Y",SEED=T,...){ # # Check for outliers using mgv method # # NOTE: if columns of the input matrix are reordered, this can # have an effect on the results due to rounding error when calling # the R function eigen. # if(is.na(y[1]))m<-x if(!is.na(y[1]))m<-cbind(x,y) m=elimna(m) if(!fast)temp<-mgvar(m,se=se,op=op,cov.fun=cov.fun,SEED=SEED) if(fast)temp<-mgvdep.for(m,se=se)$distance temp[is.na(temp)]<-0 if(ncol(m)==2)temp2<-outfun(temp,...)$out.id if(ncol(m)>2)temp2<-outbox(temp,mbox=T,gval=sqrt(qchisq(.975,ncol(m))))$out.id vec<-c(1:nrow(m)) flag<-rep(T,nrow(m)) flag[temp2]<-F vec<-vec[flag] vals<-c(1:nrow(m)) keep<-vals[flag] if(plotit && ncol(m)==2){ x<-m[,1] y<-m[,2] plot(x,y,type="n",xlab=xlab,ylab=ylab) flag<-rep(T,length(y)) flag[temp2]<-F points(x[flag],y[flag],pch="*") points(x[temp2],y[temp2],pch="o") } list(out.id=temp2,keep=keep) } outmgvf<-function(x,y=NA,plotit=T,outfun=outbox,se=T,...){ # # Check for outliers using inward mgv method # This method is faster than outmgv. # if(is.na(y[1]))m<-x if(!is.na(y[1]))m<-cbind(x,y) m<-elimna(m) # eliminate any rows with missing datatemp2<-out if(se){ for(i in 1:ncol(m))m[,i]<-(m[,i]-median(m[,i]))/mad(m[,i]) } iflag<-rep(T,nrow(m)) dval<-0 for(i in 1:nrow(m)){ iflag[i]<-F dval[i]<-gvar(m[iflag,]) iflag[i]<-T } temp2<-outfun(dval,...)$out.id vals<-c(1:nrow(m)) flag3<-rep(T,nrow(m)) flag3[temp2]<-F keep<-vals[flag3] if(plotit && ncol(m)==2){ x<-m[,1] y<-m[,2] plot(x,y,type="n",xlab="X",ylab="Y") flag<-rep(T,length(y)) flag[temp2]<-F points(x[flag],y[flag]) points(x[temp2],y[temp2],pch="o") } list(out.id=temp2,keep=keep,out.val=m[temp2,],depth.values=dval) } epow<-function(x,y,pcor=F,regfun=tsreg,corfun=pbcor,outkeep=F,outfun=outmgvf,varfun=pbvar,op=T){ # # Estimate the explanatory power between x and y # xx<-elimna(cbind(x,y)) pval<-1 if(is.matrix(x))pval<-ncol(x) pp<-pval+1 x<-xx[,1:pval] y<-xx[,pp] x<-as.matrix(x) flag<-rep(T,nrow(x)) temp<-regfun(x,y) ip<-ncol(x)+1 yhat<-y-temp$res if(!outkeep){ temp<-outfun(cbind(x,y),plotit=F)$out.id flag[temp]<-F } epow1<-varfun(yhat[flag])/varfun(y[flag]) if(pcor)epow2<-cor(yhat[flag],y[flag])^2 if(!pcor)epow2<-corfun(yhat[flag],y[flag])$cor^2 if(op)est<-epow2 if(!op)est<-epow1 est } cmanova<-function(J,K,x,grp=c(1:JK),JK=J*K){ # # Perform the Choi and Marden # multivariate one-way rank-based ANOVA # (Choi and Marden, JASA, 1997, 92, 1581-1590. # # x can be a matrix with columns corresponding to groups # or it can have list mode. 
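#
# An illustrative usage sketch for outmgv (defined above); the MGV method also
# uses mgvar, gvar, out, outbox and rmba from this library. Simulated
# bivariate data with one planted outlier:
#   set.seed(3)
#   m<-cbind(rnorm(30),rnorm(30))
#   m[1,]<-c(6,6)                      # plant an outlier
#   outmgv(m,plotit=F)$out.id          # row numbers flagged as outliers
#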
# # Have a J by K design with J independent levels and K dependent # measures # # if(is.matrix(x))x<-listm(x) xx<-list() nvec<-NA jk<-0 for(j in 1:J){ for(k in 1:K){ jk<-jk+1 xx[[jk]]<-x[[grp[jk]]] if(k==1)nvec[j]<-length(xx[[jk]]) }} N<-sum(nvec) RVALL<-matrix(0,nrow=N,K) x<-xx jk<-0 rmean<-matrix(NA,nrow=J,ncol=K) for(j in 1:J){ RV<-matrix(0,nrow=nvec[j],ncol=K) jk<-jk+1 temp1<-matrix(x[[jk]],ncol=1) for(k in 2:K){ jk<-jk+1 temp1<-cbind(temp1,x[[jk]]) } X<-temp1 if(j==1)XALL<-X if(j>1)XALL<-rbind(XALL,X) n<-nvec[j] for(i in 1:n){ for (ii in 1:n){ temp3<-sqrt(sum((X[i,]-X[ii,])^2)) if(temp3 != 0)RV[i,]<-RV[i,]+(X[i,]-X[ii,])/temp3 } RV[i,]<-RV[i,]/nvec[j] if(j==1 && i==1)sighat<-RV[i,]%*%t(RV[i,]) if(j>1 || i>1)sighat<-sighat+RV[i,]%*%t(RV[i,]) } } # Assign ranks to pooled data and compute R bar for each group for(i in 1:N){ for (ii in 1:N){ temp3<-sqrt(sum((XALL[i,]-XALL[ii,])^2)) if(temp3 != 0)RVALL[i,]<-RVALL[i,]+(XALL[i,]-XALL[ii,])/temp3 } RVALL[i,]<-RVALL[i,]/N } bot<-1-nvec[1] top<-0 for(j in 1:J){ bot<-bot+nvec[j] top<-top+nvec[j] flag<-c(bot:top) rmean[j,]<-apply(RVALL[flag,],2,mean) } sighat<-sighat/(N-J) shatinv<-solve(sighat) KW<-0 for(j in 1:J){ KW<-KW+nvec[j]*t(rmean[j,])%*%shatinv%*%rmean[j,] } df<-K*(J-1) sig.level<-1-pchisq(KW,df) list(test.stat=KW[1,1],df=df,p.value=sig.level) } rimul<-function(J,K,x,alpha=.05,p=J*K,grp=c(1:p)){ # # Rank-based multiple comparisons for all interactions # in J by K design. The method is based on an # extension of Cliff's heteroscedastic technique for # handling tied values. # # The familywise type I error probability is controlled by using # a critical value from the Studentized maximum modulus distribution. # # It is assumed all groups are independent. # # Missing values are automatically removed. # # The default value for alpha is .05. Any other value results in using # alpha=.01. # # Argument grp can be used to rearrange the order of the data. # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in a matrix or in list mode.") CCJ<-(J^2-J)/2 CCK<-(K^2-K)/2 CC<-CCJ*CCK test<-matrix(NA,CC,7) test.p<-matrix(NA,CC,7) for(j in 1:J){ xx<-!is.na(x[[j]]) val<-x[[j]] x[[j]]<-val[xx] # Remove missing values } mat<-matrix(grp,ncol=K,byrow=T) dimnames(test)<-list(NULL,c("Factor A","Factor A","Factor B","Factor B","delta","ci.lower","ci.upper")) jcom<-0 crit<-smmcrit(200,CC) if(alpha!=.05)crit<-smmcrit01(200,CC) alpha<-1-pnorm(crit) for (j in 1:J){ for (jj in 1:J){ if (j < jj){ for (k in 1:K){ for (kk in 1:K){ if (k < kk){ jcom<-jcom+1 test[jcom,1]<-j test[jcom,2]<-jj test[jcom,3]<-k test[jcom,4]<-kk temp1<-cid(x[[mat[j,k]]],x[[mat[j,kk]]]) temp2<-cid(x[[mat[jj,k]]],x[[mat[jj,kk]]]) delta<-temp2$d-temp1$d sqse<-temp1$sqse.d+temp2$sqse.d test[jcom,5]<-delta/2 test[jcom,6]<-delta/2-crit*sqrt(sqse/4) test[jcom,7]<-delta/2+crit*sqrt(sqse/4) }}}}}} list(test=test) } signt<-function(x,y=NA,alpha=.05){ # # Do a sign test on data in x and y # If y=NA, assume x is a matrix with # two columns or has list mode. # if(is.na(y[1])){ if(is.matrix(x))dif<-x[,1]-x[,2] if(is.list(x))dif<-x[[1]]-x[[2]] } if(!is.na(y[1]))dif<-x-y n<-length(dif) dif<-dif[dif!=0] # Remove any zero values. flag<-(dif<0) temp<-binomci(y=flag) list(phat=temp$phat,ci=temp$ci,n=n,N=length(flag)) } sisplit<-function(J,K,x){ # # Check for interactions by comparing binomials # Here, have J by K (between by within) design # Only alpha=.05 is allowed. 
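#
# An illustrative usage sketch for signt (defined above); it calls binomci
# from this library for the confidence interval. Simulated paired data:
#   set.seed(4)
#   before<-rnorm(25)
#   after<-before+rnorm(25,mean=.3)
#   signt(before,after)                # returns $phat, $ci, $n and $N
#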
# p<-J*K connum<-(J^2-J)*(K^2-K)/4 if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in a matrix or in list mode") imap<-matrix(c(1:p),J,K,byrow=T) outm<-matrix(NA,ncol=8,nrow=connum) dimnames(outm)<-list(NULL,c("Fac.A","Fac.A","Fac.B","Fac.B","p1","p2","ci.low","ci.up")) ic<-0 if(connum <= 28)qval<-smmcrit(999,connum) if(connum > 28)qval<-2.383904*connum^.1-.202 aval<-4*(1-pnorm(qval)) if(J==2 && K==2)aval<-.05 if(J==5 && K==2)aval<-2*(1-pnorm(qval)) if(J==3 && K==2)aval<-3*(1-pnorm(qval)) if(J==4 && K==2)aval<-3*(1-pnorm(qval)) if(J==2 && K==3)aval<-3*(1-pnorm(qval)) for (j in 1:J){ for (jj in 1:J){ if(j 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.005,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01){ dvec<-alpha/c(1:ncon) dvec[1]<-alpha/2 } dvec<-2*dvec } if(bhop)dvec<-(ncon-c(1:ncon)+1)*alpha/ncon bvec<-matrix(NA,nrow=J,ncol=nboot) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") for(j in 1:J){ paste("Working on group ",j) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,est,...) # Bootstrapped values for jth group } test<-NA bcon<-t(con)%*%bvec #ncon by nboot matrix tvec<-t(con)%*%mvec for (d in 1:ncon){ test[d]<-(sum(bcon[d,]>0)+.5*sum(bcon[d,]==0))/nboot if(test[d]> .5)test[d]<-1-test[d] } test<-2*test output<-matrix(0,ncon,6) dimnames(output)<-list(NULL,c("con.num","psihat","p.value","p.crit","ci.lower","ci.upper")) temp2<-order(0-test) zvec<-dvec[1:ncon] sigvec<-(test[temp2]>=zvec) output[temp2,4]<-zvec icl<-round(dvec[ncon]*nboot/2)+1 icu<-nboot-icl-1 for (ic in 1:ncol(con)){ output[ic,2]<-tvec[ic,] output[ic,1]<-ic output[ic,3]<-test[ic] temp<-sort(bcon[ic,]) output[ic,5]<-temp[icl] output[ic,6]<-temp[icu] } num.sig<-sum(output[,3]<=output[,4]) list(output=output,con=con,num.sig=num.sig) } rmmcppbd<-function(x,y=NA,alpha=.05,con=0,est=mom,plotit=T,grp=NA,nboot=NA, hoch=F,...){ # # Use a percentile bootstrap method to compare dependent groups # based on difference scores. # By default, # compute a .95 confidence interval for all linear contasts # specified by con, a J by C matrix, where C is the number of # contrasts to be tested, and the columns of con are the # contrast coefficients. # If con is not specified, all pairwise comparisons are done. # # By default, MOM is use and a sequentially rejective method # is used to control the probability of at least one Type I error. # # nboot is the bootstrap sample size. If not specified, a value will # be chosen depending on the number of contrasts there are. # # x can be an n by J matrix or it can have list mode # for two groups, data for second group can be put in y # otherwise, assume x is a matrix (n by J) or has list mode. # # A sequentially rejective method is used to control alpha. 
# if(!is.na(y[1]))x<-cbind(x,y) if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ if(is.matrix(con)){ if(length(x)!=nrow(con))stop("The number of rows in con is not equal to the number of groups.") }} if(is.list(x)){ # put the data in an n by J matrix mat<-matl(x) } if(is.matrix(x) && is.matrix(con)){ if(ncol(x)!=nrow(con))stop("The number of rows in con is not equal to the number of groups.") mat<-x } if(is.matrix(x))mat<-x if(!is.na(sum(grp)))mat<-mat[,grp] x<-mat mat<-elimna(mat) # Remove rows with missing values. x<-mat J<-ncol(mat) Jm<-J-1 if(sum(con^2)==0){ d<-(J^2-J)/2 con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} d<-ncol(con) if(is.na(nboot)){ nboot<-5000 if(d<=10)nboot<-3000 if(d<=6)nboot<-2000 if(d<=4)nboot<-1000 } n<-nrow(mat) crit.vec<-alpha/c(1:d) connum<-ncol(con) # Create set of differences based on contrast coefficients xx<-x%*%con xx<-as.matrix(xx) set.seed(2) # set seed of random number generator so that # results can be duplicated. psihat<-matrix(0,connum,nboot) bvec<-matrix(NA,ncol=connum,nrow=nboot) print("Taking bootstrap samples. Please wait.") data<-matrix(sample(n,size=n*nboot,replace=T),nrow=nboot) # data is an nboot by n matrix if(ncol(xx)==1){ for(ib in 1:nboot)psihat[1,ib]<-est(xx[data[ib,]],...) } if(ncol(xx)>1){ for(ib in 1:nboot)psihat[,ib]<-apply(xx[data[ib,],],2,est,...) } # # Now have an nboot by connum matrix of bootstrap values. # test<-1 for (ic in 1:connum){ #test[ic]<-sum((psihat[ic,]>0))/nboot test[ic]<-(sum(psihat[ic,]>0)+.5*sum(psihat[ic,]==0))/nboot test[ic]<-min(test[ic],1-test[ic]) } test<-2*test ncon<-ncol(con) if(alpha==.05){ dvec<-c(.025,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.005,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01){ dvec<-alpha/c(1:ncon) dvec[2]<-alpha/2 } if(hoch)dvec<-alpha/(2*c(1:ncon)) dvec<-2*dvec if(plotit && connum==1){ plot(c(psihat[1,],0),xlab="",ylab="Est. Difference") points(psihat[1,]) abline(0,0) } temp2<-order(0-test) ncon<-ncol(con) zvec<-dvec[1:ncon] sigvec<-(test[temp2]>=zvec) output<-matrix(0,connum,6) dimnames(output)<-list(NULL,c("con.num","psihat","p.value","p.crit","ci.lower","ci.upper")) tmeans<-apply(xx,2,est,...) 
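#
# Sketch of the default contrast matrix built above when con is not supplied,
# shown here for J=3 groups; each column defines one pairwise difference:
#   J<-3
#   d<-(J^2-J)/2
#   con<-matrix(0,J,d)
#   id<-0
#   for(j in 1:(J-1)){
#   for(k in (j+1):J){
#   id<-id+1
#   con[j,id]<-1
#   con[k,id]<-0-1
#   }}
#   con                                # columns: groups 1-2, 1-3, 2-3
#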
psi<-1 icl<-round(dvec[ncon]*nboot/2)+1 icu<-nboot-icl-1 for (ic in 1:ncol(con)){ output[ic,2]<-tmeans[ic] output[ic,1]<-ic output[ic,3]<-test[ic] output[temp2,4]<-zvec temp<-sort(psihat[ic,]) output[ic,5]<-temp[icl] output[ic,6]<-temp[icu] } num.sig<-sum(output[,3]<=output[,4]) list(output=output,con=con,num.sig=num.sig) } bdms1<-function(x,con){ # This function is used by bdm # # Pool all data and rank pool<-x[[1]] JK<-length(x) for (j in 2:JK)pool<-c(pool,x[[j]]) N<-length(pool) rval<-rank(pool) rvec<-list() up<-length(x[[1]]) rvec[[1]]<-rval[1:up] rbar<-mean(rvec[[1]]) nvec<-length(rvec[[1]]) for(j in 2:JK){ down<-up+1 up<-down+length(x[[j]])-1 rvec[[j]]<-rval[down:up] nvec[j]<-length(rvec[[j]]) rbar[j]<-mean(rvec[[j]]) } phat<-(rbar-.5)/N phat<-as.matrix(phat) svec<-NA for(j in 1:JK)svec[j]<-sum((rvec[[j]]-rbar[j])^2)/(nvec[j]-1) svec<-svec/N^2 VN<-N*diag(svec/nvec) top<-con[1,1]*sum(diag(VN)) Ftest<-N*(t(phat)%*%con%*%phat)/top nu1<-con[1,1]^2*sum(diag(VN))^2/sum(diag(con%*%VN%*%con%*%VN)) lam<-diag(1/(nvec-1)) nu2<-sum(diag(VN))^2/sum(diag(VN%*%VN%*%lam)) sig<-1-pf(Ftest,nu1,nu2) list(F=Ftest,nu1=nu1,nu2=nu2,q.hat=phat,sig=sig) } r1mcp<-function(x,alpha=.05,bhop=F){ # # Do all pairwise comparisons using a modification of # the Brunner, Dette and Munk (1997) rank-based method. # FWE is controlled using Rom's technique. # # Setting bhop=T, FWE is controlled using the # Benjamini-Hochberg Method. # # The data are assumed to be stored in x in list mode or in a matrix. # # Missing values are automatically removed. # if(is.matrix(x))x <- listm(x) if(!is.list(x)) stop("Data must be stored in list mode or a matrix.") J<-length(x) for(j in 1:J) { xx <- x[[j]] x[[j]] <- xx[!is.na(xx)] # Remove missing values } # CC<-(J^2-J)/2 # Determine critical values ncon<-CC if(!bhop){ if(alpha==.05){ dvec<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01)dvec<-alpha/c(1:ncon) } if(bhop)dvec<-(ncon-c(1:ncon)+1)*alpha/ncon output<-matrix(0,CC,5) dimnames(output)<-list(NULL,c("Level","Level","test.stat","p.value","p.crit")) ic<-0 for(j in 1:J){ for(jj in 1:J){ if(j < jj){ ic<-ic+1 output[ic,1]<-j output[ic,2]<-jj temp<-bdm(x[c(j,jj)]) output[ic,3]<-temp$output$F output[ic,4]<-temp$output$sig }}} temp2<-order(0-output[,4]) output[temp2,5]<-dvec[1:length(temp2)] list(output=output) } tamhane<-function(x,x2=NA,cil=NA,crit=NA){ # # First stage of Tamhane's method # # x contains first stage data # x2 contains second stage data # # cil is the desired length of the confidence intervals. # That is, cil is the distance between the upper and lower # ends of the confidence intervals. # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") J<-length(x) tempn<-0 svec<-NA for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. 
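#
# An illustrative usage sketch for rmmcppbd (defined above), comparing three
# dependent groups through difference scores; est=median is used here so that
# no other estimator from this library is required (elimna must be sourced):
#   set.seed(5)
#   xm<-matrix(rnorm(60),ncol=3)       # n=20 measured under 3 conditions
#   xm[,3]<-xm[,3]+.5
#   rmmcppbd(xm,est=median,plotit=F)$output
#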
tempn[j]<-length(temp) x[[j]]<-temp svec[j]<-var(temp) } A<-sum(1/(tempn-1)) df<-J/A paste("The degrees of freedom are:",df) if(is.na(crit))stop("Enter a critical value and reexecute this function") if(is.na(cil))stop("To proceed, you must specify the length of the confidence intervals.") d<-(cil/(2*crit))^2 n.vec<-NA for(j in 1:J){ n.vec[j]<-max(tempn[j]+1,floor(svec[j]/d)+1) } ci.mat<-NA if(!is.na(x2[1])){ if(is.matrix(x2))x2<-listm(x2) if(!is.list(x2))stop("Data must be stored in list mode or in matrix mode.") TT<-NA U<-NA J<-length(x) nvec2<-NA for(j in 1:length(x)){ nvec2[j]<-length(x2[[j]]) if(nvec2[j] 0))stop("Missing values not allowed") rval<-rank(alldat[2:length(alldat)]) rdd<-mean(rval) # R bar ... xr<-list() il<-1-nvec[1] iu<-0 for(j in 1:p){ il<-il+nvec[j] iu<-iu+nvec[j] xr[[j]]<-rval[il:iu] } v<-matrix(0,p,p) Ja<-matrix(1,J,J) Ia<-diag(1,J) Pa<-Ia-Ja/J Jb<-matrix(1,K,K) Ib<-diag(1,K) Pb<-Ib-Jb/K cona<-kron(Pa,Ib) conb<-kron(Ia,Pb) conab<-kron(Pa,Pb) for(k in 1:K){ temp<-x[[k]] bigm<-matrix(temp,ncol=1) jk<-k for (j in 2:J){ jk<-jk+K tempc<-matrix(x[[jk]],ncol=1) bigm<-rbind(bigm,tempc) temp<-c(temp,x[[jk]]) }} N<-length(temp) rbbd<-NA for(k in 1:K){ bigm<-xr[[k]] jk<-k for (j in 2:J){ jk<-jk+K bigm<-c(bigm,xr[[jk]]) } #rbbd[k]<-mean(bigm) #R bar ..k } rbjk<-matrix(NA,nrow=J,ncol=K) #R_.jk ic<-0 for (j in 1:J){ for(k in 1:K){ ic<-ic+1 rbjk[j,k]<-mean(xr[[ic]]) # R bar_.jk }} for(k in 1:K)rbbd[k]<-mean(rbjk[,k]) rbj<-1 # R_.j. sigv<-0 njsam<-0 # n_j icc<-1-K ivec<-c(1:K)-K for (j in 1:J){ icc<-icc+K ivec<-ivec+K temp<-xr[ivec[1]:ivec[K]] temp<-matl(temp) tempv<-apply(temp,1,mean) njsam[j]<-nvec[icc] rbj[j]<-mean(rbjk[j,]) sigv[j]<-var(tempv) # var of R bar_ij. } nv<-sum(njsam) phat<-(rbjk-.5)/(nv*K) sv2<-sum(sigv/njsam) uv<-sum((sigv/njsam)^2) dv<-sum((sigv/njsam)^2/(njsam-1)) testA<-J*var(rbj)/sv2 klow<-1-K kup<-0 sv<-matrix(0,nrow=K,ncol=K) rvk<-NA for(j in 1:J){ klow<-klow+K kup<-kup+K sel<-c(klow:kup) m<-matl(xr[klow:kup]) m<-elimna(m) xx<-listm(m) xx<-listm(m) vsub<-nv*var(m)/(nv*K*nv*K*as.double(njsam[j])) v[sel,sel]<-vsub sv<-sv+vsub } sv<-sv/J^2 testB<-nv/(nv*K*nv*K*sum(diag(Pb%*%sv)))*sum((rbbd-mean(rbbd))^2) testAB<-0 for (j in 1:J){ for (k in 1:K){ testAB<-testAB+(rbjk[j,k]-rbj[j]-rbbd[k]+rdd)^2 }} testAB<-nv*testAB/(nv*K*nv*K*sum(diag(conab%*%v))) nu1B<-(sum(diag(Pb%*%sv)))^2/sum((diag(Pb%*%sv%*%Pb%*%sv))) nu1A<-(J-1)^2/(1+J*(J-2)*uv/sv2^2) nu2A<-sv2^2/dv nu1AB<-(sum(diag(conab%*%v)))^2/sum(diag(conab%*%v%*%conab%*%v)) sig.A<-1-pf(testA,nu1A,nu2A) sig.B<-1-pf(testB,nu1B,1000000) sig.AB<-1-pf(testAB,nu1AB,1000000) list(test.A=testA,p.value.A=sig.A,test.B=testB,p.value.B=sig.B,test.AB=testAB, p.value.AB=sig.AB,avg.ranks=rbjk,rel.effects=phat) } r2mcp<-function(J,K,x,grp=NA,alpha=.05,bhop=F){ # # Do all pairwise comparisons of # main effects for Factor A and B and all interactions # using a rank-based method that tests for equal distributions. # # The data are assumed to be stored in x in list mode or in a matrix. # If grp is unspecified, it is assumed x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second factor: level 1,2 # x[[j+1]] is the data for level 2,1, etc. # If the data are in the wrong order, grp can be used to rearrange the # groups. For example, for a two by two design, grp<-c(2,4,3,1) # indicates that the second group corresponds to level 1,1; # group 4 corresponds to level 1,2; group 3 is level 2,1; # and group 1 is level 2,2. 
# # Missing values are automatically removed. # JK <- J * K if(is.matrix(x)) x <- listm(x) if(!is.na(grp[1])) { yy <- x x<-list() for(j in 1:length(grp)) x[[j]] <- yy[[grp[j]]] } if(!is.list(x)) stop("Data must be stored in list mode or a matrix.") for(j in 1:JK) { xx <- x[[j]] x[[j]] <- xx[!is.na(xx)] # Remove missing values } # if(JK != length(x)){ print("Warning: The number of groups does not match") print("the number of contrast coefficients.") } for(j in 1:JK){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. x[[j]]<-temp } # CC<-(J^2-J)/2 # Determine critical values ncon<-CC*(K^2-K)/2 if(!bhop){ if(alpha==.05){ dvec<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01)dvec<-alpha/c(1:ncon) } if(bhop)dvec<-(ncon-c(1:ncon)+1)*alpha/ncon Fac.A<-matrix(0,CC,5) dimnames(Fac.A)<-list(NULL,c("Level","Level","test.stat","p.value","p.crit")) mat<-matrix(c(1:JK),ncol=K,byrow=T) ic<-0 for(j in 1:J){ for(jj in 1:J){ if(j < jj){ ic<-ic+1 Fac.A[ic,1]<-j Fac.A[ic,2]<-jj temp<-bdm2way(2,K,x[c(mat[j,],mat[jj,])]) Fac.A[ic,3]<-temp$outputA$F Fac.A[ic,4]<-temp$outputA$sig }}} temp2<-order(0-Fac.A[,4]) Fac.A[temp2,5]<-dvec[1:length(temp2)] CCB<-(K^2-K)/2 ic<-0 Fac.B<-matrix(0,CCB,5) dimnames(Fac.B)<-list(NULL,c("Level","Level","test.stat","p.value","p.crit")) for(k in 1:K){ for(kk in 1:K){ if(k .5 || temp$ci.p[2] < .5) }}} list(test=test) } spmcpa<-function(J,K,x,est=tmean,JK=J*K,grp=c(1:JK),con=0,avg=F,alpha=.05, nboot=NA,pr=T,...){ # # A percentile bootstrap for multiple comparisons among # all main effects for independent groups in a split-plot design # The analysis is done by generating bootstrap samples and # using an appropriate linear contrast. # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. # If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. # # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } if(pr)print("As of Sept. 2005, est defaults to tmean") JK<-J*K data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. } x<-data jp<-1-K kv<-0 kv2<-0 for(j in 1:J){ jp<-jp+K xmat<-matrix(NA,ncol=K,nrow=length(x[[jp]])) for(k in 1:K){ kv<-kv+1 xmat[,k]<-x[[kv]] } xmat<-elimna(xmat) for(k in 1:K){ kv2<-kv2+1 x[[kv2]]<-xmat[,k] }} xx<-x set.seed(2) # set seed of random number generator so that # results can be duplicated. 
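#
# An illustrative call sketch for r2mcp (defined above) for a 2-by-3
# between-subjects design, with the 6 groups stored as columns of a matrix in
# the order (1,1),(1,2),(1,3),(2,1),(2,2),(2,3); r2mcp relies on listm and
# bdm2way from this library:
#   set.seed(6)
#   xg<-matrix(rnorm(120),ncol=6)      # 6 independent groups, n=20 each
#   r2mcp(2,3,xg)
#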
# Next determine the n_j values nvec<-NA jp<-1-K for(j in 1:J){ jp<-jp+K nvec[j]<-length(x[[jp]]) } if(avg){ d<-(J^2-J)/2 con<-matrix(0,J,d) id<-0 Jm<-J-1 for (j in 1:Jm){ jp<-j+1 for(k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} if(!avg){ MJK<-K*(J^2-J)/2 # NUMBER OF COMPARISONS JK<-J*K MJ<-(J^2-J)/2 cont<-matrix(0,nrow=J,ncol=MJ) ic<-0 for(j in 1:J){ for(jj in 1:J){ if(j1){ for(k in 2:K){ con1<-push(con1) con<-cbind(con,con1) }}} d<-ncol(con) if(is.na(nboot)){ if(d<=4)nboot<-1000 if(d>4)nboot<-5000 } # # Now take bootstrap samples from jth level # of Factor A and average K corresponding estimates # of location. # bloc<-matrix(NA,nrow=J,ncol=nboot) print("Taking bootstrap samples. Please wait.") mvec<-NA ik<-0 for(j in 1:J){ paste("Working on level ",j," of Factor A") x<-matrix(NA,nrow=nvec[j],ncol=K) # for(k in 1:K){ ik<-ik+1 x[,k]<-xx[[ik]] if(!avg)mvec[ik]<-est(xx[[ik]],...) } tempv<-apply(x,2,est,...) data<-matrix(sample(nvec[j],size=nvec[j]*nboot,replace=T),nrow=nboot) bvec<-matrix(NA,ncol=K,nrow=nboot) mat<-listm(x) for(k in 1:K){ temp<-x[,k] bvec[,k]<-apply(data,1,rmanogsub,temp,est,...) # An nboot by K matrix } if(avg){ mvec[j]<-mean(tempv) bloc[j,]<-apply(bvec,1,mean) } if(!avg){ if(j==1)bloc<-bvec if(j>1)bloc<-cbind(bloc,bvec) } } if(avg)bloc<-t(bloc) connum<-d psihat<-matrix(0,connum,nboot) test<-1 for (ic in 1:connum){ psihat[ic,]<-apply(bloc,1,bptdpsi,con[,ic]) #test[ic]<-sum((psihat[ic,]>0))/nboot test[ic]<-(sum(psihat[ic,]>0)+.5*sum(psihat[ic,]==0))/nboot test[ic]<-min(test[ic],1-test[ic]) } ncon<-ncol(con) if(alpha==.05){ dvec<-c(.025,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.005,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01){ dvec<-alpha/c(1:ncon) dvec[1]<-alpha/2 } temp2<-order(0-test) ncon<-ncol(con) zvec<-dvec[1:ncon] sigvec<-(test[temp2]>=zvec) output<-matrix(0,connum,6) dimnames(output)<-list(NULL,c("con.num","psihat","p.value","p.sig","ci.lower","ci.upper")) tmeans<-mvec psi<-1 output[temp2,4]<-zvec for (ic in 1:ncol(con)){ output[ic,2]<-sum(con[,ic]*tmeans) output[ic,1]<-ic output[ic,3]<-test[ic] temp<-sort(psihat[ic,]) temp3<-round(output[ic,4]*nboot)+1 icl<-round(dvec[ncon]*nboot)+1 icu<-nboot-(icl-1) output[ic,5]<-temp[icl] output[ic,6]<-temp[icu] } output[,3]<-2*output[,3] output[,4]<-2*output[,4] num.sig<-sum(output[,3]<=output[,4]) list(output=output,con=con,num.sig=num.sig) } spmcpi<-function(J,K,x,est=tmean,JK=J*K,grp=c(1:JK),alpha=.05,nboot=NA, SEED=T,pr=T,...){ # # Multiple comparisons for interactions # in a split-plot design. # The analysis is done by taking difference scores # among all pairs of dependent groups and # determining which of # these differences differ across levels of Factor A. # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. # If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. 
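#
# An illustrative usage sketch for spmcpa (defined above): a 2-by-3 split-plot
# design with n=20 subjects at level 1 of Factor A and n=15 at level 2. The
# six list elements follow the layout described above, x[[1]]=(1,1),
# x[[2]]=(1,2), x[[3]]=(1,3), x[[4]]=(2,1), and so on. est=median avoids other
# estimators; helpers such as elimna, rmanogsub, push and bptdpsi from this
# library must be sourced:
#   set.seed(7)
#   xs<-list()
#   for(k in 1:3)xs[[k]]<-rnorm(20)    # level 1 of A, levels 1-3 of B
#   for(k in 4:6)xs[[k]]<-rnorm(15)    # level 2 of A, levels 1-3 of B
#   spmcpa(2,3,xs,est=median)$output
#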
# # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } if(pr)print("As of Sept. 2005, est defaults to tmean") JK<-J*K if(JK!=length(x)){ print("Something is wrong.") paste(" Expected ",JK," groups but x contains ", length(x), "groups instead.") stop() } MJ<-(J^2-J)/2 MK<-(K^2-K)/2 JMK<-J*MK Jm<-J-1 data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. } x<-data jp<-1-K kv<-0 kv2<-0 for(j in 1:J){ jp<-jp+K xmat<-matrix(NA,ncol=K,nrow=length(x[[jp]])) for(k in 1:K){ kv<-kv+1 xmat[,k]<-x[[kv]] } xmat<-elimna(xmat) for(k in 1:K){ kv2<-kv2+1 x[[kv2]]<-xmat[,k] }} xx<-x if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. # Next determine the n_j values nvec<-NA jp<-1-K for(j in 1:J){ jp<-jp+K nvec[j]<-length(x[[jp]]) } # MJMK<-MJ*MK con<-matrix(0,nrow=JMK,ncol=MJMK) cont<-matrix(0,nrow=J,ncol=MJ) ic<-0 for(j in 1:J){ for(jj in 1:J){ if(j1){ for(k in 2:MK){ con1<-push(con1) con<-cbind(con,con1) }} d<-ncol(con) if(is.na(nboot)){ if(d<=4)nboot<-1000 if(d>4)nboot<-5000 } connum<-d psihat<-matrix(0,connum,nboot) # # Now take bootstrap samples from jth level # of Factor A and average K corresponding estimates # of location. # bloc<-matrix(NA,ncol=J,nrow=nboot) print("Taking bootstrap samples. Please wait.") mvec<-NA it<-0 for(j in 1:J){ paste("Working on level ",j," of Factor A") x<-matrix(NA,nrow=nvec[j],ncol=MK) # im<-0 for(k in 1:K){ for(kk in 1:K){ if(k1)bloc<-cbind(bloc,bvec) } test<-1 for (ic in 1:connum){ psihat[ic,]<-apply(bloc,1,bptdpsi,con[,ic]) #test[ic]<-sum((psihat[ic,]>0))/nboot test[ic]<-(sum(psihat[ic,]>0)+.5*sum(psihat[ic,]==0))/nboot test[ic]<-min(test[ic],1-test[ic]) } ncon<-ncol(con) if(alpha==.05){ dvec<-c(.025,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.005,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01){ dvec<-alpha/c(1:ncon) dvec[1]<-alpha/2 } temp2<-order(0-test) ncon<-ncol(con) zvec<-dvec[1:ncon] sigvec<-(test[temp2]>=zvec) output<-matrix(0,connum,6) dimnames(output)<-list(NULL,c("con.num","psihat","sig","crit.sig","ci.lower","ci.upper")) tmeans<-mvec psi<-1 for (ic in 1:ncol(con)){ output[ic,2]<-sum(con[,ic]*tmeans) output[ic,1]<-ic output[ic,3]<-test[ic] output[temp2,4]<-zvec temp<-sort(psihat[ic,]) icl<-round(dvec[ncon]*nboot)+1 icu<-nboot-(icl-1) output[ic,5]<-temp[icl] output[ic,6]<-temp[icu] } output[,3]<-2*output[,3] output[,4]<-2*output[,4] num.sig<-sum(output[,3]<=output[,4]) list(output=output,con=con,num.sig=num.sig) } sppba<-function(J,K,x,est=mom,JK=J*K,grp=c(1:JK),avg=F,nboot=500,SEED=T,...){ # # A percentile bootstrap for main effects # among independent groups in a split-plot design # # avg=T: The analysis is done by averaging K measures of # location for each level of Factor A, # and then comparing averages by testing the hypothesis # that all pairwise differences are equal to zero. # # avg=F: The analysis is done by testing whether $K$ equalities are # simultaneously true. For kth level of Factor B, the kth equality is # theta_{1k}= ... theta_{Jk}, k=1,...,K. # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. 
# If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. # # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # library(MASS) if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } JK<-J*K data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. } x<-data jp<-1-K kv<-0 kv2<-0 for(j in 1:J){ jp<-jp+K xmat<-matrix(NA,ncol=K,nrow=length(x[[jp]])) for(k in 1:K){ kv<-kv+1 xmat[,k]<-x[[kv]] } xmat<-elimna(xmat) for(k in 1:K){ kv2<-kv2+1 x[[kv2]]<-xmat[,k] } } xx<-x if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. # Next determine the n_j values nvec<-NA jp<-1-K for(j in 1:J){ jp<-jp+K nvec[j]<-length(x[[jp]]) } # # Now take bootstrap samples from jth level # of Factor A. # bloc<-matrix(NA,nrow=J,ncol=nboot) print("Taking bootstrap samples. Please wait.") mvec<-NA ik<-0 for(j in 1:J){ paste("Working on level ",j," of Factor A") x<-matrix(NA,nrow=nvec[j],ncol=K) # for(k in 1:K){ ik<-ik+1 x[,k]<-xx[[ik]] if(!avg)mvec[ik]<-est(xx[[ik]],...) } tempv<-apply(x,2,est,...) data<-matrix(sample(nvec[j],size=nvec[j]*nboot,replace=T),nrow=nboot) bvec<-matrix(NA,ncol=K,nrow=nboot) for(k in 1:K){ temp<-x[,k] bvec[,k]<-apply(data,1,rmanogsub,temp,est,...) # An nboot by K matrix } if(avg){ mvec[j]<-mean(tempv) bloc[j,]<-apply(bvec,1,mean) } if(!avg){ if(j==1)bloc<-bvec if(j>1)bloc<-cbind(bloc,bvec) } } if(avg){ d<-(J^2-J)/2 con<-matrix(0,J,d) id<-0 Jm<-J-1 for (j in 1:Jm){ jp<-j+1 for(k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} if(!avg){ MJK<-K*(J^2-J)/2 # NUMBER OF COMPARISONS JK<-J*K MJ<-(J^2-J)/2 cont<-matrix(0,nrow=J,ncol=MJ) ic<-0 for(j in 1:J){ for(jj in 1:J){ if(j1){ for(k in 2:K){ con1<-push(con1) con<-cbind(con,con1) }}} if(!avg)bcon<-t(con)%*%t(bloc) #C by nboot matrix if(avg)bcon<-t(con)%*%(bloc) tvec<-t(con)%*%mvec tvec<-tvec[,1] tempcen<-apply(bcon,1,mean) vecz<-rep(0,ncol(con)) bcon<-t(bcon) smat<-var(bcon-tempcen+tvec) bcon<-rbind(bcon,vecz) chkrank<-qr(smat)$rank if(chkrank==ncol(smat))dv<-mahalanobis(bcon,tvec,smat) if(chkrank=dv[1:nboot])/nboot list(p.value=sig.level,psihat=tvec,con=con) } sppbb<-function(J,K,x,est=mom,JK=J*K,grp=c(1:JK),nboot=500,SEED=T,...){ # # A percentile bootstrap for main effects # among dependent groups in a split-plot design # The analysis is done based on all pairs # of difference scores. The null hypothesis is that # all such differences have a typical value of zero. # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. # If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. 
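#
# An illustrative usage sketch for sppba (defined above), testing the Factor A
# main effect in a 2-by-2 split-plot design stored in the same layout as
# described above; est=median avoids other estimators, and rmanogsub and
# elimna from this library must be sourced:
#   set.seed(8)
#   xs<-list()
#   for(k in 1:2)xs[[k]]<-rnorm(20)    # level 1 of A
#   for(k in 3:4)xs[[k]]<-rnorm(15)+.5 # level 2 of A
#   sppba(2,2,xs,est=median,avg=T)$p.value
#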
# # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } JK<-J*K data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. } x<-data jp<-1-K kv<-0 kv2<-0 for(j in 1:J){ jp<-jp+K xmat<-matrix(NA,ncol=K,nrow=length(x[[jp]])) for(k in 1:K){ kv<-kv+1 xmat[,k]<-x[[kv]] } xmat<-elimna(xmat) for(k in 1:K){ kv2<-kv2+1 x[[kv2]]<-xmat[,k] }} xx<-x if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. # Next determine the n_j values nvec<-NA jp<-1-K for(j in 1:J){ jp<-jp+K nvec[j]<-length(x[[jp]]) } # # Now stack the data in an N by K matrix # x<-matrix(NA,nrow=nvec[1],ncol=K) # for(k in 1:K)x[,k]<-xx[[k]] kc<-K for(j in 2:J){ temp<-matrix(NA,nrow=nvec[j],ncol=K) for(k in 1:K){ kc<-kc+1 temp[,k]<-xx[[kc]] } x<-rbind(x,temp) } # Now call function rmdzero to do the analysis temp<-rmdzero(x,est=est,nboot=nboot,...) list(p.value=temp$p.value,center=temp$center) } sppbi<-function(J,K,x,est=mom,JK=J*K,grp=c(1:JK),nboot=500,SEED=T,...){ # # A percentile bootstrap for interactions # in a split-plot design. # The analysis is done by taking difference scores # among all pairs of dependent groups and seeing whether # these differences differ across levels of Factor A. # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. # If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. # # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # library(MASS) if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } JK<-J*K MJ<-(J^2-J)/2 MK<-(K^2-K)/2 JMK<-J*MK Jm<-J-1 data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. } x<-data jp<-1-K kv<-0 kv2<-0 for(j in 1:J){ jp<-jp+K xmat<-matrix(NA,ncol=K,nrow=length(x[[jp]])) for(k in 1:K){ kv<-kv+1 xmat[,k]<-x[[kv]] } xmat<-elimna(xmat) for(k in 1:K){ kv2<-kv2+1 x[[kv2]]<-xmat[,k] }} xx<-x if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. # Next determine the n_j values nvec<-NA jp<-1-K for(j in 1:J){ jp<-jp+K nvec[j]<-length(x[[jp]]) } # # Now take bootstrap samples from jth level # of Factor A and average K corresponding estimates # of location. # bloc<-matrix(NA,ncol=J,nrow=nboot) print("Taking bootstrap samples. 
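#
# An illustrative usage sketch for sppbb (defined above), which tests the
# Factor B main effect through difference scores by calling rmdzero from this
# library:
#   set.seed(9)
#   xs<-list()
#   for(k in 1:2)xs[[k]]<-rnorm(20)    # level 1 of A, levels 1-2 of B
#   for(k in 3:4)xs[[k]]<-rnorm(15)    # level 2 of A, levels 1-2 of B
#   sppbb(2,2,xs,est=median)$p.value
#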
Please wait.") mvec<-NA it<-0 for(j in 1:J){ paste("Working on level ",j," of Factor A") x<-matrix(NA,nrow=nvec[j],ncol=MK) # im<-0 for(k in 1:K){ for(kk in 1:K){ if(k1)bloc<-cbind(bloc,bvec) } # MJMK<-MJ*MK con<-matrix(0,nrow=JMK,ncol=MJMK) cont<-matrix(0,nrow=J,ncol=MJ) ic<-0 for(j in 1:J){ for(jj in 1:J){ if(j1){ for(k in 2:MK){ con1<-push(con1) con<-cbind(con,con1) }} bcon<-t(con)%*%t(bloc) #C by nboot matrix tvec<-t(con)%*%mvec tvec<-tvec[,1] tempcen<-apply(bcon,1,mean) vecz<-rep(0,ncol(con)) bcon<-t(bcon) smat<-var(bcon-tempcen+tvec) chkrank<-qr(smat)$rank bcon<-rbind(bcon,vecz) if(chkrank==ncol(smat))dv<-mahalanobis(bcon,tvec,smat) if(chkrank=dv[1:nboot])/nboot list(p.value=sig.level,psihat=tvec,con=con) } spmcpb<-function(J,K,x,est=mom,JK=J*K,grp=c(1:JK),dif=T,alpha=.05, nboot=NA,pr=T,...){ # # A percentile bootstrap for all pairwise # multiple comparisons # among dependent groups in a split-plot design # # If dif=T, the analysis is done based on all pairs # of difference scores. # Otherwise, marginal measures of location are used. # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. # If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. # # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } if(pr)print("As of Sept. 2005, est defaults to tmean") JK<-J*K data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. } x<-data jp<-1-K kv<-0 kv2<-0 for(j in 1:J){ jp<-jp+K xmat<-matrix(NA,ncol=K,nrow=length(x[[jp]])) for(k in 1:K){ kv<-kv+1 xmat[,k]<-x[[kv]] } xmat<-elimna(xmat) for(k in 1:K){ kv2<-kv2+1 x[[kv2]]<-xmat[,k] }} xx<-x set.seed(2) # set seed of random number generator so that # results can be duplicated. # Next determine the n_j values nvec<-NA jp<-1-K for(j in 1:J){ jp<-jp+K nvec[j]<-length(x[[jp]]) } # # Now stack the data in an N by K matrix # x<-matrix(NA,nrow=nvec[1],ncol=K) # for(k in 1:K)x[,k]<-xx[[k]] kc<-K for(j in 2:J){ temp<-matrix(NA,nrow=nvec[j],ncol=K) for(k in 1:K){ kc<-kc+1 temp[,k]<-xx[[kc]] x<-rbind(x,temp) }} # Now call function rmmcppb to do the analysis temp<-rmmcppb(x,est=est,nboot=nboot,dif=dif,alpha=alpha,plotit=F,...) list(output=temp$output,con=temp$con,num.sig=temp$num.sig) } bwamcp<-function(J,K,x,tr=.2,JK=J*K,grp=c(1:JK),alpha=.05,KB=F,op=T){ # # All pairwise comparisons among levels of Factor A # in a split-plot design using trimmed means. # # Data among dependent groups are pooled for each level # of Factor A. # Then this function calls lincon. # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. # If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. 
# # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } JK<-J*K if(!op){ data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. } x<-data data<-list() jp<-1-K kv<-0 for(j in 1:J){ jp<-jp+K for(k in 1:K){ kv<-kv+1 if(k==1)temp<-x[[jp]] if(k>1)temp<-c(temp,x[[kv]]) } data[[j]]<-temp } print("Group numbers refer to levels of Factor A") temp<-lincon(data,tr=tr,alpha=alpha,KB=KB) } if(op){ MJK<-K*(J^2-J)/2 # NUMBER OF COMPARISONS JK<-J*K MJ<-(J^2-J)/2 cont<-matrix(0,nrow=J,ncol=MJ) ic<-0 for(j in 1:J){ for(jj in 1:J){ if(j1){ for(k in 2:K){ con1<-push(con1) con<-cbind(con,con1) }} print("Contrast Matrix Used:") print(con) temp<-lincon(x,con=con,tr=tr,KB=KB,alpha=alpha) } temp } bwbmcp<-function(J,K,x,tr=.2,JK=J*K,grp=c(1:JK),con=0,alpha=.05,dif=T,pool=F){ # # All pairwise comparisons among levels of Factor B # in a split-plot design using trimmed means. # # Data are pooled for each level # of Factor B. # Then this function calls rmmcp. # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. # If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. # # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } JK<-J*K data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. } x<-data if(pool){ data<-list() m1<-matrix(c(1:JK),J,K,byrow=T) for(k in 1:K){ for(j in 1:J){ flag<-m1[j,k] if(j==1)temp<-x[[flag]] if(j>1){ temp<-c(temp,x[[flag]]) }} data[[k]]<-temp } print("Group numbers refer to levels of Factor B") temp<-rmmcp(data,con=con,tr=tr,alpha=alpha,dif=dif) return(temp) } if(!pool){ mat<-matrix(c(1:JK),ncol=K,byrow=T) for(j in 1:J){ data<-list() ic<-0 for(k in 1:K){ ic<-ic+1 data[[ic]]<-x[[mat[j,k]]] } paste("For level ", j, " of Factor A:") temp<-rmmcp(data,con=con,tr=tr,alpha=alpha,dif=dif) print(temp$test) print(temp$psihat) }} } pcor<-function(x,y=NA){ if(!is.na(y[1]))temp<-wincor(x,y,tr=0) if(is.na(y[1]))temp<-winall(x,tr=0) list(cor=temp$cor,siglevel=temp$siglevel) } apgdis<-function(m,est=sum,se=T,...){ # # For multivariate data, # compute distance between each pair # of points and measure depth of a point # in terms of its distance to all # other points # # Using se=T ensures that ordering of distance # will not change with a change in scale. # # m is an n by p matrix # m<-elimna(m) # eliminate any missing values temp<-0 if(se){ for(j in 1:ncol(m))m[,j]<-(m[,j]-median(m[,j]))/mad(m[,j]) } for(j in 1:ncol(m)){ disx<-outer(m[,j],m[,j],"-") temp<-temp+disx^2 } temp<-sqrt(temp) dis<-apply(temp,1,est,...) temp2<-order(dis) center<-m[temp2[1],] list(center=center,distance=dis) } rd2plot<-function(x,y,fr=.8,xlab="",ylab=""){ # # Expected frequency curve # for two groups. 
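#
# An illustrative usage sketch for apgdis (defined above), which gauges how
# deeply each point lies in a multivariate cloud through its summed distances
# to all other points (smaller values are deeper); elimna from this library
# must be sourced:
#   set.seed(10)
#   m<-matrix(rnorm(60),ncol=3)
#   fit<-apgdis(m)
#   fit$center                         # deepest point
#   order(fit$distance)[1:5]           # the five most central rows
#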
# # fr controls amount of smoothing x<-elimna(x) y<-elimna(y) rmdx<-NA rmdy<-NA for(i in 1:length(x)){ rmdx[i]<-sum(near(x,x[i],fr)) } for(i in 1:length(y)){ rmdy[i]<-sum(near(y,y[i],fr)) } rmdx<-rmdx/length(x) rmdy<-rmdy/length(y) plot(c(x,y),c(rmdx,rmdy),type="n",ylab=ylab,xlab=xlab) sx<-sort(x) xorder<-order(x) sysm<-rmdx[xorder] lines(sx,sysm) sy<-sort(y) yorder<-order(y) sysm<-rmdy[yorder] lines(sy,sysm,lty=2) } depth2<-function(x,pts=NA,plotit=T,xlab="VAR 1",ylab="VAR 2"){ # # Compute exact depths for bivariate data if(ncol(x)!=2)stop("x must be a matrix with 2 columns") x<-elimna(x) if(is.na(pts[1]))pts<-x if(ncol(pts)!=2)stop("Argument pts must be stored as a matrix with 2 columns") pts<-as.matrix(pts) ndepth<-NA for(i in 1:nrow(pts)){ ndepth[i]<-depth(pts[i,1],pts[i,2],x) } if(plotit){ m<-x plot(m,xlab=xlab,ylab=ylab) flag<-(ndepth==max(ndepth)) if(sum(flag)==1)center<-m[flag,] if(sum(flag)>1)center<-apply(m[flag,],2,mean) points(center[1],center[2],pch="+") temp<-ndepth flag<-(temp>=median(temp)) xx<-x[flag,] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) } ndepth } fdepth<-function(m,pts=NA,plotit=T,cop=2,center=NA,xlab="VAR 1", ylab="VAR 2"){ # # Determine depth of points in pts, relative to # points in m. If pts is not specified, # depth of all points in m are determined. # # m and pts can be vectors or matrices with # p columns (the number of variables). # # Determine center, for each point, draw a line # connecting it with center, project points onto this line # and determine depth of the projected points. # The final depth of a point is its minimum depth # among all projections. # # plotit=T creates a scatterplot when working with # bivariate data and pts=NA # # There are three options for computing the center of the # cloud of points when computing projections, assuming center=NA: # # cop=2 uses MCD center # cop=3 uses median of the marginal distributions. # cop=4 uses MVE center # # If a value for center is passed to this function, # this value is used to determine depths. # # When plotting, # center is marked with a cross, +. 
# library(MASS) if(cop!=2 && cop!=3 && cop!=4)stop("Only cop=2, 3 or 4 is allowed") if(is.list(m))stop("Store data in a matrix; might use function listm") m<-as.matrix(m) pts<-as.matrix(pts) if(!is.na(pts[1]))remm<-m nm<-nrow(m) nm1<-nm+1 if(!is.na(pts[1])){ if(ncol(m)!=ncol(pts))stop("Number of columns of m is not equal to number of columns for pts") } m<-elimna(m) # Remove missing values m<-as.matrix(m) if(ncol(m)==1)dep<-unidepth(as.vector(m[,1]),pts=pts) if(ncol(m)>1){ if(is.na(center[1])){ if(cop==2){ center<-cov.mcd(m)$center } if(cop==4){ center<-cov.mve(m)$center } if(cop==3){ center<-apply(m,2,median) }} if(is.na(pts[1])){ mdep <- matrix(NA,nrow=nrow(m),ncol=nrow(m)) } if(!is.na(pts[1])){ mdep <- matrix(NA,nrow=nrow(m),ncol=nrow(pts)) } for (i in 1:nrow(m)){ B<-m[i,]-center dis<-NA BB<-B^2 bot<-sum(BB) if(bot!=0){ if(is.na(pts[1])){ for (j in 1:nrow(m)){ A<-m[j,]-center temp<-sum(A*B)*B/bot dis[j]<-sign(sum(A*B))*sqrt(sum(temp^2)) }} if(!is.na(pts[1])){ m<-rbind(remm,pts) for (j in 1:nrow(m)){ A<-m[j,]-center temp<-sum(A*B)*B/bot dis[j]<-sign(sum(A*B))*sqrt(sum(temp^2)) }} # # For ith projection, store depths of # points in mdep[i,] # if(is.na(pts[1]))mdep[i,]<-unidepth(dis) if(!is.na(pts[1])){ mdep[i,]<-unidepth(dis[1:nm],dis[nm1:nrow(m)]) }} if(bot==0)mdep[i,]<-rep(0,ncol(mdep)) } dep<-apply(mdep,2,min) if(ncol(m)==2 && is.na(pts[1])){ flag<-chull(m) dep[flag]<-min(dep) } } if(ncol(m)==2){ if(is.na(pts[1]) && plotit){ plot(m,xlab=xlab,ylab=ylab) points(center[1],center[2],pch="+") x<-m temp<-dep flag<-(temp>=median(temp)) xx<-x[flag,] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) }} dep<-round(dep*nrow(m))/nrow(m) dep } unidepth<-function(x,pts=NA){ # # Determine depth of points in the vector x # if(!is.vector(x))stop("x should be a vector") if(is.na(pts[1]))pts<-x pup<-apply(outer(pts,x,FUN="<="),1,sum)/length(x) pdown<-apply(outer(pts,x,FUN="<"),1,sum)/length(x) pdown<-1-pdown m<-matrix(c(pup,pdown),nrow=2,byrow=T) dep<-apply(m,2,min) dep } opreg<-function(x,y,regfun=tsreg,cop=3){ # # Do regression on points not labled outliers # using projection-type outlier detection method # x<-as.matrix(x) m<-cbind(x,y) m<-elimna(m) # eliminate any rows with missing data ivec<-outpro(m,plotit=F,cop=cop)$keep np1<-ncol(x)+1 coef<-regfun(m[ivec,1:ncol(x)],m[ivec,np1])$coef vec<-rep(1,length(y)) residuals<-y-cbind(vec,x)%*%coef list(coef=coef,residuals=residuals) } outpro<-function(m,gval=NA,center=NA,plotit=T,op=T,MM=F,cop=3, xlab="VAR 1",ylab="VAR 2"){ # # Detect outliers using a modification of the # Stahel-Donoho projection method. # # Determine center of data cloud, for each point, # connect it with center, project points onto this line # and use distances between projected points to detect # outliers. A boxplot method is used on the # projected distances. # # plotit=T creates a scatterplot when working with # bivariate data. # # op=T # means the .5 depth contour is plotted # based on data with outliers removed. # # op=F # means .5 depth contour is plotted without removing outliers. # # MM=F Use interquatile range when checking for outliers # MM=T uses MAD. # # If value for center is not specified, # there are four options for computing the center of the # cloud of points when computing projections: # # cop=2 uses MCD center # cop=3 uses median of the marginal distributions. 
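#
# An illustrative usage sketch for fdepth (defined above), computing
# approximate projection-based depths for bivariate data, with cop=3 so the
# marginal medians serve as the center (unidepth, defined above, is used
# internally):
#   set.seed(11)
#   m<-matrix(rnorm(80),ncol=2)
#   dep<-fdepth(m,plotit=F,cop=3)
#   m[which.max(dep),]                 # the deepest point
#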
# cop=4 uses MVE center # cop=5 uses TBS # cop=6 uses rmba (Olive's median ball algorithm) # cop=7 uses the spatial (L1) median # # When using cop=2, 3 or 4, default critical value for outliers # is square root of the .975 quantile of a # chi-squared distribution with p degrees # of freedom. # # Donoho-Gasko (Tukey) median is marked with a cross, +. # library(MASS) m<-as.matrix(m) if(ncol(m)==1){ dis<-(m-median(m))^2/mad(m)^2 dis<-sqrt(dis) crit<-sqrt(qchisq(.975,1)) chk<-ifelse(dis>crit,1,0) vec<-c(1:nrow(m)) outid<-vec[chk==1] keep<-vec[chk==0] } if(ncol(m)>1){ if(is.na(gval) && cop==1)gval<-sqrt(qchisq(.95,ncol(m))) if(is.na(gval) && cop!=1)gval<-sqrt(qchisq(.975,ncol(m))) m<-elimna(m) # Remove missing values if(cop==1 && is.na(center[1])){ if(ncol(m)>2)center<-dmean(m,tr=.5,cop=1) if(ncol(m)==2){ tempd<-NA for(i in 1:nrow(m)) tempd[i]<-depth(m[i,1],m[i,2],m) mdep<-max(tempd) flag<-(tempd==mdep) if(sum(flag)==1)center<-m[flag,] if(sum(flag)>1)center<-apply(m[flag,],2,mean) }} if(cop==2 && is.na(center[1])){ center<-cov.mcd(m)$center } if(cop==4 && is.na(center[1])){ center<-cov.mve(m)$center } if(cop==3 && is.na(center[1])){ center<-apply(m,2,median) } if(cop==5 && is.na(center[1])){ center<-tbs(m)$center } if(cop==6 && is.na(center[1])){ center<-rmba(m)$center } if(cop==7 && is.na(center[1])){ center<-spat(m) } flag<-rep(0, nrow(m)) outid <- NA vec <- c(1:nrow(m)) for (i in 1:nrow(m)){ B<-m[i,]-center dis<-NA BB<-B^2 bot<-sum(BB) if(bot!=0){ for (j in 1:nrow(m)){ A<-m[j,]-center temp<-sum(A*B)*B/bot dis[j]<-sqrt(sum(temp^2)) } temp<-idealf(dis) if(!MM)cu<-median(dis)+gval*(temp$qu-temp$ql) if(MM)cu<-median(dis)+gval*mad(dis) outid<-NA temp2<-(dis> cu) flag[temp2]<-1 }} if(sum(flag) == 0) outid <- NA if(sum(flag) > 0)flag<-(flag==1) outid <- vec[flag] idv<-c(1:nrow(m)) keep<-idv[!flag] if(ncol(m)==2){ if(plotit){ plot(m[,1],m[,2],type="n",xlab=xlab,ylab=ylab) points(m[keep,1],m[keep,2],pch="*") if(length(outid)>0)points(m[outid,1],m[outid,2],pch="o") if(op){ tempd<-NA keep<-keep[!is.na(keep)] mm<-m[keep,] for(i in 1:nrow(mm))tempd[i]<-depth(mm[i,1],mm[i,2],mm) mdep<-max(tempd) flag<-(tempd==mdep) if(sum(flag)==1)center<-mm[flag,] if(sum(flag)>1)center<-apply(mm[flag,],2,mean) m<-mm } points(center[1],center[2],pch="+") x<-m temp<-fdepth(m,plotit=F) flag<-(temp>=median(temp)) xx<-x[flag,] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) }}} list(out.id=outid,keep=keep) } bmp<-function(x,y,alpha=.05,crit=NA,plotit=F,pop=0,fr=.8,rval=15){ # # Brunner and Munzel (2000) heteroscedastic analog of WMW test. 
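#
# An illustrative usage sketch for outpro (defined above): a projection-based
# outlier check on simulated bivariate data with one planted outlier; cop=3
# (the default) uses the marginal medians as the center, and elimna and
# idealf from this library must be sourced:
#   set.seed(12)
#   m<-cbind(rnorm(40),rnorm(40))
#   m[1,]<-c(7,7)
#   outpro(m,plotit=F)$out.id
#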
# x<-x[!is.na(x)] # Remove any missing values y<-y[!is.na(y)] n1<-length(x) n2<-length(y) N<-n1+n2 n1p1<-n1+1 flag1<-c(1:n1) flag2<-c(n1p1:N) R<-rank(c(x,y)) R1<-mean(R[flag1]) R2<-mean(R[flag2]) Rg1<-rank(x) Rg2<-rank(y) S1sq<-sum((R[flag1]-Rg1-R1+(n1+1)/2)^2)/(n1-1) S2sq<-sum((R[flag2]-Rg2-R2+(n2+1)/2)^2)/(n2-1) sig1<-S1sq/n2^2 sig2<-S2sq/n1^2 se<-sqrt(N)*sqrt(N*(sig1/n1+sig2/n2)) bmtest<-(R2-R1)/se phat<-(R2-(n2+1)/2)/n1 dhat<-1-2*phat df<-(S1sq/n2 + S2sq/n1)^2/((S1sq/n2)^2/(n1-1)+(S2sq/n1)^2/(n2-1)) sig<-2 * (1 - pt(abs(bmtest),df)) if(is.na(crit))vv<-qt(alpha/2,df) if(!is.na(crit))vv<-crit ci.p<-c(phat+vv*se/N,phat-vv*se/N) if(plotit){ msave<-outer(x,y,FUN="-") if(pop==1 || pop==2){ if(length(x)*length(y)>2500){ print("Product of sample sizes exceeds 2500.") print("Execution time might be high when using pop=0 or 1") print("If this is case, might consider changing the argument pop") }} if(pop==0)akerd(as.vector(msave),fr=fr) if(pop==1)rdplot(as.vector(msave),fr=fr) if(pop==2)kdplot(as.vector(msave),rval=rval) if(pop==3)boxplot(as.vector(msave)) if(pop==4)stem(as.vector(msave)) if(pop==5)hist(as.vector(msave)) if(pop==6)skerd(as.vector(msave)) } list(test.stat=bmtest,phat=phat,dhat=dhat,p.value=sig,ci.p=ci.p,df=df) } mgvdep<-function(m,se=F){ # # Find the center of a scatterplot, add point that # increases the generalized variance by smallest amount # continue for all points # return the MGV depths. # # Essentially the same as mgvar which # determine MGV distances, only here, # follow convention that deepest points # have the largest numerical value. Here # depth of the deepest values equal one. # temp<-apgdis(m,se=se)$distance icen<-ncol(m) temp3<-order(temp) chkit<-sum(duplicated(temp[temp3[1:icen]])) icen<-icen+chkit flag<-rep(T,length(temp)) flag[temp3[1:icen]]<-F # set duplicated central values to F varvec<-0 varvec[!flag]<-NA while(sum(flag)>0){ ic<-0 chk<-NA remi<-NA for(i in 1:nrow(m)){ if(flag[i]){ ic<-ic+1 chk[ic]<-gvar(rbind(m[!flag,],m[i,])) remi[ic]<-i }} sor<-order(chk) k<-remi[sor[1]] varvec[k]<-chk[sor[1]] flag[k]<-F } varvec[is.na(varvec)]<-0 varvec<-1/(1+varvec) varvec } fdepthv2<-function(m,pts=NA,plotit=T){ # # Determine depth of points in pts relative to # points in m # # Draw a line between each pair of distinct points # and determine depth of the projected points. # The final depth of a point is its minimum depth # among all projections. # # This function is slower than fdepth and requires # space for a nc by nc matrix, nc=(n^2-n)/2. # But it allows # data to have a singular covariance matrix # and it provides a more accurate approximation of # halfspace depth. # # plotit=T creates a scatterplot when working with # bivariate data and pts=NA # # When plotting, # center is marked with a cross, +. 
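#
# An illustrative usage sketch for bmp (defined above), the Brunner-Munzel
# heteroscedastic analog of the Wilcoxon-Mann-Whitney test; with the default
# plotit=F only base R is needed:
#   set.seed(13)
#   x<-rnorm(20)
#   y<-rnorm(25,mean=.6)
#   bmp(x,y)                           # $p.value, $phat and $ci.p, among others
#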
# m<-elimna(m) # Remove missing values if(!is.na(pts[1]))remm<-m if(!is.matrix(m))dep<-unidepth(m) if(is.matrix(m)){ nm<-nrow(m) nt<-nm nm1<-nm+1 if(!is.na(pts[1])){ if(ncol(m)!=ncol(pts))stop("Number of columns of m is not equal to number of columns for pts") nt<-nm+nrow(pts) }} if(ncol(m)==1)depth<-unidepth(m) if(ncol(m)>1){ m<-elimna(m) # Remove missing values nc<-(nrow(m)^2-nrow(m))/2 if(is.na(pts[1]))mdep <- matrix(0,nrow=nc,ncol=nrow(m)) if(!is.na(pts[1])){ mdep <- matrix(0,nrow=nc,ncol=nrow(pts)) } ic<-0 for (iall in 1:nm){ for (i in 1:nm){ if(iall < i){ ic<-ic+1 B<-m[i,]-m[iall,] dis<-NA BB<-B^2 bot<-sum(BB) if(bot!=0){ if(is.na(pts[1])){ for (j in 1:nrow(m)){ A<-m[j,]-m[iall,] temp<-sum(A*B)*B/bot dis[j]<-sign(sum(A*B))*sqrt(sum(temp^2)) }} if(!is.na(pts[1])){ m<-rbind(remm,pts) for (j in 1:nrow(m)){ A<-m[j,]-m[iall,] temp<-sum(A*B)*B/bot dis[j]<-sign(sum(A*B))*sqrt(sum(temp^2)) }} # # For ic_th projection, store depths of # points in mdep[ic,] # if(is.na(pts[1]))mdep[ic,]<-unidepth(dis) if(!is.na(pts[1])){ mdep[ic,]<-unidepth(dis[1:nm],dis[nm1:nrow(m)]) }} if(bot==0)mdep[ic,]<-rep(0,ncol(mdep)) }}} dep<-apply(mdep,2,min) } if(ncol(m)==2 &&is.na(pts[1])){ flag<-chull(m) dep[flag]<-min(dep) } if(ncol(m)==2){ if(is.na(pts[1]) && plotit){ plot(m) x<-m temp<-dep flag<-(temp>=median(temp)) xx<-x[flag,] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) }} dep } pbadepth<-function(x,est=onestep,con=0,alpha=.05,nboot=2000,grp=NA,op=1, allp=T,MM=F,cop=3,SEED=T,...){ # # Test the hypothesis that C linear contrasts all have a value of zero. # By default, the MOM estimator is used # # Independent groups are assumed. # # The data are assumed to be stored in x in list mode or in a matrix. # If stored in list mode, # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J, say. # If stored in a matrix, columns correspond to groups. # # By default, all pairwise differences are used, but contrasts # can be specified with the argument con. # The columns of con indicate the contrast coefficients. # Con should have J rows, J=number of groups. # For example, con[,1]=c(1,1,-1,-1,0,0) and con[,2]=c(,1,-1,0,0,1,-1) # will test two contrasts: (1) the sum of the first # two measures of location is # equal to the sum of the second two, and (2) the difference between # the first two is equal to the difference between the # measures of location for groups 5 and 6. # # The default number of bootstrap samples is nboot=2000 # # op controls how depth is measured # op=1, Mahalanobis # op=2, Mahalanobis based on MCD covariance matrix # op=3, Projection distance # op=4, Projection distance using FORTRAN version # # for arguments MM and cop, see pdis. # con<-as.matrix(con) if(is.matrix(x)){ xx<-list() for(i in 1:ncol(x)){ xx[[i]]<-x[,i] } x<-xx } if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") if(!is.na(grp)){ # Only analyze specified groups. xx<-list() for(i in 1:length(grp))xx[[i]]<-x[[grp[1]]] x<-xx } J<-length(x) mvec<-NA for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. x[[j]]<-temp mvec[j]<-est(temp,...) 
} Jm<-J-1 d<-ifelse(con==0,(J^2-J)/2,ncol(con)) if(sum(con^2)==0){ if(allp){ con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} if(!allp){ con<-matrix(0,J,Jm) for (j in 1:Jm){ jp<-j+1 con[j,j]<-1 con[jp,j]<-0-1 }}} bvec<-matrix(NA,nrow=J,ncol=nboot) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") for(j in 1:J){ print(paste("Working on group ",j)) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,est,...) # J by nboot matrix, jth row contains # bootstrapped estimates for jth group } bcon<-t(con)%*%bvec #C by nboot matrix tvec<-t(con)%*%mvec tvec<-tvec[,1] tempcen<-apply(bcon,1,mean) vecz<-rep(0,ncol(con)) bcon<-t(bcon) smat<-var(bcon-tempcen+tvec) temp<-bcon-tempcen+tvec bcon<-rbind(bcon,vecz) if(op==1)dv<-mahalanobis(bcon,tvec,smat) if(op==2){ library(MASS) smat<-cov.mcd(temp)$cov dv<-mahalanobis(bcon,tvec,smat) } if(op==3){ print("Computing Confidence interval. Might take a while with op=3") print("If execution time is high and using unix, try op=4") print("being sure that file pdis.o has been stored; see chapter 1") dv<-pdis(bcon,MM=MM,cop=cop) } if(op==4)dv<-pdis.for(bcon,MM=MM,cop=cop) bplus<-nboot+1 sig.level<-1-sum(dv[bplus]>=dv[1:nboot])/nboot list(p.value=sig.level,psihat=tvec,con=con) } g2plot<-function(x1,x2,op=4,rval=15,fr=.8,aval=.5,xlab="X",ylab=""){ # # plot estimates of the density functions for two groups. # # op=1: Use Rosenblatt shifted histogram # # op=2: # Use kernel density estimate # Using the built-in S+ function density, # # op=3: Use expected frequency curve. # # op=4: Use adaptive kernel estimator # x1<-elimna(x1) x2<-elimna(x2) if(op==3){ rd2plot(x1,x2,fr=fr,xlab=xlab,ylab=ylab) print("Might consider using op=4 if graph is ragged") } if(op==2){ #tempx<-density(x1,na.rm=T,width=bandwidth.sj(x1,method="dpi"),n=256) tempx<-density(x1,na.rm=T,kernel="epanechnikov") #tempy<-density(x2,na.rm=T,width=bandwidth.sj(x2,method="dpi"),n=256) tempy<-density(x2,na.rm=T,kernel="epanechnikov") plot(c(tempx$x,tempy$x),c(tempx$y,tempy$y),type="n",xlab=xlab,ylab=ylab) lines(tempx$x,tempx$y) lines(tempy$x,tempy$y,lty=2) } if(op==1){ y1 <- sort(x1) z1 <- 1 z2 <- 1 par(yaxt = "n") temp <- floor(0.01 * length(x1)) if(temp == 0) temp <- 5 ibot <- y1[temp] itop <- y1[floor(0.99 * length(x1))] xaxis1 <- seq(ibot, itop, length = rval) for(i in 1:rval) z1[i] <- kerden(x1, 0, xaxis1[i]) y2 <- sort(x2) temp <- floor(0.01 * length(x2)) if(temp == 0) temp <- 5 ibot <- y2[temp] itop <- y2[floor(0.99 * length(x2))] xaxis2 <- seq(ibot, itop, length = rval) for(i in 1:rval) z2[i] <- kerden(x2, 0, xaxis2[i]) plot(c(xaxis1,xaxis2),c(z1,z2), xlab =xlab, ylab =ylab, type = "n") lines(xaxis1,z1) lines(xaxis2,z2,lty=2) } if(op==4){ x1<-sort(x1) x2<-sort(x2) z1<-akerd(x1,aval=aval,fr=fr,pyhat=T,plotit=F) z2<-akerd(x2,aval=aval,fr=fr,pyhat=T,plotit=F) plot(c(x1,x2),c(z1,z2), xlab =xlab, ylab =ylab, type = "n") lines(x1,z1) lines(x2,z2,lty=2) } } mulwmw<-function(m1,m2,plotit=T,cop=3,alpha=.05,nboot=1000,pop=4,fr=.8,pr=F){ # # # Determine center correpsonding to two # independent groups, project all points onto line # connecting the centers, # then based on the projected distances, # estimate p=probability that a randomly sampled # point from group 1 is less than a point from group 2 # based on the projected distances. 
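#
# An illustrative usage sketch for pbadepth (defined above) with three
# independent groups; est=median and op=1 (Mahalanobis depth) keep the
# example to base R:
#   set.seed(14)
#   xg<-list(rnorm(20),rnorm(20),rnorm(20)+.7)
#   pbadepth(xg,est=median,nboot=1000)$p.value
#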
# # plotit=T creates a plot of the projected data # pop=1 plot two dotplots based on projected distances # pop=2 boxplots # pop=3 expected frequency curve. # pop=4 adaptive kernel density # # There are three options for computing the center of the # cloud of points when computing projections: # cop=1 uses Donoho-Gasko median # cop=2 uses MCD center # cop=3 uses median of the marginal distributions. # # When using cop=2 or 3, default critical value for outliers # is square root of the .975 quantile of a # chi-squared distribution with p degrees # of freedom. # # Donoho-Gasko (Tukey) median is marked with a cross, +. # if(!is.matrix(m1)){print("Data are assumed to be stored in") print(" a matrix having two or more columns.") stop(" For univariate data, use the function outbox or out") } m1<-elimna(m1) # Remove missing values m2<-elimna(m2) if(cop==1){ if(ncol(m1)>2){ center1<-dmean(m1,tr=.5) center2<-dmean(m2,tr=.5) } if(ncol(m1)==2){ tempd<-NA for(i in 1:nrow(m1)) tempd[i]<-depth(m1[i,1],m1[i,2],m1) mdep<-max(tempd) flag<-(tempd==mdep) if(sum(flag)==1)center1<-m1[flag,] if(sum(flag)>1)center1<-apply(m1[flag,],2,mean) for(i in 1:nrow(m2)) tempd[i]<-depth(m2[i,1],m2[i,2],m2) mdep<-max(tempd) flag<-(tempd==mdep) if(sum(flag)==1)center2<-m2[flag,] if(sum(flag)>1)center2<-apply(m2[flag,],2,mean) }} if(cop==2){ center1<-cov.mcd(m1)$center center2<-cov.mcd(m2)$center } if(cop==3){ center1<-apply(m1,2,median) center2<-apply(m2,2,median) } if(cop==4){ center1<-smean(m1) center2<-smean(m2) } center<-(center1+center2)/2 B<-center1-center2 if(sum(center1^2)2){ center1<-dmean(m1,tr=.5) center2<-dmean(m2,tr=.5) } if(ncol(m1)==2){ tempd<-NA for(i in 1:nrow(m1)) tempd[i]<-depth(m1[i,1],m1[i,2],m1) mdep<-max(tempd) flag<-(tempd==mdep) if(sum(flag)==1)center1<-m1[flag,] if(sum(flag)>1)center1<-apply(m1[flag,],2,mean) for(i in 1:nrow(m2)) tempd[i]<-depth(m2[i,1],m2[i,2],m2) mdep<-max(tempd) flag<-(tempd==mdep) if(sum(flag)==1)center2<-m2[flag,] if(sum(flag)>1)center2<-apply(m2[flag,],2,mean) }} if(cop==2){ center1<-cov.mcd(m1)$center center2<-cov.mcd(m2)$center } if(cop==3){ center1<-apply(m1,2,median) center2<-apply(m2,2,median) } center<-(center1+center2)/2 B<-center1-center2 if(sum(center1^2)>sum(center2^2))B<-(0-1)*B BB<-B^2 bot<-sum(BB) disx<-NA disy<-NA if(bot!=0){ for (j in 1:nrow(m1)){ AX<-m1[j,]-center tempx<-sum(AX*B)*B/bot disx[j]<-sign(sum(AX*B))*sqrt(sum(tempx^2)) } for (j in 1:nrow(m2)){ AY<-m2[j,]-center tempy<-sum(AY*B)*B/bot disy[j]<-sign(sum(AY*B))*sqrt(sum(tempy^2)) }} m<-outer(disx,disy,FUN="-") m<-sign(m) val[it]<-(1-mean(m))/2 if(bot==0)val[it]<-.5 if(pr)print(paste("Iteration ",it," of ",iter," complete")) } val<-sort(val) low<-round(alpha*iter/2)+1 up<-iter-low crit<-NA crit[1]<-val[low] crit[2]<-val[up] crit } dmean<-function(m,tr=.2,dop=1,cop=2){ # # Compute multivariate measure of location # using Donoho-Gasko method. # # dop=1, use fdepth to compute depths # dop=2, use fdepthv2 to compute depths # # cop=1, Tukey median; can't be used here. 
# cop=2, use MCD in fdepth # cop=3, use marginal medians in fdepth # cop=4, use MVE in fdepth # if(is.list(m))m<-matl(m) if(!is.matrix(m))stop("Data must be stored in a matrix or in list mode.") if(ncol(m)==1){ if(tr==.5)val<-median(m) if(tr>.5)stop("Amount of trimming must be at most .5") if(tr<.5)val<-mean(m,tr) } if(ncol(m)>1){ temp<-NA if(ncol(m)!=2){ # Use approximate depth if(dop==1)temp<-fdepth(m,plotit=F,cop=cop) if(dop==2)temp<-fdepthv2(m) } # Use exact depth if ncol=2 if(ncol(m)==2){ for(i in 1:nrow(m)) temp[i]<-depth(m[i,1],m[i,2],m) } mdep<-max(temp) flag<-(temp==mdep) if(tr==.5){ if(sum(flag)==1)val<-m[flag,] if(sum(flag)>1)val<-apply(m[flag,],2,mean) } if(tr<.5){ flag2<-(temp>=tr) if(sum(flag2)==0)val<-apply(m[flag,],2,mean) if(sum(flag2)==1)val<-m[flag2,] if(sum(flag2)>1)val<-apply(m[flag2,],2,mean) }} val } lsqs2<-function(x,y,MD=F,tr=.05,plotit=T){ # cf Liu and Singh, JASA 1993, 252-260 # if(is.list(x))x<-matl(x) if(is.list(y))y<-matl(y) disyx<-NA # depth of y in x disxy<-NA # depth of x in y if(!is.matrix(x) && !is.matrix(y)){ x<-x[!is.na(x)] y<-y[!is.na(y)] # tempxx<-NA for(i in 1:length(x)){ tempxx[i]<-sum(x[i]<=x)/length(x) if(tempxx[i]>.5)tempxx[i]<-1-tempxx[i] } for(i in 1:length(x)){ temp<-sum(x[i]<=y)/length(y) if(temp>.5)temp<-1-temp disxy[i]<-mean(temp>tempxx) } tempyy<-NA for(i in 1:length(y)){ tempyy[i]<-sum(y[i]<=y)/length(y) if(tempyy[i]>.5)tempyy[i]<-1-tempyy[i] } for(i in 1:length(y)){ temp<-sum(y[i]<=x)/length(x) if(temp>.5)temp<-1-temp # depth of y_i in x disyx[i]<-mean(temp>tempyy) } qhatxy<-mean(disyx) qhatyx<-mean(disxy) qhat<-(qhatxy+qhatyx)/2 } if(is.matrix(x) && is.matrix(x)){ if(!MD){ if(ncol(x)!=2 || ncol(y)!=2){ # Use approximate depth tempyy<-fdepth(y) temp<-fdepth(y,x) for(i in 1:nrow(x)){ disxy[i]<-mean(temp[i]>tempyy) } tempxx<-NA tempxx<-fdepth(x) temp<-fdepth(x,pts=y) for(i in 1:nrow(y)){ disyx[i]<-mean(temp[i]>tempxx) }} if(ncol(x)==2 && ncol(y)==2){ if(plotit){ plot(rbind(x,y),type="n",xlab="Var 1",ylab="VAR 2") points(x) points(y,pch="o") temp<-NA for(i in 1:nrow(x)){ temp[i]<-depth(x[i,1],x[i,2],x) } flag<-(temp>=median(temp)) xx<-x[flag,] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) temp<-NA for(i in 1:nrow(y)){ temp[i]<-depth(y[i,1],y[i,2],y) } flag<-(temp>=median(temp)) xx<-y[flag,] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) flag<-(temp>=median(temp)) xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,],lty=2) lines(xx[c(temp[1],temp[length(temp)]),],lty=2) } tempyy<-NA for(i in 1:nrow(y))tempyy[i]<-depth(y[i,1],y[i,2],y) for(i in 1:nrow(x)){ temp<-depth(x[i,1],x[i,2],y) disxy[i]<-mean(temp>tempyy) } tempxx<-NA for(i in 1:nrow(x))tempxx[i]<-depth(x[i,1],x[i,2],x) for(i in 1:nrow(y)){ temp<-depth(y[i,1],y[i,2],x) disyx[i]<-mean(temp>tempxx) } }} if(MD){ mx<-apply(x,2,median) my<-apply(y,2,median) vx<-apply(x,2,winval,tr=tr)-apply(x,2,mean,trim=tr)+mx vx<-var(vx) vy<-apply(y,2,winval,tr=tr)-apply(y,2,mean,trim=tr)+my vy<-var(vy) tempxx<-1/(1+mahalanobis(x,mx,vx)) tempyx<-1/(1+mahalanobis(y,mx,vx)) for(i in 1:nrow(y)){ disyx[i]<-mean(tempyx[i]>tempxx) } tempyy<-1/(1+mahalanobis(y,my,vy)) tempxy<-1/(1+mahalanobis(x,my,vy)) for(i in 1:nrow(x)){ disxy[i]<-mean(tempxy[i]>tempyy) } } qhatxy<-sum(disxy) qhatyx<-sum(disyx) qhat<-(qhatxy+qhatyx)/(length(disxy)+length(disyx)) } qhatyx<-mean(disyx) qhatxy<-mean(disxy) list(qhatxy,qhatyx,qhat) } 
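# Illustrative call of lsqs2 (a sketch only, using simulated bivariate data;
# assumes the depth functions in this file, e.g. depth and fdepth, are sourced):
#   set.seed(1)
#   x <- matrix(rnorm(60), ncol = 2)
#   y <- matrix(rnorm(60, mean = .3), ncol = 2)
#   lsqs2(x, y, plotit = FALSE)  # average depth of each sample in the other,
#                                # plus the combined estimate (Liu and Singh, 1993)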
depthg2<-function(x,y,alpha=.05,nboot=500,MD=F,plotit=T,op=T,fast=F,SEED=T, xlab="VAR 1",ylab="VAR 2"){ # # Compare two independent groups based on p measures # for each group. # # The method is based on Tukey's depth if MD=F; # otherwise the Mahalanobis depth is used. # If p>2, then Mahalanobis depth is used automatically # # The method is designed to be sensitive to differences in scale # if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. x=elimna(x) y=elimna(y) if(is.matrix(x) && is.matrix(y)){ nv1<-nrow(x) nv2<-nrow(y) if(ncol(x)!=ncol(y))stop("Number of columns of x is not equal to number for y") if(ncol(x) >2)MD<-T if(ncol(x)==2 && plotit){ plot(rbind(x,y),type="n",xlab=xlab,ylab=ylab) points(x,pch="*") points(y,pch="o") temp<-NA for(i in 1:nrow(x)){ temp[i]<-depth(x[i,1],x[i,2],x) } flag<-(temp>=median(temp)) xx<-x[flag,] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) temp<-NA for(i in 1:nrow(y)){ temp[i]<-depth(y[i,1],y[i,2],y) } flag<-(temp>=median(temp)) xx<-y[flag,] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) flag<-(temp>=median(temp)) xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,],lty=2) lines(xx[c(temp[1],temp[length(temp)]),],lty=2) } print("Taking bootstrap samples. Please wait.") data1<-matrix(sample(nv1,size=nv1*nboot,replace=T),nrow=nboot) data2<-matrix(sample(nv2,size=nv2*nboot,replace=T),nrow=nboot) qhatd<-NA dhatb<-NA for(ib in 1:nboot){ if(op)print(paste("Bootstrap sample ",ib," of ",nboot, "is complete.")) if(!fast)temp<-lsqs2(x[data1[ib,],],y[data2[ib,],],plotit=F,MD=MD) if(fast)temp<-lsqs2.for(x[data1[ib,],],y[data2[ib,],],plotit=F,MD=MD) qhatd[ib]<-temp[[1]]-temp[[2]] } temp<-sort(qhatd) lv<-round(alpha*nboot/2) uv<-nboot-lv difci<-c(temp[lv+1],temp[uv]) } # if(!is.matrix(x) && !is.matrix(y)){ nv1<-length(x) nv2<-length(y) print("Taking bootstrap samples. Please wait.") data1<-matrix(sample(nv1,size=nv1*nboot,replace=T),nrow=nboot) data2<-matrix(sample(nv2,size=nv2*nboot,replace=T),nrow=nboot) qhatd<-NA dhatb<-NA for(ib in 1:nboot){ if(!fast)temp<-lsqs2(x[data1[ib,]],y[data2[ib,]],plotit=F,MD=MD) if(fast)temp<-lsqs2.for(x[data1[ib,]],y[data2[ib,]],plotit=F,MD=MD) qhatd[ib]<-temp[[1]]-temp[[2]] dhatb[ib]<-(temp[[1]]+temp[[2]])/2 print(paste("Bootstrap sample ",ib," of ",nboot, "is complete.")) }} temp<-sort(qhatd) temp2<-sort(dhatb) lv<-round(alpha*nboot/2) uv<-nboot-lv difci<-c(temp[lv+1],temp[uv]) list(difci=difci) } hochberg<-function(x,x2=NA,cil=NA,crit=NA,con=0,tr=.2,alpha=.05,iter=10000,SEED=T){ # # A generalization of Hochberg's method # method to trimmed mean. # # x contains first stage data # x2 contains second stage data # # cil is the desired length of the confidence intervals. # That is, cil is the distance between the upper and lower # ends of the confidence intervals. # x3<-x2 if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") J<-length(x) tempn<-0 svec<-NA for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. 
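# The next lines record the group's sample size and its Winsorized variance
# rescaled by (1-2*tr)^2; these quantities are used later to determine the
# required second-stage sample sizes.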
tempn[j]<-length(temp) x[[j]]<-temp svec[j]<-winvar(temp,tr=tr)/(1-2*tr)^2 } tempt<-floor((1-2*tr)*tempn) A<-sum(1/(tempt-1)) df<-J/A print(paste("If using the tables of Studentized range distribution,")) print(paste("the degrees of freedom are:",df)) if(!is.list(x2) && !is.matrix(x2)){ x2<-list() for(j in 1:J)x2[[j]]<-NA } if(is.na(cil))stop("To proceed, you must specify the maximum length of the confidence intervals.") if(is.na(crit)){ print("Approximating critical value") crit<-trange(tempn-1,alpha=alpha,iter=iter,SEED=SEED) print(paste("The critical value is ",crit)) } # if(con[1] == 0){ Jm<-J-1 ncon <- (J^2 - J)/2 con <- matrix(0, J, ncon) id <- 0 for(j in 1:Jm) { jp <- j + 1 for(k in jp:J) { id <- id + 1 con[j, id] <- 1 con[k, id] <- 0 - 1 } } } ncon <- ncol(con) avec<-NA for(i in 1:ncon){ temp<-con[,i] avec[i]<-sum(temp[temp>0]) } dvec<-(cil/(2*crit*avec))^2 d<-max(dvec) n.vec<-NA for(j in 1:J){ n.vec[j]<-max(tempn[j],floor(svec[j]/d)+1) print(paste("Need an additional ", n.vec[j]-tempn[j], " observations for group", j)) } # # Do second stage if data are supplied # if(is.matrix(x2))x2<-listm(x2) temp2<-n.vec-tempn if(!is.list(x3) && !is.matrix(x3) && sum(temp2)>0)stop("No second stage data supplied; this function is terminating") if(length(x) != length(x2))warning("Number of groups in first stage data does not match the number in the second stage.") ci.mat<-NA if(!is.na(x2[1]) || sum(temp2)==0){ xtil<-NA nvec2<-NA for(j in 1:J){ nvec2[j]<-0 temp<-x2[[j]] if(!is.na(temp[1]))nvec2[j]<-length(x2[[j]]) if(nvec2[j] 0]) C<-0-sum(bvec[bvec<0]) D<-max(A,C) ci.mat[ic,2]<-sum(con[,ic]*xtil)-crit*D ci.mat[ic,3]<-sum(con[,ic]*xtil)+crit*D }} list(ci.mat=ci.mat,con=con) } trange<-function(dfvec,iter=10000,alpha=.05,SEED=T){ if(SEED)set.seed(1) dfv<-length(dfvec)/sum(1/dfvec) vals<-NA tvals<-NA J<-length(dfvec) for(i in 1:iter){ for(j in 1:J){ tvals[j]<-rt(1,dfvec[j]) } vals[i]<-max(tvals)-min(tvals) } vals<-sort(vals) ival<-round((1-alpha)*iter) qval<-vals[ival] qval } prplot<-function(x,y,pval=ncol(x),regfun=tsreg,fr=.8,est=mom,...){ # # Goal: check for curvature associated with predictor # indicated by pvec. # This is done by creating a partial residual plot. # That is subtracting out the linear prediction based # on the other predictors and then # smooth the result versus the predictor indicated by pbvec # if(!is.matrix(x))stop("Should have two or more variables stored in a matrix") flag<-rep(T,ncol(x)) flag[pval]<-F temp<-regfun(x[,flag],y)$residual rungen(x[,!flag],temp,est=est,fr=fr,...) } pbad2way<-function(J,K,x,est=mom,conall=T,alpha=.05,nboot=2000,grp=NA, op=F,...){ # # This function is like the function pbadepth, # only it is assumed that main effects and interactions for a # two-way design are to be tested. # # The data are assumed to be stored in x in list mode or in a matrix. # If grp is unspecified, it is assumed x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second factor: level 1,2 # x[[j+1]] is the data for level 2,1, etc. # If the data are in wrong order, grp can be used to rearrange the # groups. For example, for a two by two design, grp<-c(2,4,3,1) # indicates that the second group corresponds to level 1,1; # group 4 corresponds to level 1,2; group 3 is level 2,1; # and group 1 is level 2,2. # # Missing values are automatically removed. 
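#
# Illustrative call for a 2-by-2 design (a sketch only; simulated data in
# list mode, ordered as levels 1,1, 1,2, 2,1, 2,2):
#   set.seed(1)
#   dat <- list(rnorm(20), rnorm(20), rnorm(20, .5), rnorm(20, .5))
#   pbad2way(2, 2, dat, est = onestep, nboot = 1000)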
# JK <- J * K if(is.matrix(x)) x <- listm(x) if(!is.na(grp[1])) { yy <- x for(j in 1:length(grp)) x[[j]] <- yy[[grp[j]]] } if(!is.list(x)) stop("Data must be stored in list mode or a matrix.") for(j in 1:JK) { xx <- x[[j]] x[[j]] <- xx[!is.na(xx)] } # # Create the three contrast matrices # if(!conall){ ij <- matrix(c(rep(1, J)), 1, J) ik <- matrix(c(rep(1, K)), 1, K) jm1 <- J - 1 cj <- diag(1, jm1, J) for(i in 1:jm1) cj[i, i + 1] <- 0 - 1 km1 <- K - 1 ck <- diag(1, km1, K) for(i in 1:km1) ck[i, i + 1] <- 0 - 1 conA <- t(kron(cj, ik)) conB <- t(kron(ij, ck)) conAB <- t(kron(cj, ck)) } if(conall){ temp<-con2way(J,K) conA<-temp$conA conB<-temp$conB conAB<-temp$conAB } ncon <- max(nrow(conA), nrow(conB), nrow(conAB)) if(JK != length(x)) warning("The number of groups does not match the number of contrast coefficients.") if(!is.na(grp[1])){ # Only analyze specified groups. xx<-list() for(i in 1:length(grp))xx[[i]]<-x[[grp[i]]] x<-xx } mvec<-NA for(j in 1:JK){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. x[[j]]<-temp mvec[j]<-est(temp,...) } bvec<-matrix(NA,nrow=JK,ncol=nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") for(j in 1:JK){ print(paste("Working on group ",j)) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,est,...) # J by nboot matrix, jth row contains # bootstrapped estimates for jth group } bconA<-t(conA)%*%bvec #C by nboot matrix tvecA<-t(conA)%*%mvec tvecA<-tvecA[,1] tempcenA<-apply(bconA,1,mean) veczA<-rep(0,ncol(conA)) bconA<-t(bconA) smatA<-var(bconA-tempcenA+tvecA) bconA<-rbind(bconA,veczA) if(!op)dv<-mahalanobis(bconA,tvecA,smatA) if(op){ dv<-out(bconA)$dis } bplus<-nboot+1 sig.levelA<-1-sum(dv[bplus]>=dv[1:nboot])/nboot bconB<-t(conB)%*%bvec #C by nboot matrix tvecB<-t(conB)%*%mvec tvecB<-tvecB[,1] tempcenB<-apply(bconB,1,mean) veczB<-rep(0,ncol(conB)) bconB<-t(bconB) smatB<-var(bconB-tempcenB+tvecB) bconB<-rbind(bconB,veczB) if(!op)dv<-mahalanobis(bconB,tvecB,smatB) if(op){ dv<-out(bconA)$dis } sig.levelB<-1-sum(dv[bplus]>=dv[1:nboot])/nboot bconAB<-t(conAB)%*%bvec #C by nboot matrix tvecAB<-t(conAB)%*%mvec tvecAB<-tvecAB[,1] tempcenAB<-apply(bconAB,1,mean) veczAB<-rep(0,ncol(conAB)) bconAB<-t(bconAB) smatAB<-var(bconAB-tempcenAB+tvecAB) bconAB<-rbind(bconAB,veczAB) if(!op)dv<-mahalanobis(bconAB,tvecAB,smatAB) if(op){ dv<-out(bconAB)$dis } sig.levelAB<-1-sum(dv[bplus]>=dv[1:nboot])/nboot list(p.value.A=sig.levelA,p.value.B=sig.levelB,p.value.AB=sig.levelAB,conA=conA,conB=conB,conAB=conAB) } lsqs3<-function(x,y,plotit=T,cop=2,ap.dep=F,v2=F){ # # Compute the typical depth of x in y, # Compute the typical depth of y in x, # use the maximum of the two typical depths # as a test statistic. # This method is designed to be sensitive to # shifts in location. # # Use Tukey's depth; bivariate case only. 
# # cop=2 use MCD location estimator when # computing depth with function fdepth # cop=3 uses medians # cop=3 uses MVE # if(is.list(x))x<-matl(x) if(is.list(y))y<-matl(y) x<-elimna(x) y<-elimna(y) x<-as.matrix(x) y<-as.matrix(y) if(ncol(x) != ncol(y))stop("Number of variables not equal") disyx<-NA # depth of y in x disxy<-NA # depth of x in y # if(ncol(x)==2){ if(plotit){ plot(rbind(x,y),type="n",xlab="VAR 1",ylab="VAR 2") points(x) points(y,pch="o") if(nrow(x)>50){ if(!ap.dep){ print("If execution time is high, might use ap.dep=F") } if(!ap.dep)temp<-depth2(x,plotit=F) if(ap.dep)temp<-fdepth(x,plotit=F,cop=cop) } if(!ap.dep)temp<-depth2(x,plotit=F) if(ap.dep)temp<-fdepth(x,plotit=F,cop=cop) flag<-(temp>=median(temp)) xx<-x[flag,] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) if(ap.dep)temp<-fdepth(y,plotit=F,cop=cop) if(!ap.dep)temp<-depth2(y,plotit=F) if(!ap.dep)temp<-depth2(y,plotit=F) if(!ap.dep)temp<-fdepth(y,plotit=F) flag<-(temp>=median(temp)) xx<-y[flag,] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) flag<-(temp>=median(temp)) xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,],lty=2) lines(xx[c(temp[1],temp[length(temp)]),],lty=2) } tempyx<-NA tempxy<-NA if(ap.dep)tempyx<-fdepth(x,y,plotit=F,cop=cop) if(!ap.dep)tempyx<-depth2(x,y,plotit=F) if(ap.dep)tempxy<-fdepth(y,x,plotit=F,cop=cop) tempxy<-depth2(y,x,plotit=F) } if(ncol(x)==1){ tempyx<-unidepth(as.vector(x),as.vector(y)) tempxy<-unidepth(as.vector(y),as.vector(x)) } if(ncol(x)>2){ if(!v2){ tempxy<-fdepth(y,x,plotit=F,cop=cop) tempyx<-fdepth(x,y,plotit=F,cop=cop) } if(v2){ tempxy<-fdepthv2(y,x,plotit=F,cop=cop) tempyx<-fdepthv2(x,y,plotit=F,cop=cop) }} qhatxy<-mean(tempxy) qhatyx<-mean(tempyx) qhat<-max(c(qhatxy,qhatyx)) n1<-nrow(x) n2<-nrow(y) nv<-(3*min(c(n1,n2))+max(c(n1,n2)))/4 if(ncol(x)==1)crit<-.2536-.4578/sqrt(nv) if(ncol(x)==2)crit<-.1569-.3/sqrt(nv) if(ncol(x)==3)crit<-.0861-.269/sqrt(nv) if(ncol(x)==4)crit<-.054-.1568/sqrt(nv) if(ncol(x)==5)crit<-.0367-.0968/sqrt(nv) if(ncol(x)==6)crit<-.0262-.0565/sqrt(nv) if(ncol(x)==7)crit<-.0174-.0916/sqrt(nv) if(ncol(x)>7)crit<-.13 rej<-"Fail to reject" if(qhat<=crit)rej<-"Reject" list(avg.depth.of.x.in.y=qhatxy,avg.depth.of.y.in.x=qhatyx,test=qhat,crit=crit,Decision=rej) } kercon<-function(x,y,pyhat=F,cval=NA,pts=NA,plotit=T,eout=F,xout=F, outfun=out,iran=.05,xlab="X",ylab="Y"){ # # Compute conditional local weighted regression with Epanechnikov kernel # # cf. Fan, Annals of Statistics, 1993, 21, 196-217. # d<-ncol(x) if(d!=2)stop("Argument x should have two columns only") np1<-d+1 m<-elimna(cbind(x,y)) x<-m[,1:d] y<-m[,np1] yhat1<-NA if(eout && xout)stop("Can't have both eout and xout=F") if(eout){ flag<-outfun(m)$keep m<-m[flag,] } if(xout){ flag<-outfun(x)$keep m<-m[flag,] } x<-m[,1:d] y<-m[,np1] if(is.na(cval[1])){temp<-idealf(x[,2]) cval<-c(temp$ql,median(x[,2]),temp$qu) } xrem<-x x2<-x[,2] n<-nrow(x) sig<-sqrt(var(x2)) temp<-idealf(x2) iqr<-(temp$qu-temp$ql)/1.34 A1<-min(c(sig,iqr)) A<-1.77 hval<-A*(1/n)^(1/6) # Silverman, 1986, p. 
86 svec<-NA for(j in 1:d){ sig<-sqrt(var(x[,j])) temp<-idealf(x[,j]) iqr<-(temp$qu-temp$ql)/1.34 A<-min(c(sig,iqr)) svec[j]<-A x[,j]<-x[,j]/A } hval<-hval*sqrt(mean(svec^2)) ilow<-round(iran*length(y)) iup<-round((1-iran)*length(y)) for(il in 1:length(cval)){ temp4<-NA for(j in 1:nrow(x)){ temp4[j]<-((x2[j]-cval[il])/A1)^2 } yhat<-NA epan1<-ifelse(temp4<1,.75*(1-temp4),0) # Epanechnikov kernel for x2 for(j in 1:n){ yhat[j]<-NA temp1<-cbind(x[,1]-x[j,1],x[,2]-cval[il]/A)/hval temp1<-temp1^2 temp1<-apply(temp1,1,FUN="sum") temp<-.5*(d+2)*(1-temp1)/gamma(.5)^2 epan<-ifelse(temp1<1,temp,0) # Epanechnikov kernel, for both x1 and x2 if(epan1[j]>0)epan[j]<-epan[j]/epan1[j] if(epan1[j]==0)epan[j]<-0 chkit<-sum(epan!=0) if(chkit >= np1){ vals<-lsfit(x[,1],y,wt=epan)$coef yhat[j]<-x[j,1]*vals[2]+vals[1] }} if(plotit){ xorder<-order(xrem[,1]) if(il==1)plot(xrem[,1],y,xlab=xlab,ylab=ylab) lines(xrem[xorder[ilow:iup],1],yhat[xorder[ilow:iup]],lty=il) }} m<-"Done" if(pyhat)m<-yhat m } mscor<-function(m,corfun=spear,cop=2,MM=F,gval=NA,ap=T,pw=T){ # # m is an n by p matrix # # Compute a skipped correlation matrix # # corfun indicates the correlation to be used # corfun=pcor uses Pearson's correlation # corfun=spear uses Spearman's correlation # # This function returns the p by p matrix of correlations # # Method: Eliminate outliers using a projection technique. # That is, compute Donoho-Gasko median, for each point # consider the line between it and the median, # project all points onto this line, and # check for outliers using a boxplot rule. # Repeat this for all points. A point is declared # an outlier if for any projection it is an outlier # using a modification of the usual boxplot rule. # # cop determines how center of the scatterplot is # estimated; see the function outpro. # cop=l Donoho-Gasko halfspace median # cop=2 MCD measure of location # cop=3 marginal medians # cop=4 MVE measure of location # # Eliminate any outliers and compute # correlations using remaining data. # # gval is critical value for determining whether a point # is an outlier. It is determined automatically if not specified, # assuming that Spearman's correlation is used. Critical # values when using some other correlation have not been # determined. # # Hypothesis of zero correlations tested with FWE=.05 # # AGRUMENTS: # MM; see function outpro # ap=T all pairwise comparisons are tested # ap=F first variable is tested versus all others # (for a total of p-1 tests). # pw=T, print message about high execution time # pw=F, suppress the message. 
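#
# Illustrative call (a sketch only; simulated data, Spearman's correlation,
# all pairwise tests):
#   set.seed(1)
#   m <- matrix(rnorm(100), ncol = 4)
#   mscor(m, corfun = spear, cop = 2)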
# m<-elimna(m) p<-ncol(m) pm<-p-1 n<-nrow(m) if(p<2)stop("Something wrong; number of variables is < 2") if(pw && cop==1){ print("If execution time is too high,") print("use cop=2 or 4 rather than the default value of 1") } if(ap){ inter<-c(2.374,2.780,3.030,3.208,3.372,3.502,3.722,3.825,3.943) slope<-c(5.333,8.8,25.67,32.83,51.53,75.02,111.34,123.16,126.72) expo<-c(-1,-1,-1.2,-1.2,-1.3,-1.4,-1.5,-1.5,-1.5) if(p>10){ qvec<-NA for(i in 1:9)qvec[i]<-inter[i]+slope[i]*n^expo[i] pval<-c(2:10) temp<-lsfit(pval,qvec)$coef } } if(!ap){ inter<-c(2.374,2.54,2.666,2.92,2.999,3.097,3.414,3.286,3.258) slope<-c(5.333,8.811,14.89,20.59,51.01,52.15,58.498,64.934,59.127) expo<-c(-1,-1,-1.2,-1.2,-1.5,-1.5,-1.5,-1.5,-1.5) if(p>10){ qvec<-NA for(i in 1:9)qvec[i]<-inter[i]+slope[i]*n^expo[i] pval<-c(1:9) temp<-lsfit(pval,qvec)$coef } } if(p<=10)crit<-inter[pm]+slope[pm]*n^expo[pm] if(p>10)crit<-temp[2]*p+temp[1] if(cop!=1 && is.na(gval))gval<-sqrt(qchisq(.975,ncol(m))) temp<-outpro(m,plotit=F,MM=MM,gval=gval,cop=cop)$keep mcor<-corfun(m[temp,])$cor test<-abs(mcor*sqrt((nrow(m)-2)/(1-mcor^2))) diag(test) <- NA if(!ap){ test<-as.matrix(test[1,]) } list(cor=mcor,crit.val=crit,test.stat=test) } dfried<-function(m,plotit=T,pop=0,fr=.8,v2=F,op=F){ # # Compare dependent groups using halfspace depth of # 0 relative to distribution of differences. # # When plotting differences scores: # pop=1 Plot expected frequency curve # pop=2 kernel density estimate # pop=3 S+ kernel density estimate # pop=4 boxplot # if(is.list(m))m<-matl(m) if(!is.matrix(m))stop("m should be a matrix having at least 2 columns.") m<-elimna(m) library(MASS) K<-ncol(m) n<-nrow(m) if(n<=10 && !op)print("With n<=10, might want to use op=T") J<-(K^2-K)/2 dcen<-cov.mcd(m)$center center<-NA pval<-matrix(NA,ncol=J,nrow=nrow(m)) zvec<-rep(0,J) ic<-0 for(k in 1:K){ for(kk in 1:K){ if(k1)temp<-fdepth(pval0,center=center) } if(v2){ if(ncol(pval)>1)temp<-fdepthv2(pval0) } big.dep<-max(temp) if(op){ v3<-dmean(pval,tr=.5,dop=2) v3<-t(as.matrix(v3)) big.dep<-max(max(temp),fdepthv2(pval0,v3)) } phat<-temp[nrow(m)+1]/big.dep # Determine critical value if(K==2)crit<-0.95-1.46/n^.5 if(K==3)crit<-1.00-1.71/n^.5 if(K==4)crit<-1.06-1.77/n^.5 if(K==5)crit<-1.11-1.76/n^.5 if(K==6)crit<-1.41-1.62/n^.3 if(K==7)crit<-1.49-1.71/n^.3 if(K>=8)crit<-1.39-1.38/n^.3 crit<-min(c(crit,1)) if(plotit && ncol(pval)==1){ if(pop==0)akerd(pval,fr=fr) if(pop==1)rdplot(pval,fr=fr) if(pop==2)kdplot(pval) if(pop==3)skerd(pval) if(pop==4)boxplot(pval) } list(phat=phat,crit.val=crit) } wrregfun<-function(slope,x=x,y=y){ x<-as.matrix(x) res<-y-x%*%slope v1<-rank(res) v2<-sqrt(12)*(v1/(length(y)+1)-.5) wrregfun<-sum(v2*res) wrregfun } spat.sub<-function(x,theta){ xx<-x for(i in 1:ncol(x))xx[,i]<-x[,i]-theta[i] xx<-xx^2 temp<-sqrt(apply(xx,1,sum)) val<-mean(temp) val } spat<-function(x){ # # compute spatial median # x is an n by p matrix # if(!is.matrix(x))stop("x must be a matrix") x<-elimna(x) START<-apply(x,2,median) val<-nelder(x,ncol(x),spat.sub,START=START) val } snmreg<-function(x,y){ # # Compute regression S-estimator via Nelder-Mead method # The measure of scale is taken to be the percentage bend midvariance # x <- as.matrix(x) X<-cbind(x,y) X<-elimna(X) np<-ncol(X) N<-np-1 temp<-chreg(x,y)$coef START<-temp[2:np] temp<-nelder(X,N,FN=snmreg.sub,START=START) alpha <- median(y - x %*% temp) coef <- c(alpha,temp) res <- y - x %*% temp - alpha list(coef = coef, residuals = res) } rungen<-function(x,y,est=onestep,fr=1,plotit=T,scat=T,pyhat=F,eout=F,xout=F, xlab="x",ylab="y",outfun=out,...){ # # 
running interval smoother that can be used with any measure # of location or scale. By default, the MOM estimator is used. # # fr controls amount of smoothing plotit<-as.logical(plotit) scat<-as.logical(scat) m<-cbind(x,y) m<-elimna(m) if(eout && xout)stop("Not allowed to have eout=xout=T") if(eout){ flag<-outfun(m,plotit=F)$keep m<-m[flag,] } if(xout){ flag<-outfun(x)$keep m<-m[flag,] } x<-m[,1] y<-m[,2] rmd<-c(1:length(x)) for(i in 1:length(x))rmd[i]<-est(y[near(x,x[i],fr)],...) if(plotit){ if(scat){ plot(c(x,x),c(y,rmd),xlab=xlab,ylab=ylab,type="n") points(x,y) } if(!scat)plot(c(x,x),c(y,rmd),type="n",ylab=ylab,xlab=xlab) points(x,rmd,type="n") sx<-sort(x) xorder<-order(x) sysm<-rmd[xorder] lines(sx,sysm) } if(pyhat)output<-rmd if(!pyhat)output<-"Done" list(output=output) } adpchk<-function(x,y,adfun=adrun,gfun=runm3d,...){ # # Compare adfun, usually an additive fit, to fit # based on gfun. # fit1<-adfun(x,y,pyhat=T,plotit=F) fit2<-gfun(x,y,pyhat=T,plotit=F) plot(fit1,fit2,xplot="Additive Fit",yplot="Gen. Fit") abline(0,1) } pmodchk<-function(x,y,regfun=tsreg,gfun=runm3d,op=1,eout=F,xout=F,fr=.8,...){ # # Compare regression fit to smooth # fit1<-y-regfun(x,y)$res fit2<-gfun(x,y,pyhat=T,plotit=F,...) if(op==0)plot(fit1,fit2,xlab="Reg. Fit",ylab="Gen. Fit") if(op==1)lplot(fit1,fit2,eout=eout,xout=xout) if(op==2)runmean(fit1,fit2,eout=eout,xout=xout,fr=fr) abline(0,1) } adpchk<-function(x,y,adfun=adrun,gfun=runm3d,...){ # # Compare adfun, usually an additive fit, to fit # based on gfun. # fit1<-adfun(x,y,pyhat=T,plotit=F) fit2<-gfun(x,y,pyhat=T,plotit=F) plot(fit1,fit2,xlab="Additive Fit",ylab="Gen. Fit") abline(0,1) } adrun<-function(x,y,est=tmean,iter=10,pyhat=F,plotit=T,fr=1,xlab="X", ylab="Y",zlab="", theta=50,phi=25,expand=.5,scale=F,zscale=T,xout=F,eout=xout,outfun=out,...){ # # additive model based on running interval smoother # and backfitting algorithm # m<-elimna(cbind(x,y)) if(xout){ keepit<-rep(T,nrow(x)) flag<-outfun(x,plotit=F)$out.id keepit[flag]<-F x<-x[keepit,] y<-y[keepit] } x<-as.matrix(x) p<-ncol(x) if(p==1)val<-rungen(x[,1],y,est=est,pyhat=T,plotit=plotit,fr=fr, xlab=xlab,ylab=ylab,...)$output if(p>1){ library(MASS) library(akima) np<-p+1 x<-m[,1:p] y<-m[,np] fhat<-matrix(NA,ncol=p,nrow=length(y)) fhat.old<-matrix(NA,ncol=p,nrow=length(y)) res<-matrix(NA,ncol=np,nrow=length(y)) dif<-1 for(i in 1:p) fhat.old[,i]<-rungen(x[,i],y,est=est,pyhat=T,plotit=F,fr=fr,...)$output eval<-NA for(it in 1:iter){ for(ip in 1:p){ res[,ip]<-y for(ip2 in 1:p){ if(ip2 != ip)res[,ip]<-res[,ip]-fhat.old[,ip2] } fhat[,ip]<-rungen(x[,ip],res[,ip],est=est,pyhat=T,plotit=F,fr=fr,...)$output } eval[it]<-sum(abs(fhat/sqrt(sum(fhat^2))-fhat.old/sqrt(sum(fhat.old^2)))) if(it > 1){ itm<-it-1 dif<-abs(eval[it]-eval[itm]) } fhat.old<-fhat if(dif<.01)break } val<-apply(fhat,1,sum) aval<-est(y-val,...) val<-val+aval if(plotit && p==2){ fitr<-val iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] # Eliminate duplicate points in the x-y plane # This is necessary when doing three dimensional plots # with the S-PLUS function interp mkeep<-x[iout>=1,] fitr<-interp(mkeep[,1],mkeep[,2],fitr) persp(fitr,theta=theta,phi=phi,xlab=xlab,ylab=ylab,zlab=zlab,expand=expand, scale=scale) }} if(!pyhat)val<-"Done" val } riplot<-function(x,y,adfun=adrun,plotfun=lplot,eout=T,xout=T,scale=F){ # # Plot used to investigate regression interaction # (the extent a generalized additive model does not fit data). 
# Compute additive fit, plot residuals # versus x, an n by 2 matrix. # if(!is.matrix(x))stop(" x must be a matrix") if(ncol(x)!=2)stop(" x must have two columns only") yhat<-adfun(x,y,pyhat=T,eout=eout,xout=xout,plotit=F) plotfun(x,y-yhat,eout=eout,xout=xout,scale=scale) } adtests1<-function(vstar,yhat,res,mflag,x,fr){ ystar<-yhat+res*vstar bres<-adrun(x,ystar,fr=fr,pyhat=T,plotit=F) bres<-ystar-bres rval<-0 for (i in 1:nrow(x)){ rval[i]<-sum(bres[mflag[,i]]) } rval } runsm2g<-function(x1,y1,x2,val=median(x2),est=tmean,sm=F,xlab="X", ylab="Y",...){ # # Plot of running interval smoother for two groups # Groups are defined according to whether x2=1. # if(!is.matrix(x))stop("Predictors are not stored in a matrix.") if(!is.matrix(pts))stop("The third argument, pts, must be a matrix.") library(MASS) m<-cov.mve(x) rmd<-1 # Initialize rmd nval<-1 for(i in 1:nrow(pts)){ rmd[i]<-est(y[near3d(x,pts[i,],fr,m)],...) nval[i]<-length(y[near3d(x,pts[i,],fr,m)]) } list(rmd=rmd,nval=nval) } lta.sub<-function(X,theta,h){ np<-ncol(X) p<-np-1 x<-X[,1:p] y<-X[,np] temp<-t(t(x)*theta[2:np]) yhat<-apply(temp,1,sum)+theta[1] res<-abs(y-yhat) res<-sort(res) val<-sum(res[1:h]) val } ltareg<-function(x, y, tr = 0.2, h = NA,op=2) { # # Compute the least trimmed absolute value regression estimator. # The default amount of trimming is .2 # op=1, use ltsreg as initial estimate # op!=1, use tsreg # # If h is specfied, use h smallest residuals, and ignore tr # x<-as.matrix(x) library(MASS) if(is.na(h)) h <- length(y) - floor(tr * length(y)) X<-cbind(x,y) X<-elimna(X) np<-ncol(X) p<-np-1 x<-X[,1:p] x<-as.matrix(x) y<-X[,np] if(op==1)temp<-ltsreg(x,y)$coef if(op!=1)temp<-tsreg(x,y)$coef START<-temp coef<-nelderv2(X,np,FN=lta.sub,START=START,h=h) res <- y - x%*%coef[2:np] - coef[1] list(coef = coef, residuals = res) } nelderv2<-function(x,N,FN,START=c(rep(1,N)),STEP=c(rep(1,N)), XMIN=c(rep(0,N)),XSEC=c(rep(0,N)),...){ # NELDER-MEAD method for minimzing a function # # TAKEN FROM OLSSON, J QUALITY TECHNOLOGY, 1974, 6, 56. # # x= n by p matrix containing data; it is used by # function to be minimized. # N= number of parameters # # FN=the function to be minimized # FORM: FN(x,theta), theta is vector containing # values for N parameters. # # START = starting values. # STEP=initial step. # This function returns the N values for theta that minimize FN # ICOUNT<-500 REQMIN<-.0000001 NN<-N+1 P<-matrix(NA,nrow=N,ncol=NN) P[,NN]<-START PBAR<-NA RCOEFF<-1 ECOEFF<-2 CCOEFF<-.5 KCOUNT<-ICOUNT ICOUNT<-0 DABIT<-2.04067e-35 BIGNUM<-1.e38 KONVGE<-5 XN<-N DN<-N Y<-rep(0,NN) Y[NN]<-FN(x,START,...) ICOUNT<-ICOUNT+1 for(J in 1:N){ DCHK<-START[J] START[J]<-DCHK+STEP[J] for(I in 1:N){ P[I,J]<-START[I] } Y[J]<-FN(x,START,...) ICOUNT<-ICOUNT+1 START[J]<-DCHK } I1000<-T while(I1000){ YLO<-Y[1] YNEWLO<-YLO ILO<-1 IHI<-1 for(I in 2:NN){ if(Y[I] < YLO){ YLO<-Y[I] ILO<-I} if(Y[I] > YNEWLO){ YNEWLO<-Y[I] IHI<-I} } DCHK<-(YNEWLO+DABIT)/(YLO+DABIT)-1 if(abs(DCHK) < REQMIN){ I1000<-F next } KONVGE<-KONVGE-1 if(KONVGE == 0){ KONVGE<-5 for(I in 1:N){ COORD1<-P[I,1] COORD2<-COORD1 for(J in 2:NN){ if(P[I,J] < COORD1)COORD1<-P[I,J] if(P[I,J] > COORD2)COORD2<-P[I,J] } # 2010 CONTINUE DCHK<-(COORD2+DABIT)/(COORD1+DABIT)-1 if(abs(DCHK) > REQMIN)break } } if(ICOUNT >= KCOUNT){ I1000<-F next } for(I in 1:N){ Z<-0.0 Z<-sum(P[I,1:NN]) # 6 Z<-Z-P[I,IHI] PBAR[I]<-Z/DN } PSTAR<-(1.+RCOEFF)*PBAR-RCOEFF*P[,IHI] YSTAR<-FN(x,PSTAR,...) 
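# PSTAR is the reflection of the worst vertex P[,IHI] through the centroid
# PBAR (reflection coefficient RCOEFF); YSTAR is the objective function
# evaluated at that reflected point.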
ICOUNT<-ICOUNT+1 if(YSTAR < YLO && ICOUNT >= KCOUNT){ P[,IHI]<-PSTAR Y[IHI]<-YSTAR next } IFLAG<-T if(YSTAR < YLO){ P2STAR<-ECOEFF*PSTAR+(1-ECOEFF)*PBAR Y2STAR<-FN(x,P2STAR,...) ICOUNT<-ICOUNT+1 if(Y2STAR >= YSTAR){ P[,IHI]<-PSTAR Y[IHI]<-YSTAR next #In essence, go to 19 which goes to 1000 } IFLAG<-T while(YSTAR < Y[IHI]){ P[,IHI]<-P2STAR Y[IHI]<-Y2STAR IFLAG<-F break L<-sum(Y[1:NN] > YSTAR) if(L > 1){ P[,IHI]<-PSTAR Y[IHI]<-YSTAR IFLAG<-T break } if(L > 1)break # go to 19 if(L != 0){ P[1:N,IHI]<-PSTAR[1:N] Y[IHI]<-YSTAR } I1000<-F break if(ICOUNT >= KCOUNT){ I1000<-F next } P2STAR(1:N)<-CCOEFF*P[1:N,IHI]+(1-CCOEFF)*PBAR[1:N] Y2STAR<-FN(x,P2STAR,...) ICOUNT<-ICOUNT+1 } # END WHILE } if(IFLAG){ for(J in 1:NN){ P[,J]=(P[,J]+P[,ILO])*.5 XMIN<-P[,J] Y[J]<-FN(x,XMIN,...) } ICOUNT<-ICOUNT+NN if(ICOUNT < KCOUNT)next I1000<-F next } P[1:N,IHI]<-PSTAR[1:N] Y[IHI]<-YSTAR } for(J in 1:NN){ XMIN[1:N]<-P[1:N,J] } Y[J]<-FN(x,XMIN,...) YNEWLO<-BIGNUM for(J in 1:NN){ if (Y[J] < YNEWLO){ YNEWLO<-Y[J] IBEST<-J }} Y[IBEST]<-BIGNUM YSEC<-BIGNUM for(J in 1:NN){ if(Y[J] < YSEC){ YSEC<-Y[J] ISEC<-J }} XMIN[1:N]<-P[1:N,IBEST] XSEC[1:N]<-P[1:N,ISEC] XMIN } nelder<-function(x,N,FN,START=c(rep(1,N)),STEP=c(rep(1,N)), XMIN=c(rep(0,N)),XSEC=c(rep(0,N))){ # NELDER-MEAD method for minimzing a function # # TAKEN FROM OLSSON, J QUALITY TECHNOLOGY, 1974, 6, 56. # # x= n by p matrix containing data; it is used by # function to be minimized. # N= number of parameters # # FN=the function to be minimized # FORM: FN(x,theta), theta is vector containing # values for N parameters. # # START = starting values. # STEP=initial step. # This function returns the N values for theta that minimize FN # ICOUNT<-500 REQMIN<-.0000001 NN<-N+1 P<-matrix(NA,nrow=N,ncol=NN) P[,NN]<-START PBAR<-NA RCOEFF<-1 ECOEFF<-2 CCOEFF<-.5 KCOUNT<-ICOUNT ICOUNT<-0 DABIT<-2.04067e-35 BIGNUM<-1.e38 KONVGE<-5 XN<-N DN<-N Y<-rep(0,NN) Y[NN]<-FN(x,START) ICOUNT<-ICOUNT+1 for(J in 1:N){ DCHK<-START[J] START[J]<-DCHK+STEP[J] for(I in 1:N){ P[I,J]<-START[I] } Y[J]<-FN(x,START) ICOUNT<-ICOUNT+1 START[J]<-DCHK } I1000<-T while(I1000){ YLO<-Y[1] YNEWLO<-YLO ILO<-1 IHI<-1 for(I in 2:NN){ if(Y[I] < YLO){ YLO<-Y[I] ILO<-I} if(Y[I] > YNEWLO){ YNEWLO<-Y[I] IHI<-I} } DCHK<-(YNEWLO+DABIT)/(YLO+DABIT)-1 if(abs(DCHK) < REQMIN){ I1000<-F next } KONVGE<-KONVGE-1 if(KONVGE == 0){ KONVGE<-5 for(I in 1:N){ COORD1<-P[I,1] COORD2<-COORD1 for(J in 2:NN){ if(P[I,J] < COORD1)COORD1<-P[I,J] if(P[I,J] > COORD2)COORD2<-P[I,J] } # 2010 CONTINUE DCHK<-(COORD2+DABIT)/(COORD1+DABIT)-1 if(abs(DCHK) > REQMIN)break } } if(ICOUNT >= KCOUNT){ I1000<-F next } for(I in 1:N){ Z<-0.0 Z<-sum(P[I,1:NN]) # 6 Z<-Z-P[I,IHI] PBAR[I]<-Z/DN } PSTAR<-(1.+RCOEFF)*PBAR-RCOEFF*P[,IHI] YSTAR<-FN(x,PSTAR) ICOUNT<-ICOUNT+1 if(YSTAR < YLO && ICOUNT >= KCOUNT){ P[,IHI]<-PSTAR Y[IHI]<-YSTAR next } IFLAG<-T if(YSTAR < YLO){ P2STAR<-ECOEFF*PSTAR+(1-ECOEFF)*PBAR Y2STAR<-FN(x,P2STAR) ICOUNT<-ICOUNT+1 if(Y2STAR >= YSTAR){ P[,IHI]<-PSTAR Y[IHI]<-YSTAR next #In essence, go to 19 which goes to 1000 } IFLAG<-T while(YSTAR < Y[IHI]){ P[,IHI]<-P2STAR Y[IHI]<-Y2STAR IFLAG<-F break L<-sum(Y[1:NN] > YSTAR) if(L > 1){ P[,IHI]<-PSTAR Y[IHI]<-YSTAR IFLAG<-T break } if(L > 1)break # go to 19 if(L != 0){ P[1:N,IHI]<-PSTAR[1:N] Y[IHI]<-YSTAR } I1000<-F break if(ICOUNT >= KCOUNT){ I1000<-F next } P2STAR(1:N)<-CCOEFF*P[1:N,IHI]+(1-CCOEFF)*PBAR[1:N] Y2STAR<-FN(x,P2STAR) ICOUNT<-ICOUNT+1 } # END WHILE } if(IFLAG){ for(J in 1:NN){ P[,J]<-(P[,J]+P[,ILO])*.5 XMIN<-P[,J] Y[J]<-FN(x,XMIN) } ICOUNT<-ICOUNT+NN if(ICOUNT < KCOUNT)next I1000<-F next } 
P[1:N,IHI]<-PSTAR[1:N] Y[IHI]<-YSTAR } for(J in 1:NN){ XMIN[1:N]<-P[1:N,J] } Y[J]<-FN(x,XMIN) YNEWLO<-BIGNUM for(J in 1:NN){ if (Y[J] < YNEWLO){ YNEWLO<-Y[J] IBEST<-J }} Y[IBEST]<-BIGNUM YSEC<-BIGNUM for(J in 1:NN){ if(Y[J] < YSEC){ YSEC<-Y[J] ISEC<-J }} XMIN[1:N]<-P[1:N,IBEST] XSEC[1:N]<-P[1:N,ISEC] XMIN } splotg2<-function(x,y,op=T,xlab="X",ylab="Rel. Freq."){ # # Frequency plot # x<-x[!is.na(x)] temp<-sort(unique(x)) freqx<-NA for(i in 1:length(temp)){ freqx[i]<-sum(x==temp[i]) } freqx<-freqx/length(x) y<-y[!is.na(y)] tempy<-sort(unique(y)) freqy<-NA for(i in 1:length(tempy)){ freqy[i]<-sum(y==tempy[i]) } freqy<-freqy/length(y) plot(c(temp,tempy),c(freqx,freqy),type="n",xlab=xlab,ylab=ylab) points(temp,freqx) points(tempy,freqy,pch="o") if(op){ lines(temp,freqx) lines(tempy,freqy,lty=2) } } stein1.tr<-function(x,del,alpha=.05,pow=.8,tr=.2){ # # Extension of Stein's method when performing all pairwise # comparisons among J dependent groups. # # If x represents a single group, one-sample analysis is performed. # if(tr < 0 || tr >=.5)stop("Argument tr must be between 0 and .5") if(is.matrix(x))m<-x if(is.list(x))m<-matl(x) if(!is.matrix(x) && !is.list(x))m<-matrix(x,ncol=1) m<-elimna(m) m<-as.matrix(m) ntest<-1 n<-nrow(m) J<-ncol(m) if(ncol(m) > 1)ntest<-(J^2-J)/2 g<-floor(tr*nrow(m)) df<-n-2*g-1 t1<-qt(pow,df) t2<-qt(alpha/(2*ntest),df) dv<-(del/(t1-t2))^2 nvec<-NA if(ntest > 1){ ic<-0 for (j in 1:ncol(m)){ for (jj in 1:ncol(m)){ if(j=.5)stop("Argument tr must be between 0 and .5") if(is.matrix(x))m<-x if(is.list(x))m<-matl(x) if(is.list(y))y<-matl(y) if(!is.matrix(x) && !is.list(x))m<-matrix(x,ncol=1) if(!is.matrix(y) && !is.list(y))y<-matrix(y,ncol=1) m<-elimna(m) m<-as.matrix(m) g<-floor(tr*nrow(m)) df<-nrow(m)-2*g-1 m<-rbind(m,y) ic<-0 ntest<-(ncol(m)^2-ncol(m))/2 if(ntest==0)ntest<-1 test<-matrix(NA,ncol=3,nrow=ntest) for (j in 1:ncol(m)){ for (jj in 1:ncol(m)){ if(j1){ if(is.na(center[1])){ if(cop==1)center<-dmean(m,tr=.5,dop=dop) if(cop==2)center<-cov.mcd(m,print=F)$center if(cop==3)center<-apply(m,2,median) if(cop==4)center<-cov.mve(m,print=F)$center if(cop==5)center<-smean(m) } dmat<-matrix(NA,ncol=nrow(m),nrow=nrow(m)) for (i in 1:nrow(m)){ B<-m[i,]-center dis<-NA BB<-B^2 bot<-sum(BB) if(bot!=0){ for (j in 1:nrow(m)){ A<-m[j,]-center temp<-sum(A*B)*B/bot dis[j]<-sqrt(sum(temp^2)) } if(!MM){ temp<-idealf(dis) dmat[,i]<-dis/(temp$qu-temp$ql) } if(MM)dmat[,i]<-dis/mad(dis) }} pdis<-apply(dmat,1,max,na.rm=T) } pdis } runmbo<-function(x,y,fr=1,est=tmean,xlab="X",ylab="Y",pts=x,RNA=F,atr=0, pyhat=F,eout=F,outfun=out,plotit=T,xout=F,scat=T,nboot=40,SEED=T,...){ # # running interval smooth with bagging # # fr controls amount of smoothing # tr is the amount of trimming # # Missing values are automatically removed. 
# # RNA=F, do not remove missing values when averaging # (computing the smooth) at x # xout=T removes points for which x is an outlier # eout=F removes points for which (x,y) is an outlier # nmin estimate y|x only when number of points close # to x is > nmin # atr is amount of trimming when averaging over the bagged # values # est is the measure of location to be estimated # est=tmean means estimate 20% trimmed mean of y given x # if(SEED)set.seed(2) temp<-cbind(x,y) if(ncol(temp)>2)stop("Use run3bo with more than 1 predictor") temp<-elimna(temp) # Eliminate any rows with missing values if(eout && xout)stop("Not allowed to have eout=xout=T") if(eout){ flag<-outfun(temp,plotit=F)$keep temp<-temp[flag,] } if(xout){ flag<-outfun(x,plotit=F)$keep temp<-temp[flag,] } x<-temp[,1] y<-temp[,2] pts<-as.matrix(pts) mat<-matrix(NA,nrow=nboot,ncol=nrow(pts)) vals<-NA for(it in 1:nboot){ idat<-sample(c(1:length(y)),replace=T) xx<-temp[idat,1] yy<-temp[idat,2] mat[it,]<-runhat(xx,yy,pts=pts,est=est,fr=fr,...) } rmd<-apply(mat,2,mean,na.rm=RNA,tr=atr) if(plotit){ if(scat){ plot(c(x,x),c(y,rmd),xlab=xlab,ylab=ylab,type="n") points(x,y) } if(!scat)plot(c(x,x),c(y,rmd),type="n",xlab=xlab,ylab=ylab) points(x, rmd, type = "n") sx <- sort(x) xorder <- order(x) sysm <- rmd[xorder] lines(sx, sysm) } output="Done" if(pyhat)output<-rmd output } run3bo<-function(x,y,fr=1,est=tmean,theta = 50, phi = 25,nmin=0, pyhat=F,eout=F,outfun=out,plotit=T,xout=F,nboot=40,SEED=T, expand=.5,scale=F,xlab="X",ylab="Y",zlab="",...){ # # running mean using interval method # # fr controls amount of smoothing # tr is the amount of trimming # # Missing values are automatically removed. # library(MASS) library(akima) if(SEED)set.seed(2) temp<-cbind(x,y) x<-as.matrix(x) p<-ncol(x) p1<-p+1 if(p>2)plotit<-F temp<-elimna(temp) # Eliminate any rows with missing values. x<-temp[,1:p] x<-as.matrix(x) y<-temp[,p1] if(xout){ keepit<-rep(T,nrow(x)) flag<-outfun(x,plotit=F)$out.id keepit[flag]<-F x<-x[keepit,] y<-y[keepit] } mat<-matrix(NA,nrow=nboot,ncol=length(y)) vals<-NA for(it in 1:nboot){ idat<-sample(c(1:length(y)),replace=T) xx<-temp[idat,1:p] yy<-temp[idat,p1] tmy<-rung3hat(xx,yy,pts=x,est=est,fr=fr,...)$rmd mat[it,]<-tmy } rmd<-apply(mat,2,mean,na.rm=T) flag<-!is.na(rmd) rmd<-elimna(rmd) x<-x[flag,] y<-y[flag] nval<-NA m<-cov.mve(x) for(i in 1:nrow(x))nval[i]<-length(y[near3d(x,x[i,],fr,m)]) if(plotit && ncol(x)==2){ #if(ncol(x)!=2)stop("When plotting, x must be an n by 2 matrix") fitr<-rmd[nval>nmin] y<-y[nval>nmin] x<-x[nval>nmin,] iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] # Eliminate duplicate points in the x-y plane # This is necessary when doing three dimensional plots # with the S-PLUS function interp mkeep<-x[iout>=1,] fit<-interp(mkeep[,1],mkeep[,2],fitr) persp(fit,theta=theta,phi=phi,xlab=xlab,ylab=ylab,zlab=zlab,expand=expand, scale=scale) } last<-"Done" if(pyhat)last<-rmd list(output=last) } ancom<-function(x1,y1,x2,y2,dchk=F,plotit=T,plotfun=rplot,nboot=500, alpha=.05,SEED=T,PARTEST=F,tr=0,...){ # # Omnibus ANCOVA # tr=0 is recommended for general use. tr>0 might result in # poor control over the probability of a Type I error. # PARTEST=T will test the hypothesis of parallel regression lines. 
# # Setting plotfun=rplotsm will smooth the plots via bagging # # dchk=T, points in design space with a halfspace of zero are eliminated # # PARTEST=F tests hypothesis that regression surface is a horizontal # plane through the origin # PARTEST=T tests the hypothesis that the two regression surfaces # are parallel. # flag1<-rep(T,length(y1)) flag2<-rep(T,length(y2)) if(dchk){ dep1<-fdepth(x2,x1) # depth of points in x1 relative to x2 dep2<-fdepth(x1,x2) flag1<-(dep1>0) flag2<-(dep2>0) } n1<-sum(flag1) n2<-sum(flag2) n<-n1+n2 y<-c(n2*y1[flag1]/n,0-n1*y2[flag2]/n) x1<-as.matrix(x1) x1<-x1[flag1,] x2<-as.matrix(x2) x2<-x2[flag2,] x1<-as.matrix(x1) x2<-as.matrix(x2) x<-rbind(x1,x2) if(plotit){ if(ncol(x)<=2)plotfun(x,y,...) } if(PARTEST)output<-indt(x,y,tr=tr,nboot=nboot,alpha=alpha,SEED=SEED) if(!PARTEST)output<-indt0(x,y,nboot=nboot,alpha=alpha,SEED=SEED) list(dstat=output$dstat,critd=output$critd) } indt0<-function(x,y,nboot=500,alpha=.05,flag=1,SEED=T){ # # Test the hypothesis that the regression plane # between x and y is a flat horizontal plane with intercept 0 # The method is based on results in # Stute et al. (1998, JASA, 93, 141-149). # # flag=1 gives Kolmogorov-Smirnov test statistic # flag=2 gives the Cramer-von Mises test statistic # flag=3 causes both test statistics to be reported. # if(SEED)set.seed(2) x<-as.matrix(x) # First, eliminate any rows of data with missing values. temp <- cbind(x, y) temp <- elimna(temp) pval<-ncol(temp)-1 x <- temp[,1:pval] y <- temp[, pval+1] x<-as.matrix(x) mflag<-matrix(NA,nrow=length(y),ncol=length(y)) for (j in 1:length(y)){ for (k in 1:length(y)){ mflag[j,k]<-(sum(x[j,]<=x[k,])==ncol(x)) } } # ith row of mflag indicates which rows of the matrix x are less # than or equal to ith row of x # yhat<-0 res<-y-yhat print("Taking bootstrap sample, please wait.") data<-matrix(runif(length(y)*nboot),nrow=nboot) data<-(data-.5)*sqrt(12) # standardize the random numbers. rvalb<-apply(data,1,indt0sub,yhat,res,mflag,x,tr) # An n x nboot matrix of R values rvalb<-rvalb/sqrt(length(y)) dstatb<-apply(abs(rvalb),2,max) wstatb<-apply(rvalb^2,2,mean) mstatb<-apply(abs(rvalb),2,median) dstatb<-sort(dstatb) wstatb<-sort(wstatb) mstatb<-sort(mstatb) # compute test statistic v<-c(rep(1,length(y))) rval<-indt0sub(v,yhat,res,mflag,x,tr) rval<-rval/sqrt(length(y)) dstat<-NA wstat<-NA critd<-NA critw<-NA ib<-round(nboot*(1-alpha)) if(flag==1 || flag==3){ dstat<-max(abs(rval)) critd<-dstatb[ib] } if(flag==2 || flag==3){ wstat<-mean(rval^2) critw<-wstatb[ib] } list(dstat=dstat,wstat=wstat,critd=critd,critw=critw) } indt0sub<-function(vstar,yhat,res,mflag,x,tr){ bres<-res*vstar rval<-0 for (i in 1:nrow(x)){ rval[i]<-sum(bres[mflag[,i]]) } rval } smeancr<-function(m,nullv=rep(0,ncol(m)),cop=3,MM=F,SEED=NA, nboot=500,plotit=T){ # # m is an n by p matrix # # Test hypothesis that multivariate skipped estimators # are all equal to the null value, which defaults to zero. # The level of the test is .05. # # Eliminate outliers using a projection method # That is, determine center of data using: # # cop=1 Donoho-Gasko median, # cop=2 MCD, # cop=3 marginal medians. # cop=4 MVE # # For each point # consider the line between it and the center # project all points onto this line, and # check for outliers using # # MM=F, a boxplot rule. # MM=T, rule based on MAD and median # # Repeat this for all points. A point is declared # an outlier if for any projection it is an outlier # using a modification of the usual boxplot rule. 
# # Eliminate any outliers and compute means # using remaining data. # if(is.na(SEED))set.seed(2) if(!is.na(SEED))set.seed(SEED) m<-elimna(m) n<-nrow(m) crit.level<-.05 if(n<=120)crit.level<-.045 if(n<=80)crit.level<-.04 if(n<=60)crit.level<-.035 if(n<=40)crit.level<-.03 if(n<=30)crit.level<-.025 if(n<=20)crit.level<-.02 data<-matrix(sample(n,size=n*nboot,replace=T),nrow=nboot) val<-matrix(NA,ncol=ncol(m),nrow=nboot) for(j in 1: nboot){ mm<-m[data[j,],] temp<-outpro(mm,plotit=F,cop=cop)$keep val[j,]<-apply(mm[temp,],2,mean) } temp<-pdis(rbind(val,nullv)) sig.level<-sum(temp[nboot+1]1){ if(ncol(x)==2 && !scale){ if(pr){print("scale=F is specified.") print("If there is dependence, might want to use scale=T") }} if(is.na(fr))fr<-1 val<-rung3d(x,y,est=est,fr=fr,plotit=plotit,pyhat=T,SEED=SEED,nmin=nmin, xout=xout,outfun=outfun,scale=scale,phi=phi,theta=theta,expand=expand, duplicate="error",...) } if(ncol(x)==1){ E.power<-varfun(val2[!is.na(val2)])/varfun(y) stra=sqrt(E.power) } if(ncol(x)>1){ E.power<-NULL stra=NULL } # With p>1 predictors, estimate of explanatory power is generally poor. #E.power<-varfun(val[!is.na(val)])/varfun(y) if(!pyhat)val <- NULL list(Strength.Assoc=stra,Explanatory.Power = E.power, yhat = val) } rplotsm<-function(x,y,est=tmean,fr=1,plotit=T,pyhat=F,nboot=40,atr=0,nmin=0, outfun=out,eout=F,xlab="X",ylab="Y",scat=T,SEED=T,expand=.5,scale=F, varfun=pbvar,pr=T,...){ x<-as.matrix(x) if(ncol(x)==1){ val<-runmbo(x,y,est=est,scat=scat,fr=fr,plotit=plotit,pyhat=T, xlab=xlab,ylab=ylab,eout=eout,nboot=nboot,outfun=outfun,SEED=SEED,atr=atr,...) } if(ncol(x)>1){ if(ncol(x)==2 && !scale){ print("scale=F is specified.") print("If there is dependence, use scale=T") } if(ncol(x)>2)plotit<-F val<-run3bo(x,y,est=est,fr=fr,nmin=nmin,plotit=plotit,pyhat=T,phi=25, theta=50,xlab=xlab,ylab=ylab, eout=eout,outfun=outfun,SEED=SEED,expand=expand,scale=scale,nboot=nboot,...) val<-val$output } E.power<-varfun(val[!is.na(val)])/varfun(y) #if(pr)print(paste("Explanatory.power=",E.power)) #if(!pyhat)val<-"Done" if(!pyhat)val <- NULL #val list(Strength.Assoc=sqrt(E.power),Explanatory.Power = E.power, yhat = val) } zdepth<-function(m,pts=m,zloc=median,zscale=mad){ # # Compute depth of points as in Zuo, Annals, 2003 # if(!is.matrix(m))stop("argument m should be a matrix") if(!is.matrix(pts))stop("argument pts should be a matrix") if(ncol(m)!=ncol(pts))stop("Number of columns for m and pts are not equal") np<-ncol(m) val<-NA for(i in 1:nrow(pts)){ pval<-pts[i,] START<-rep(1,np)/sqrt(np) temp<-nelderv2(m,np,FN=zdepth.sub,START=START,zloc=zloc,zscale=zscale,pts=pval) temp<-temp/sqrt(sum(temp^2)) y<-t(t(m)*temp) y<-apply(y,1,sum) ppro<-sum(pval*temp) val[i]<-abs(ppro-zloc(y))/zscale(y) } val } zdepth.sub<-function(x,theta,zloc=median,zscale=mad,pts=NA){ theta<-theta/sqrt(sum(theta^2)) temp<-t(t(x)*theta) ppro<-sum(t(t(pts)*theta)) yhat<-apply(temp,1,sum) val<-0-abs(ppro-zloc(yhat))/zscale(yhat) val } opregpb<-function(x,y,nboot=1000,alpha=.05,om=T,ADJ=T, nullvec=rep(0,ncol(x)+1),plotit=T,opdis=2,gval=sqrt(qchisq(.95,ncol(x)+1))){ # # generate bootstrap estimates # use projection-type outlier detection method followed by # TS regression. # # om=T and ncol(x)>1, means an omnibus test is performed, # otherwise only individual tests of parameters are performed. # # opdis=2, means that Mahalanobis distance is used # opdis=1, means projection-type distance is used # # gval is critical value for projection-type outlier detection # method # # ADJ=T, Adjust p-values as described in Section 11.1.5 of the text. 
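#
# Illustrative call (a sketch only; simulated data with two predictors;
# assumes opreg, outpro and related functions in this file are sourced):
#   set.seed(1)
#   x <- matrix(rnorm(80), ncol = 2)
#   y <- x[, 1] + rnorm(40)
#   opregpb(x, y, nboot = 500, plotit = FALSE)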
# x<-as.matrix(x) m<-cbind(x,y) p1<-ncol(x)+1 m<-elimna(m) # eliminate any rows with missing data x<-m[,1:ncol(x)] x<-as.matrix(x) y<-m[,p1] if(nrow(x)!=length(y))stop("Sample size of x differs from sample size of y") if(!is.matrix(x))stop("Data should be stored in a matrix") print("Taking bootstrap samples. Please wait.") data<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,regboot,x,y,regfun=opreg) # bvec is a p+1 by nboot matrix. The first row # contains the bootstrap intercepts, the second row # contains the bootstrap values for first predictor, etc. # using Hochberg method bvec<-t(bvec) dvec<-alpha/(c(1:ncol(x))) test<-NA icl0<-round(alpha*nboot/2) icl<-round(alpha*nboot/(2*ncol(x))) icu0<-nboot-icl0 icu<-nboot-icl output<-matrix(0,p1,6) dimnames(output)<-list(NULL,c("Param.","p.value","p.crit", "ci.lower","ci.upper","s.e.")) pval<-NA for(i in 1:p1){ output[i,1]<-i-1 se.val<-var(bvec[,i]) temp<-sort(bvec[,i]) output[i,6]<-sqrt(se.val) if(i==1){ output[i,4]<-temp[icl0+1] output[i,5]<-temp[icu0] } if(i>1){ output[i,4]<-temp[icl+1] output[i,5]<-temp[icu] } pval[i]<-sum((temp>nullvec[i]))/length(temp) if(pval[i]>.5)pval[i]<-1-pval[i] } fac<-2 if(ADJ){ # Adjust p-value if n<60 nval<-length(y) if(nval<20)nval<-20 if(nval>60)nval<-60 fac<-2-(60-nval)/40 } pval[1]<-2*pval[1] pval[2:p1]<-fac*pval[2:p1] output[,2]<-pval temp2<-order(0-pval[2:p1]) zvec<-dvec[1:ncol(x)] sigvec<-(test[temp2]>=zvec) output[temp2+1,3]<-zvec output[1,3]<-NA output[,2]<-pval om.pval<-NA temp<-opreg(x,y)$coef if(om && ncol(x)>1){ temp2<-rbind(bvec[,2:p1],nullvec[2:p1]) if(opdis==1)dis<-pdis(temp2,pr=F,center=temp[2:p1]) if(opdis==2){ cmat<-var(bvec[,2:p1]-apply(bvec[,2:p1],2,mean)+temp[2:p1]) dis<-mahalanobis(temp2,temp[2:p1],cmat) } om.pval<-sum((dis[nboot+1]<=dis[1:nboot]))/nboot } # do adjusted p-value nval<-length(y) if(nval<20)nval<-20 if(nval>60)nval<-60 adj.pval<-om.pval/2+(om.pval-om.pval/2)*(nval-20)/40 if(ncol(x)==2 && plotit){ plot(bvec[,2],bvec[,3],xlab="Slope 1",ylab="Slope 2") temp.dis<-order(dis[1:nboot]) ic<-round((1-alpha)*nboot) xx<-bvec[temp.dis[1:ic],2:3] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) } list(output=output,om.pval=om.pval,adj.om.pval=adj.pval) } logrsm<-function(x,y,fr=1,plotit=T,pyhat=F,xlab="X",ylab="Y"){ # # Do a smooth as described by Hosmer and Lemeshow, p. 85 # # Assuming there is only one predictor # temp<-cbind(x,y) temp<-elimna(temp) x<-temp[,1] y<-temp[,2] x<-(x-median(x))/mad(x) m1<-outer(x,x,"-") m2<-exp(-1*m1^2)*(m1<=fr) m3<-matrix(y,length(y),length(y))*m2 yhat<-apply(m3,2,sum)/apply(m2,2,sum) #sum over rows for each column if(plotit){ plot(x,y,xlab=xlab,ylab=ylab) xor<-order(x) lines(x[xor],yhat[xor]) } output <- "Done" if(pyhat) output <- yhat list(output = output) } kslope<-function(x,y,pyhat=F,pts=x){ # # Estimate slope at points in pts using kernel method # # See Doksum et al. 1994, JASA, 89, 571- # m<-elimna(cbind(x,y)) x<-m[,1] y<-m[,2] n<-length(y) sig<-sqrt(var(x)) temp<-idealf(x) iqr<-(temp$qu-temp$ql)/1.34 A<-min(c(sig,iqr)) yhat<-NA vval<-NA vals<-NA rhosq<-NA for(k in 1:n){ temp1<-NA for(j in 1:n){ temp1[j]<-((x[j]-x[k])/A)^2 } epan<-ifelse(temp1<1,.75*(1-temp1),0) # Epanechnikov kernel, p. 
76 chkit<-sum(epan!=0) if(chkit >= 2){ temp4<-lsfit(x,y,wt=epan) vals[k]<-temp4$coef[2] }} vals } nearl<-function(x,pt,fr=1){ # determine which values in x are near and less than pt # based on fr * mad m<-mad(x) if(m==0){ temp<-idealf(x) m<-(temp$qu-temp$ql)/(qnorm(.75)-qnorm(.25)) } if(m==0)m<-sqrt(winvar(x)/.4129) if(m==0)stop("All measures of dispersion are equal to 0") dis<-abs(x-pt) dflag<-dis <= fr*m flag2<-(xpt) dflag<-dflag*flag2 dflag } mgvmean<-function(m,op=0,outfun=outbox,se=T){ # # m is an n by p matrix # # Compute a multivariate skipped measure of location # using the MGV method # # Eliminate outliers using MGV method # # op=0 pairwise distances of points # op=1 MVE distances # op=2 MCD distances # # outfun indicates outlier rule to be applied to # the MGV distances. # By default, use boxplot rule # # Eliminate any outliers and compute means # using remaining data. # m<-elimna(m) temp<-outmgv(m,op=op,plotit=F)$keep val<-apply(m[temp,],2,mean) val } smgvcr<-function(m,nullv=rep(0,ncol(m)),SEED=T,op=0, nboot=500,plotit=T){ # # m is an n by p matrix # # Test hypothesis that estimand of the MGV estimator # is equal to the null value, which defaults to zero vector. # The level of the test is .05. # # Argument op: See function outmgv # if(SEED)set.seed(2) m<-elimna(m) n<-nrow(m) crit.level<-.05 if(n<=120)crit.level<-.045 if(n<=80)crit.level<-.04 if(n<=60)crit.level<-.035 if(n<=40)crit.level<-.03 if(n<=30)crit.level<-.025 if(n<=20)crit.level<-.02 data<-matrix(sample(n,size=n*nboot,replace=T),nrow=nboot) val<-matrix(NA,ncol=ncol(m),nrow=nboot) for(j in 1: nboot){ mm<-m[data[j,],] temp<-outmgv(mm,plotit=F,op=op)$keep val[j,]<-apply(mm[temp,],2,mean) } temp<-mgvar(rbind(val,nullv),op=op) flag2<-is.na(temp) if(sum(flag2)>0)temp[flag2]<-0 sig.level<-sum(temp[nboot+1]=1)stop("q must be > 0 and < 1") n<-length(x) xsort<-sort(x) iq <- floor(q * n + 0.5) flag<-(iq<=0 || iq>n) qest<-NA if(!flag)qest<-xsort[iq] qest } smean2<-function(m1,m2,nullv=rep(0,ncol(m1)),cop=3,MM=F,SEED=NA, nboot=500,plotit=T){ # # m is an n by p matrix # # For two independent groups, # test hypothesis that multivariate skipped estimators # are all equal. # # The level of the test is .05. # # Skipped estimator is used, i.e., # eliminate outliers using a projection method # That is, determine center of data using: # # cop=1 Donoho-Gasko median, # cop=2 MCD, # cop=3 marginal medians. # cop=4 MVE # # For each point # consider the line between it and the center # project all points onto this line, and # check for outliers using # # MM=F, a boxplot rule. # MM=T, rule based on MAD and median # # Repeat this for all points. A point is declared # an outlier if for any projection it is an outlier # using a modification of the usual boxplot rule. # # Eliminate any outliers and compute means # using remaining data. 
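#
# Illustrative call (a sketch only; simulated bivariate data for two
# independent groups; assumes outpro and pdis are sourced):
#   set.seed(1)
#   g1 <- matrix(rnorm(60), ncol = 2)
#   g2 <- matrix(rnorm(60, mean = .4), ncol = 2)
#   smean2(g1, g2, nboot = 500, plotit = FALSE)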
# if(ncol(m1) != ncol(m2)){ stop("Number of variables in group 1 does not equal the number in group 2.") } if(is.na(SEED))set.seed(2) if(!is.na(SEED))set.seed(SEED) m1<-elimna(m1) m2<-elimna(m2) n1<-nrow(m1) n2<-nrow(m2) n<-min(c(n1,n2)) crit.level<-.05 if(n<=120)crit.level<-.045 if(n<=80)crit.level<-.04 if(n<=60)crit.level<-.035 if(n<=40)crit.level<-.03 if(n<=30)crit.level<-.025 if(n<=20)crit.level<-.02 #data1<-matrix(sample(n1,size=n1*nboot,replace=T),nrow=nboot) #data2<-matrix(sample(n2,size=n2*nboot,replace=T),nrow=nboot) val<-matrix(NA,ncol=ncol(m1),nrow=nboot) for(j in 1: nboot){ data1<-sample(n1,size=n1,replace=T) data2<-sample(n2,size=n2,replace=T) mm1<-m1[data1,] temp<-outpro(mm1,plotit=F,cop=cop)$keep v1<-apply(mm1[temp,],2,mean) mm2<-m2[data2,] temp<-outpro(mm2,plotit=F,cop=cop)$keep v2<-apply(mm2[temp,],2,mean) val[j,]<-v1-v2 } temp<-pdis(rbind(val,nullv)) #print(temp) sig.level<-sum(temp[nboot+1]0, function plots a smooth using # middle 80% of the x values versus y # With np=0, it plots using all x values # or all values in pts if values are stored in it. # With np=0, pts=x is used. # # pyhat=T, the function returns the estimated y values # corresponding to x values in pts. If pts=NA, pts=x # is assumed. # m<-elimna(cbind(x,y)) if(eout){ keep<-outfun(m,plotit=F)$keep m<-m[keep,] } x<-m[,1] y<-m[,2] n<-length(x) sig<-sqrt(var(x)) temp<-idealf(x) iqr<-(temp$qu-temp$ql)/1.34 A<-min(c(sig,iqr)) yhat<-NA temp<-NA if(is.na(pts[1])){ if(np>0)pts<-seq(min(x),max(x),length=np) if(np==0)pts<-x } pts<-sort(pts) for(i in 1:length(pts)){ yhat[i]<-NA for(j in 1:length(x)){ temp[j]<-((x[j]-pts[i])/A)^2 } epan<-ifelse(temp<1,.75*(1-temp),0) chkit<-sum(epan!=0) if(chkit > 1){ vals<-lsfit(x,y,wt=epan)$coef yhat[i]<-vals[2]*pts[i]+vals[1] } } if(plotit){ plot(x,y,xlab=xlab,ylab=ylab) if(np>0){ ilow<-round(.1*np) iup<-round(.9*np) } if(np==0){ ilow<-1 iup<-length(pts) } lines(pts[ilow:iup],yhat[ilow:iup]) } m<-"Done" if(pyhat)m<-yhat m } qreg.sub<-function(X,theta,qval=.5){ np<-ncol(X) p<-np-1 x<-X[,1:p] y<-X[,np] temp<-t(t(x)*theta[2:np]) yhat<-apply(temp,1,sum)+theta[1] res<-y-yhat flag<-(res<=0) rval<-(qval-flag)*res val<-sum(rval) val } rmmcppb<-function(x,y=NA,alpha=.05,con=0,est=mest,plotit=T,dif=T,grp=NA,nboot=NA,BA=F,hoch=F,xlab="Group 1",ylab="Group 2",pr=T,SEED=T,...){ # # Use a percentile bootstrap method to compare dependent groups. # By default, # compute a .95 confidence interval for all linear contasts # specified by con, a J by C matrix, where C is the number of # contrasts to be tested, and the columns of con are the # contrast coefficients. # If con is not specified, all pairwise comparisons are done. # # By default, an M-estimator is used and a sequentially rejective method # is used to control the probability of at least one Type I error. # # dif=T indicates that difference scores are to be used # dif=F indicates that measure of location associated with # marginal distributions are used instead. # # nboot is the bootstrap sample size. If not specified, a value will # be chosen depending on the number of contrasts there are. # # x can be an n by J matrix or it can have list mode # for two groups, data for second group can be put in y # otherwise, assume x is a matrix (n by J) or has list mode. # # A sequentially rejective method is used to control alpha. # # Argument BA: When using dif=F, BA=T uses a correction term # that is recommended when using MOM. 
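#
# Illustrative call for three dependent groups (a sketch only; simulated data
# stored in an n by J matrix; difference scores and all pairwise contrasts):
#   set.seed(1)
#   m <- matrix(rnorm(60), ncol = 3)
#   rmmcppb(m, est = mest, dif = T, nboot = 1000, plotit = F)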
# if(dif){ if(pr)print("dif=T, so analysis is done on difference scores") temp<-rmmcppbd(x,y=y,alpha=.05,con=con,est,plotit=plotit,grp=grp,nboot=nboot, hoch=hoch,...) output<-temp$output con<-temp$con } if(!dif){ if(pr){ print("dif=F, so analysis is done on marginal distributions") if(!BA)print("With medians, M-estimator or MOM, suggest using BA=T and hoch=T") } if(!is.na(y[1]))x<-cbind(x,y) if(!is.list(x) && !is.matrix(x))stop("Data must be stored in a matrix or in list mode.") if(is.list(x)){ if(is.matrix(con)){ if(length(x)!=nrow(con))stop("The number of rows in con is not equal to the number of groups.") }} if(is.list(x)){ # put the data in an n by J matrix mat<-matl(x) } if(is.matrix(x) && is.matrix(con)){ if(ncol(x)!=nrow(con))stop("The number of rows in con is not equal to the number of groups.") mat<-x } if(is.matrix(x))mat<-x if(!is.na(sum(grp)))mat<-mat[,grp] mat<-elimna(mat) # Remove rows with missing values. x<-mat J<-ncol(mat) xcen<-x for(j in 1:J)xcen[,j]<-x[,j]-est(x[,j]) Jm<-J-1 if(sum(con^2)==0){ d<-(J^2-J)/2 con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} d<-ncol(con) if(is.na(nboot)){ if(d<=4)nboot<-1000 if(d>4)nboot<-5000 } n<-nrow(mat) crit.vec<-alpha/c(1:d) connum<-ncol(con) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. xbars<-apply(mat,2,est) psidat<-NA for (ic in 1:connum)psidat[ic]<-sum(con[,ic]*xbars) psihat<-matrix(0,connum,nboot) psihatcen<-matrix(0,connum,nboot) bvec<-matrix(NA,ncol=J,nrow=nboot) bveccen<-matrix(NA,ncol=J,nrow=nboot) if(pr)print("Taking bootstrap samples. Please wait.") data<-matrix(sample(n,size=n*nboot,replace=T),nrow=nboot) for(ib in 1:nboot){ bvec[ib,]<-apply(x[data[ib,],],2,est,...) bveccen[ib,]<-apply(xcen[data[ib,],],2,est,...) } # # Now have an nboot by J matrix of bootstrap values. # test<-1 bias<-NA for (ic in 1:connum){ psihat[ic,]<-apply(bvec,1,bptdpsi,con[,ic]) psihatcen[ic,]<-apply(bveccen,1,bptdpsi,con[,ic]) bias[ic]<-sum((psihatcen[ic,]>0))/nboot-.5 ptemp<-(sum(psihat[ic,]>0)+.5*sum(psihat[ic,]==0))/nboot #if(BA)test[ic]<-sum((psihat[ic,]>0))/nboot-.1*bias[ic] if(BA)test[ic]<-ptemp-.1*bias[ic] #if(!BA)test[ic]<-sum((psihat[ic,]>0))/nboot if(!BA)test[ic]<-ptemp test[ic]<-min(test[ic],1-test[ic]) test[ic]<-max(test[ic],0) } test<-2*test ncon<-ncol(con) if(alpha==.05){ dvec<-c(.025,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) dvecba<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.005,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) dvecba<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01){ dvec<-alpha/c(1:ncon) dvecba<-dvec dvec[2]<-alpha } if(hoch)dvec<-alpha/c(1:ncon) dvec<-2*dvec dvecba<-dvec if(plotit && ncol(bvec)==2){ z<-c(0,0) one<-c(1,1) plot(rbind(bvec,z,one),xlab=xlab,ylab=ylab,type="n") points(bvec) totv<-apply(x,2,est,...) 
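# The plot below shows the bootstrap estimates for the two groups, the
# central (1-alpha) portion of them (ordered by Mahalanobis distance
# from the estimates based on the original data) enclosed by its convex
# hull, and the line of equality.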
cmat<-var(bvec) dis<-mahalanobis(bvec,totv,cmat) temp.dis<-order(dis) ic<-round((1-alpha)*nboot) xx<-bvec[temp.dis[1:ic],] xord<-order(xx[,1]) xx<-xx[xord,] temp<-chull(xx) lines(xx[temp,]) lines(xx[c(temp[1],temp[length(temp)]),]) abline(0,1) } temp2<-order(0-test) ncon<-ncol(con) zvec<-dvec[1:ncon] if(BA)zvec<-dvecba[1:ncon] sigvec<-(test[temp2]>=zvec) output<-matrix(0,connum,6) dimnames(output)<-list(NULL,c("con.num","psihat","p.value","p.sig","ci.lower","ci.upper")) tmeans<-apply(mat,2,est,...) psi<-1 output[temp2,4]<-zvec for (ic in 1:ncol(con)){ output[ic,2]<-sum(con[,ic]*tmeans) output[ic,1]<-ic output[ic,3]<-test[ic] temp<-sort(psihat[ic,]) icl<-round(output[ic,4]*nboot/2)+1 icu<-nboot-(icl-1) output[ic,5]<-temp[icl] output[ic,6]<-temp[icu] } } num.sig<-sum(output[,3]<=output[,4]) list(output=output,con=con,num.sig=num.sig) } linconb<-function(x,con=0,tr=.2,alpha=.05,nboot=600,pr=T){ # # Compute a 1-alpha confidence interval for a set of d linear contrasts # involving trimmed means using the percentile t bootstrap method. # Independent groups are assumed. # # The data are assumed to be stored in x in list mode. Thus, # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J, say. # # Missing values are automatically removed. # # con is a J by d matrix containing the contrast coefficents of interest. # If unspecified, all pairwise comparisons are performed. # For example, con[,1]=c(1,1,-1,-1,0,0) and con[,2]=c(,1,-1,0,0,1,-1) # will test two contrasts: (1) the sum of the first two trimmed means is # equal to the sum of the second two, and (2) the difference between # the first two is equal to the difference between the trimmed means of # groups 5 and 6. # # The default number of bootstrap samples is nboot=599 # # This function uses functions trimparts and trimpartt written for this # book. # # # # if(pr){ print("Note: confidence intervals are adjusted to control FWE") print("But p-values are not adjusted to control FWE") } con<-as.matrix(con) if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in a matrix or in list mode.") J<-length(x) for(j in 1:J){ xx<-x[[j]] x[[j]]<-xx[!is.na(xx)] # Remove any missing values. } Jm<-J-1 d<-(J^2-J)/2 if(sum(con^2)==0){ con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} if(nrow(con)!=length(x))stop("The number of groups does not match the number of contrast coefficients.") bvec<-array(0,c(J,2,nboot)) set.seed(2) # set seed of random number generator so that # results can be duplicated. if(pr)print("Taking bootstrap samples. Please wait.") for(j in 1:J){ paste("Working on group ",j) xcen<-x[[j]]-mean(x[[j]],tr) data<-matrix(sample(xcen,size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,,]<-apply(data,1,trimparts,tr) # A 2 by nboot matrix. The first row # contains the bootstrap trimmed means, the second row # contains the bootstrap squared standard errors. } m1<-bvec[,1,] # J by nboot matrix containing the bootstrap trimmed means m2<-bvec[,2,] # J by nboot matrix containing the bootstrap sq. se. 
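# Next, for each contrast, bootstrap test statistics of the form
# |psihat*|/sqrt(sum(con^2 * sq.se*)) are computed; the maximum of these
# statistics over the contrasts determines the bootstrap critical value,
# which is how FWE is controlled.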
boot<-matrix(0,ncol(con),nboot) for (d in 1:ncol(con)){ top<-apply(m1,2,trimpartt,con[,d]) # A vector of length nboot containing psi hat values consq<-con[,d]^2 bot<-apply(m2,2,trimpartt,consq) boot[d,]<-abs(top)/sqrt(bot) } testb<-apply(boot,2,max) ic<-floor((1-alpha)*nboot) testb<-sort(testb) psihat<-matrix(0,ncol(con),4) test<-matrix(0,ncol(con),4) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper")) dimnames(test)<-list(NULL,c("con.num","test","se","p.value")) for (d in 1:ncol(con)){ test[d,1]<-d psihat[d,1]<-d testit<-lincon(x,con[,d],tr,pr=F) test[d,2]<-testit$test[1,2] pval<-mean((abs(testit$test[1,2])nmin] y<-y[nval>nmin] x<-x[nval>nmin,] iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] # Eliminate duplicate points in the x-y plane # This is necessary when doing three dimensional plots # with the S-PLUS function interp mkeep<-x[iout>=1,] fit<-interp(mkeep[,1],mkeep[,2],fitr) persp(fit,theta=theta,phi=phi,expand=expand,xlab=xlab,ylab=ylab,zlab="", scale=scale) }} if(pyhat)last<-rmd if(!pyhat)last <- "Done" last } adtestl<-function(x,y,est=tmean,nboot=100,alpha=.05,fr=NA,SEED=T,...){ # # Test the hypothesis that the regression model is additive. # Use a variation of Stute et al. (1998, JASA, 93, 141-149). # method, and running interval version of the backfitting # algorithm # if(!is.matrix(x))stop("X values should be stored in a matrix") if(ncol(x)==1)stop("There should be two or more predictors") temp<-cbind(x,y) p<-ncol(x) p1<-p+1 temp<-elimna(temp) x<-temp[,1:p] x<-as.matrix(x) y<-temp[,p1] if(alpha<.05 && nboot<=100)warning("You used alpha<.05 and nboot<=100") if(is.na(fr)){ fr<-.8 if(ncol(x)==2){ nval<-c(20,30,50,80,150) fval<-c(0.40,0.36,0.18,0.15,0.09) if(length(y)<=150)fr<-approx(nval,fval,length(y))$y if(length(y)>150)fr<-.09 } } if(SEED)set.seed(2) x<-as.matrix(x) mflag<-matrix(NA,nrow=length(y),ncol=length(y)) for (j in 1:length(y)){ for (k in 1:length(y)){ mflag[j,k]<-(sum(x[j,]<=x[k,])==ncol(x)) } } yhat<-adrunl(x,y,plotit=F,fr=fr,pyhat=T) regres<-y-yhat print("Taking bootstrap sample, please wait.") data<-matrix(runif(length(y)*nboot),nrow=nboot) data<-sqrt(12)*(data-.5) # standardize the random numbers. rvalb<-apply(data,1,adtestls1,yhat,regres,mflag,x,fr) # An n x nboot matrix of R values rvalb<-rvalb/sqrt(length(y)) dstatb<-apply(abs(rvalb),2,max) wstatb<-apply(rvalb^2,2,mean) dstatb<-sort(dstatb) wstatb<-sort(wstatb) # compute test statistic v<-c(rep(1,length(y))) rval<-adtestls1(v,yhat,regres,mflag,x,fr) rval<-rval/sqrt(length(y)) dstat<-max(abs(rval)) wstat<-mean(rval^2) ib<-round(nboot*(1-alpha)) critd<-dstatb[ib] critw<-wstatb[ib] list(dstat=dstat,wstat=wstat,critd=critd,critw=critw) } adtestls1<-function(vstar,yhat,res,mflag,x,fr){ ystar<-yhat+res*vstar bres<-adrunl(x,ystar,fr=fr,pyhat=T,plotit=F) bres<-ystar-bres rval<-0 for (i in 1:nrow(x)){ rval[i]<-sum(bres[mflag[,i]]) } rval } adcom<-function(x,y,est=mean,tr=0,nboot=600,alpha=.05,fr=NA, jv=NA,SEED=T,...){ # # Test the hypothesis that component # jv # is zero. That is, in a generalized additive model, test # H_0: f_jv(X_jv) = 0. # Use a variation of Stute et al. (1998, JASA, 93, 141-149). # method, and running interval version of the backfitting # algorithm # # if jv=NA, all components are tested. 
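#
# Illustrative call (hypothetical data): with the predictors in an
# n by 2 matrix x and the response in y,
# adcom(x,y,jv=1)
# tests H_0: f_1(X_1)=0; leaving jv=NA tests each component in turn.
#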
# # Current version allows only 0 or 20% trimming # if(!is.matrix(x))stop("X values should be stored in a matrix") if(ncol(x)==1)stop("There should be two or more predictors") temp<-cbind(x,y) p<-ncol(x) p1<-p+1 temp<-elimna(temp) x<-temp[,1:p] x<-as.matrix(x) y<-temp[,p1] if(is.na(fr)){ if(tr==.2){ nval<-c(20,40,60,80,120,160) fval<-c(1.2,1,.85,.75,.65,.65) if(length(y)<=160)fr<-approx(nval,fval,length(y))$y if(length(y)>160)fr<-.65 } if(tr==0){ nval<-c(20,40,60,80,120,160) fval<-c(.8,.7,.55,.5,.5,.5) if(length(y)<=160)fr<-approx(nval,fval,length(y))$y if(length(y)>160)fr<-.6 } } if(is.na(fr))stop("Span can be deteremined only for 0 or .2 trimming") if(SEED)set.seed(2) x<-as.matrix(x) mflag<-matrix(NA,nrow=length(y),ncol=length(y)) for (j in 1:length(y)){ for (k in 1:length(y)){ mflag[j,k]<-(sum(x[j,]<=x[k,])==ncol(x)) } } if(!is.na(jv))prval<-jv if(is.na(jv))prval<-c(1:ncol(x)) c.sum<-matrix(NA,nrow=length(prval),ncol=2) dimnames(c.sum)<-list(NULL,c("d.stat","p.value")) for(ip in 1:length(prval)){ flag<-rep(T,ncol(x)) flag[prval[ip]]<-F yhat<-adrun(x[,flag],y,plotit=F,fr=fr,pyhat=T) regres<-y-yhat temp<-indt(x[,!flag],regres) c.sum[ip,1]<-temp$dstat c.sum[ip,2]<-temp$p.value.d } list(results=c.sum) } logadr<-function(x,y,iter=10,pyhat=F,plotit=T,fr=.8,xout=F,eout=xout, outfun=out,theta=50,phi=25,expand=.5,...){ # # additive model based on a variation of Copas' (1983) smooth # for binary outcomes. # (Use backfitting algorithm.) # m<-elimna(cbind(x,y)) x<-as.matrix(x) p<-ncol(x) p1<-p+1 y<-m[,p1] x<-m[,1:p] for (ip in 1:p)x[,ip]<-(x[,ip]-median(x[,ip]))/mad(x[,ip]) if(xout){ keepit<-rep(T,nrow(x)) flag<-outfun(x,plotit=F)$out.id keepit[flag]<-F x<-x[keepit,] y<-y[keepit] } x<-as.matrix(x) if(p==1)val<-logrsm(x[,1],y,pyhat=T,plotit=plotit)$output if(p>1){ np<-p+1 x<-m[,1:p] y<-m[,np] fhat<-matrix(NA,ncol=p,nrow=length(y)) fhat.old<-matrix(NA,ncol=p,nrow=length(y)) res<-matrix(NA,ncol=np,nrow=length(y)) dif<-1 for(i in 1:p) fhat.old[,i]<-logrsm(x[,i],y,pyhat=T,plotit=F)$output eval<-NA for(it in 1:iter){ for(ip in 1:p){ res[,ip]<-y for(ip2 in 1:p){ if(ip2 != ip)res[,ip]<-res[,ip]-fhat.old[,ip2] } fhat[,ip]<-logrsm(x[,ip],res[,ip],pyhat=T,plotit=F)$output } eval[it]<-sum(abs(fhat/sqrt(sum(fhat^2))-fhat.old/sqrt(sum(fhat.old^2)))) if(it > 1){ itm<-it-1 dif<-abs(eval[it]-eval[itm]) } fhat.old<-fhat if(dif<.01)break } val<-apply(fhat,1,sum) aval<-mean(y-val,...) val<-val+aval flag<-(val<0) val[flag]<-0 flag<-(val>1) val[flag]<-1 if(plotit && p==2){ fitr<-val iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] # Eliminate duplicate points in the x-y plane # This is necessary when doing three dimensional plots # with the S-PLUS function interp mkeep<-x[iout>=1,] fitr<-interp(mkeep[,1],mkeep[,2],fitr) persp(fitr,theta=theta,phi=phi,expand=expand,xlab="x1",ylab="x2",zlab="", scale=scale) }} if(!pyhat)val<-"Done" val } logadr<-function(x,y,est=tmean,iter=10,pyhat=F,plotit=T,fr=.8,xout=F,eout=xout, outfun=out,theta=50,phi=25,expand=.5,STAND=T,...){ # # additive model based on a variation of Copas' (1983) smooth # for binary outcomes. # (Use backfitting algorithm.) 
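# In outline, the backfitting iterations below repeatedly smooth the
# partial residuals (y minus the current fits of the other components),
# stopping when the normalized fitted components change by less than .01
# or after iter iterations.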
# m<-elimna(cbind(x,y)) x<-as.matrix(x) p<-ncol(x) p1<-p+1 y<-m[,p1] x<-m[,1:p] if(STAND){ for (ip in 1:p)x[,ip]<-(x[,ip]-mean(x[,ip]))/sqrt(var(x[,ip])) } if(xout){ keepit<-rep(T,nrow(x)) flag<-outfun(x,plotit=F)$out.id keepit[flag]<-F x<-x[keepit,] y<-y[keepit] } x<-as.matrix(x) if(p==1)val<-rungen(x[,1],y,est=est,pyhat=T,plotit=plotit,fr=fr,...)$output if(p>1){ np<-p+1 x<-m[,1:p] y<-m[,np] fhat<-matrix(NA,ncol=p,nrow=length(y)) fhat.old<-matrix(NA,ncol=p,nrow=length(y)) res<-matrix(NA,ncol=np,nrow=length(y)) dif<-1 for(i in 1:p) fhat.old[,i]<-logrsm(x[,i],y,pyhat=T,plotit=F)$output eval<-NA for(it in 1:iter){ for(ip in 1:p){ res[,ip]<-y for(ip2 in 1:p){ if(ip2 != ip)res[,ip]<-res[,ip]-fhat.old[,ip2] } fhat[,ip]<-rungen(x[,ip],res[,ip],est=est,pyhat=T,plotit=F,fr=fr,...)$output } eval[it]<-sum(abs(fhat/sqrt(sum(fhat^2))-fhat.old/sqrt(sum(fhat.old^2)))) if(it > 1){ itm<-it-1 dif<-abs(eval[it]-eval[itm]) } fhat.old<-fhat if(dif<.01)break } val<-apply(fhat,1,sum) aval<-est(y-val,...) val<-val+aval if(plotit && p==2){ fitr<-val iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] # Eliminate duplicate points in the x-y plane # This is necessary when doing three dimensional plots # with the S-PLUS function interp mkeep<-x[iout>=1,] fitr<-interp(mkeep[,1],mkeep[,2],fitr) persp(fitr,theta=theta,phi=phi,expand=expand,xlab="x1",ylab="x2",zlab="", scale=scale) }} if(!pyhat)val<-"Done" val } qhomtsub<-function(isub,x,y,qval){ # # Perform quantile regression using x[isub] to predict y[isub] # isub is a vector of length n, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # # This function is used by other functions when computing # bootstrap estimates. # # regfun is some regression method already stored in S-PLUS # It is assumed that regfun$coef contains the intercept and slope # estimates produced by regfun. The regression methods written for # this book, plus regression functions in S-PLUS, have this property. # # x is assumed to be a matrix containing values of the predictors. # xmat<-matrix(x[isub,],nrow(x),ncol(x)) temp<-qplotreg(xmat,y[isub],qval=qval,plotit=F) regboot<-temp[1,2]-temp[2,2] regboot } qplotreg<-function(x, y,qval=c(.2,.8),plotit=T,xlab="X",ylab="Y"){ # # Compute the quantile regression line for each of the # quantiles indicated by qval. # plotit=T, plot the results. # n<-length(qval) coef<-matrix(NA,ncol=2,nrow=n) x<-as.matrix(x) if(ncol(x)>1)stop("This version allows one predictor only.") if(plotit)plot(x,y,xlab=xlab,ylab=ylab) for(it in 1:n){ coef[it,]<-qreg(x,y,qval=qval[it],pr=F)$coef if(plotit)abline(coef[it,1],coef[it,2]) } coef } ancmpbpb<-function(x1,y1,x2,y2,fr1=1,fr2=1,alpha=.05,pts=NA,est=tmean,nboot=NA, bhop=F,SEED=T,...){ print("This function has been eliminated. Please use ancmppb instead.") } ancovamp<-function(x1,y1,x2,y2,fr1=1,fr2=1,tr=.2,alpha=.05,pts=NA){ # # Compare two independent groups using the ancova method. # No parametric assumption is made about the form of # the regression lines--a running interval smoother is used. 
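#
# Illustrative call (hypothetical data): with covariates in x1 and x2
# and responses in y1 and y2,
# ancovamp(x1,y1,x2,y2)
# compares trimmed means of Y at design points chosen from x1.
#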
# Design points are chosen based on depth of points in x1 if pts=NA # Assume data are in x1 y1 x2 and y2 # if(is.na(pts[1])){ x1<-as.matrix(x1) pts<-ancdes(x1) } pts<-as.matrix(pts) if(nrow(pts)>=29){ print("WARNING: More than 28 design points") print("Only first 28 are used.") pts<-pts[1:28,] } n1<-1 n2<-1 vecn<-1 mval1<-cov.mve(x1) mval2<-cov.mve(x2) for(i in 1:nrow(pts)){ n1[i]<-length(y1[near3d(x1,pts[i,],fr1,mval1)]) n2[i]<-length(y2[near3d(x2,pts[i,],fr2,mval2)]) } flag<-rep(T,nrow(pts)) for(i in 1:nrow(pts))if(n1[i]<10 || n2[i]<10)flag[i]<-F pts<-pts[flag,] if(sum(flag)==1)pts<-t(as.matrix(pts)) if(sum(flag)==0)stop("No comparable design points found, might increase span.") mat<-matrix(NA,nrow(pts),7) dimnames(mat)<-list(NULL,c("n1","n2","DIF","TEST","se","ci.low","ci.hi")) for (i in 1:nrow(pts)){ g1<-y1[near3d(x1,pts[i,],fr1,mval1)] g2<-y2[near3d(x2,pts[i,],fr2,mval2)] g1<-g1[!is.na(g1)] g2<-g2[!is.na(g2)] test<-yuen(g1,g2,tr=tr) mat[i,1]<-length(g1) mat[i,2]<-length(g2) if(length(g1)<=5)print(paste("Warning, there are",length(g1)," points corresponding to the design point X=",pts[i,])) if(length(g2)<=5)print(paste("Warning, there are",length(g2)," points corresponding to the design point X=",pts[i,])) mat[i,3]<-test$dif mat[i,4]<-test$teststat mat[i,5]<-test$se if(nrow(pts)>=2)critv<-smmcrit(test$df,nrow(pts)) if(nrow(pts)==1)critv<-qt(.975,test$df) cilow<-test$dif-critv*test$se cihi<-test$dif+critv*test$se mat[i,6]<-cilow mat[i,7]<-cihi } list(points=pts,output=mat,crit=critv) } qsm<-function(x,y,qval=c(.2,.5,.8),fr=.8,plotit=T,scat=T,pyhat=F,eout=F,xout=F,outfun=out,op=T){ # # running interval smoother for the quantiles stored in # qval # # fr controls amount of smoothing # op=T, use Harrell-Davis estimator # op=F, use single order statistic # plotit<-as.logical(plotit) scat<-as.logical(scat) m<-cbind(x,y) if(ncol(m)!=2)stop("Must have exactly one predictor") m<-elimna(m) if(eout && xout)stop("Not allowed to have eout=xout=T") if(eout){ flag<-outfun(m,plotit=F)$keep m<-m[flag,] } if(xout){ flag<-outfun(x)$keep m<-m[flag,] } x<-m[,1] y<-m[,2] rmd<-c(1:length(x)) if(pyhat)outval<-matrix(NA,ncol=length(qval),nrow=length(x)) if(scat)plot(x,y) for(it in 1:length(qval)){ if(!op)for(i in 1:length(x))rmd[i]<-qest(y[near(x,x[i],fr)],q=qval[it]) if(op)for(i in 1:length(x))rmd[i]<-hd(y[near(x,x[i],fr)],q=qval[it]) if(pyhat)outval[,it]<-rmd if(!scat)plot(x,y,type="n") points(x,rmd,type="n") sx<-sort(x) xorder<-order(x) sysm<-rmd[xorder] lines(sx,sysm) } if(pyhat)output<-outval if(!pyhat)output<-"Done" list(output=output) } locvar<-function(x,y,pyhat=F,pts=x,plotit=T){ # # For each x, estimate VAR(y|x) # with the method used by Bjerve and Doksum # i.e., use Fan's kernel regression method. # yhat<-locreg(x,y,pyhat=T,plotit=F,pts=x) val<-locreg(x,(y-yhat)^2,pyhat=pyhat,pts=pts,plotit=plotit) val } qhomt<-function(x,y,nboot=100,alpha=.05,qval=c(.2,.8),plotit=T,SEED=T, xlab="X",ylab="Y"){ # # Test hypothesis that the error term is homogeneous by # computing a confidence interval for beta_1-beta_2, the # difference between the slopes of the qval[2] and qval[1] quantile # regression slopes, estimated via the Koenker-Basset method. 
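# Under homoscedasticity the two quantile regression lines are parallel,
# so the difference between their slopes is zero; the bootstrap standard
# error of this difference is used to form the confidence interval and
# p-value reported below.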
# So by default, use the .8 quantile and # the .2 quantiles # library(MASS) #library(lqs) # prior to version 1.9.0, use library(lqs) if(length(qval)!=2)stop("Argument qval should have 2 values exactly") x<-as.matrix(x) if(ncol(x)!=1)stop("Use qhomtv2 with more than one predictor") xy<-elimna(cbind(x,y)) x<-xy[,1] y<-xy[,2] x<-as.matrix(x) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,qhomtsub,x,y,qval) # An nboot vector. se<-sqrt(var(bvec)) temp<-qplotreg(x,y,qval=qval,plotit=plotit,xlab=xlab,ylab=ylab) crit<-qnorm(1-alpha/2) crit.ad<-NA if(alpha==.05 && qval[1]==.2 && qval[2]==.8)crit.ad<-qnorm(0.-.104/sqrt(nrow(x))+.975) dif<-temp[2,2]-temp[1,2] regci<-NA regci[1]<-dif-crit*se regci[2]<-dif+crit*se ci.ad<-c(dif-crit.ad*se,dif+crit.ad*se) sig.level<-2*(1-pnorm(abs(dif)/se)) list(dif.est=dif,dif.ci=regci,p.value=sig.level,adjusted.ci=ci.ad,se=se) } smmval<-function(dfvec,iter=10000,alpha=.05,SEED=T){ if(SEED)set.seed(1) dfv<-length(dfvec)/sum(1/dfvec) vals<-NA tvals<-NA J<-length(dfvec) for(i in 1:iter){ for(j in 1:J){ tvals[j]<-rt(1,dfvec[j]) } vals[i]<-max(abs(tvals)) } vals<-sort(vals) ival<-round((1-alpha)*iter) qval<-vals[ival] qval } lincon<-function(x,con=0,tr=.2,alpha=.05,KB=F,pr=T,crit=NA,SEED=T){ # # A heteroscedastic test of d linear contrasts using trimmed means. # # The data are assumed to be stored in $x$ in list mode. # Length(x) is assumed to correspond to the total number of groups, J # It is assumed all groups are independent. # # con is a J by d matrix containing the contrast coefficients that are used. # If con is not specified, all pairwise comparisons are made. # # Missing values are automatically removed. # # KB=F yields Welch-Sidak method # KB=T yields Kaiser-Bowden method # flag<-T if(alpha!= .05 && alpha!=.01)flag<-F if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in a matrix or in list mode.") con<-as.matrix(con) J<-length(x) h<-vector("numeric",J) w<-vector("numeric",J) xbar<-vector("numeric",J) for(j in 1:J){ xx<-!is.na(x[[j]]) val<-x[[j]] x[[j]]<-val[xx] # Remove missing values h[j]<-length(x[[j]])-2*floor(tr*length(x[[j]])) # h is the number of observations in the jth group after trimming. 
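# w[j], computed next, estimates the squared standard error of the jth
# trimmed mean: (n_j - 1) times the Winsorized variance divided by
# h_j*(h_j - 1).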
w[j]<-((length(x[[j]])-1)*winvar(x[[j]],tr))/(h[j]*(h[j]-1)) xbar[j]<-mean(x[[j]],tr) } if(sum(con^2)==0){ CC<-(J^2-J)/2 psihat<-matrix(0,CC,6) dimnames(psihat)<-list(NULL,c("Group","Group","psihat","ci.lower","ci.upper", "p.value")) test<-matrix(NA,CC,6) dimnames(test)<-list(NULL,c("Group","Group","test","crit","se","df")) jcom<-0 for (j in 1:J){ for (k in 1:J){ if (j < k){ jcom<-jcom+1 test[jcom,3]<-abs(xbar[j]-xbar[k])/sqrt(w[j]+w[k]) sejk<-sqrt(w[j]+w[k]) test[jcom,5]<-sejk psihat[jcom,1]<-j psihat[jcom,2]<-k test[jcom,1]<-j test[jcom,2]<-k psihat[jcom,3]<-(xbar[j]-xbar[k]) df<-(w[j]+w[k])^2/(w[j]^2/(h[j]-1)+w[k]^2/(h[k]-1)) test[jcom,6]<-df psihat[jcom,6]<-2*(1-pt(test[jcom,3],df)) if(!KB){ if(flag){ if(alpha==.05)crit<-smmcrit(df,CC) if(alpha==.01)crit<-smmcrit01(df,CC) } if(!flag)crit<-smmval(dfvec=rep(df,CC),alpha=alpha,SEED=SEED) } if(KB)crit<-sqrt((J-1)*(1+(J-2)/df)*qf(1-alpha,J-1,df)) test[jcom,4]<-crit psihat[jcom,4]<-(xbar[j]-xbar[k])-crit*sejk psihat[jcom,5]<-(xbar[j]-xbar[k])+crit*sejk }}}} if(sum(con^2)>0){ if(nrow(con)!=length(x)){ stop("The number of groups does not match the number of contrast coefficients.") } psihat<-matrix(0,ncol(con),5) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper", "p.value")) test<-matrix(0,ncol(con),5) dimnames(test)<-list(NULL,c("con.num","test","crit","se","df")) df<-0 for (d in 1:ncol(con)){ psihat[d,1]<-d psihat[d,2]<-sum(con[,d]*xbar) sejk<-sqrt(sum(con[,d]^2*w)) test[d,1]<-d test[d,2]<-sum(con[,d]*xbar)/sejk df<-(sum(con[,d]^2*w))^2/sum(con[,d]^4*w^2/(h-1)) if(flag){ if(alpha==.05)crit<-smmcrit(df,ncol(con)) if(alpha==.01)crit<-smmcrit01(df,ncol(con)) } if(!flag)crit<-smmval(dfvec=rep(df,ncol(con)),alpha=alpha,SEED=SEED) test[d,3]<-crit test[d,4]<-sejk test[d,5]<-df psihat[d,3]<-psihat[d,2]-crit*sejk psihat[d,4]<-psihat[d,2]+crit*sejk psihat[d,5]<-2*(1-pt(abs(test[d,2]),df)) } } if(pr){ print("Note: confidence intervals are adjusted to control FWE") print("But p-values are not adjusted to control FWE") } list(test=test,psihat=psihat) } bwmedimcp<-function(J,K,x,JK=J*K,grp=c(1:JK),alpha=.05){ # # Multiple comparisons for interactions # in a split-plot design. # The analysis is done by taking difference scores # among all pairs of dependent groups and # determining which of # these differences differ across levels of Factor A # using trimmed means. # # For MOM or M-estimators, use spmcpi which uses a bootstrap method # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. # If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. # # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } JK<-J*K if(JK!=length(x))stop("Something is wrong. Expected ",JK," groups but x contains ", length(x), "groups instead.") MJ<-(J^2-J)/2 MK<-(K^2-K)/2 JMK<-J*MK MJMK<-MJ*MK Jm<-J-1 data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. 
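# For each level of Factor A, the K dependent groups are next collected
# into a matrix, rows with missing values are removed casewise, and the
# cleaned columns are stored back in x.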
} x<-data output<-matrix(0,MJMK,7) dimnames(output)<-list(NULL,c("A","A","B","B","psihat","sig","crit.sig")) jp<-1-K kv<-0 kv2<-0 test<-NA for(j in 1:J){ jp<-jp+K xmat<-matrix(NA,ncol=K,nrow=length(x[[jp]])) for(k in 1:K){ kv<-kv+1 xmat[,k]<-x[[kv]] } xmat<-elimna(xmat) for(k in 1:K){ kv2<-kv2+1 x[[kv2]]<-xmat[,k] }} m<-matrix(c(1:JK),J,K,byrow=T) ic<-0 for(j in 1:J){ for(jj in 1:J){ if(j 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01){ dvec<-alpha/c(1:ncon) } temp2<-order(0-test) zvec<-dvec[1:ncon] sigvec<-(test[temp2]>=zvec) for (ic in 1:ncol(con)){ output[temp2,7]<-zvec } output } bwmedbmcp<-function(J,K,x,JK=J*K,grp=c(1:JK),con=0,alpha=.05,dif=F,pool=F,bop=F,nboot=100,SEED=T){ # # All pairwise comparisons among levels of Factor B # in a split-plot design using trimmed means. # # Data are pooled for each level # of Factor B. # bop=T, use bootstrap estimates of standard errors. # FWE controlled with Rom's method # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. # If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. # # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } JK<-J*K data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. 
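# With pool=T, the data are next pooled over the levels of Factor A for
# each level of Factor B before the pairwise comparisons are made; with
# pool=F, the comparisons among the levels of B are done separately for
# each level of Factor A.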
} x<-data if(pool){ data<-list() m1<-matrix(c(1:JK),J,K,byrow=T) for(k in 1:K){ for(j in 1:J){ flag<-m1[j,k] if(j==1)temp<-x[[flag]] if(j>1){ temp<-c(temp,x[[flag]]) }} data[[k]]<-temp } print("Group numbers refer to levels of Factor B") if(!dif)temp<-lincdm(data,con=con,alpha=alpha,nboot=nboot,mop=bop) if(dif)temp<-qdmcpdif(data,con=con,alpha=alpha) return(temp) } if(!pool){ mat<-matrix(c(1:JK),ncol=K,byrow=T) for(j in 1:J){ data<-list() ic<-0 for(k in 1:K){ ic<-ic+1 data[[ic]]<-x[[mat[j,k]]] } print(paste("For level ", j, " of Factor A:")) if(!dif)temp<-lincdm(data,con=con,alpha=alpha,nboot=nboot,mop=bop) if(dif)temp<-qdmcpdif(data,con=con,alpha=alpha) print(temp$test) print(temp$psihat) }} } gamplot<-function(x,y,sop=F,pyhat=F,eout=F,xout=F,outfun=out,plotit=T, xlab="X",ylab="",zlab="",theta=50,phi=25,expand=.5,scale=F){ # # Plot regression surface using generalized additive model # # sop=F, use lowess # sop=T, use splines # library(akima) library(mgcv) x<-as.matrix(x) np<-ncol(x) np1<-np+1 if(ncol(x)>4)stop("x should have at most four columns of data") m<-elimna(cbind(x,y)) if(xout && eout)stop("Can't have xout=eout=T") if(eout){ flag<-outfun(m)$keep m<-m[flag,] } if(xout){ flag<-outfun(x,plotit=F)$keep m<-m[flag,] } x<-m[,1:np] x<-as.matrix(x) y<-m[,np1] if(!sop){ if(ncol(x)==1)fitr<-fitted(gam(y~x[,1])) if(ncol(x)==2)fitr<-fitted(gam(y~x[,1]+x[,2])) if(ncol(x)==3)fitr<-fitted(gam(y~x[,1]+x[,2]+x[,3])) if(ncol(x)==4)fitr<-fitted(gam(y~x[,1]+x[,2]+x[,3]+x[,4])) } if(sop){ if(ncol(x)==1)fitr<-fitted(gam(y~s(x[,1]))) if(ncol(x)==2)fitr<-fitted(gam(y~s(x[,1])+s(x[,2]))) if(ncol(x)==3)fitr<-fitted(gam(y~s(x[,1])+s(x[,2])+s(x[,3]))) if(ncol(x)==4)fitr<-fitted(gam(y~s(x[,1])+s(x[,2])+s(x[,3])+s(x[,4]))) } last<-fitr if(plotit){ if(ncol(x)==1){ plot(x,fitr,xlab=xlab,ylab=ylab) } if(ncol(x)==2){ iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] # Eliminate duplicate points in the x-y plane # This is necessary when doing three dimensional plots # with the S-PLUS function interp mkeep<-x[iout>=1,] fitr<-interp(mkeep[,1],mkeep[,2],fitr) persp(fitr,theta=theta,phi=phi,expand=expand,xlab="x1",ylab="x2",zlab="", scale=scale) } } if(!pyhat)last <- "Done" last } rgvar<-function(x,est=covmcd,...){ # # compute a robust generalized variance # # choices for est are: # var # covmcd # covmve # skipcov with MM=F (boxplot) MM=T (MAD-MEDIAN), op=1 (MGV method) # op=2 (projection method for outliers) # covroc (S+ only as of Dec, 2005) # Rocke's measure of scatter, this requires that the command # library(robust,first=T) has been executed. # library(MASS) val<-prod(eigen(est(x,...))$values) val } rgvarseb<-function(x,nboot=100,est=skipcov,SEED=T,...){ # n<-nrow(x) val<-NA for(i in 1:nboot){ data<-sample(n,n,replace=T) val[i]<-rgvar(x[data,],est=est,...) } se<-sqrt(var(val)) se } covmve<-function(x){ library(MASS) val<-cov.mve(x) list(center=val$center,cov=val$cov) } mvecov<-function(x){ library(MASS) val<-cov.mve(x) val$cov } rgvar2g<-function(x,y,nboot=100,est=covmcd,alpha=.05,cop=3,op=2,SEED=T,...){ # # Two independent groups. # Test hypothesis of equal generalized variances. # # Choices for est include: # var # covmcd # covmve # skipcov with MM=F (boxplot) MM=T (MAD-MEDIAN), op=1 (MGV method) # op=2 (projection method for outliers) # covroc Rocke's measure of scatter, this requires that the command # library(robust,first=T) has been executed. 
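# The statistic returned below is the difference between the two robust
# generalized variances divided by the square root of the sum of their
# squared bootstrap standard errors.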
# if(SEED)set.seed(2) se1<-rgvarseb(x,nboot=nboot,est=est,SEED=SEED,...) se2<-rgvarseb(y,nboot=nboot,est=est,SEED=SEED,...) dif<-rgvar(x,est=est,...)-rgvar(y,est=est,...) test.stat<-dif/sqrt(se1^2+se2^2) test.stat } covmcd<-function(x,nsamp="sample"){ # # nsamp="best" is the default used by R, # meaning that the number of samples is chosen so that # exhaustive enumeration is done up to 5000 samples # nsamp="sample" the number of samples # is min(5*p, 3000) # #library(lqs) library(MASS) val<-cov.mcd(x,nsamp=nsamp) list(center=val$center,cov=val$cov) } mcdcov<-function(x,nsamp="sample"){ # # nsamp="best" is the default used by R, # meaning that the number of samples is chosen so that # exhaustive enumeration is done up to 5000 samples # nsamp="sample" the number of samples # is min(5*p, 3000) # #library(lqs) library(MASS) val<-cov.mcd(x,nsamp=nsamp) val$cov } ancdes<-function(x,depfun=fdepth,...){ # # Choose points for design of an ANCOVA # x is the n by p matrix m. # if(!is.matrix(x))stop("x must be a matrix") temp<-depfun(x,plotit=F,...) temp2<-order(temp) val<-matrix(x[temp2[length(temp)],],ncol=ncol(x)) nmid<-round(length(temp)/2) id2<-(temp[temp2[nmid]]==temp) val2<-matrix(x[id2,],ncol=ncol(x)) if(!is.matrix(val2))val2<-t(as.matrix(val2)) val<-rbind(val,val2) val } stacklist<-function(x){ # # Assumes x has list mode with each entry a # matrix having p columns. # # Goal: stack the data into a matrix having p columns. # p<-ncol(x[[1]]) xx<-as.matrix(x[[1]]) for(j in 2:length(x)){ temp<-as.matrix(x[[j]]) xx<-rbind(xx,temp) } xx } smvar<-function(x,y,fr=.6,xout=T,eout=F,xlab="X",ylab="VAR(Y|X)",pyhat=F,plotit=T,nboot=40, RNA=F,SEED=T){ # # Estimate VAR(Y|X) using bagged version of running interval method # # xout=T eliminates all points for which x is an outlier. # eout=F eliminates all points for which (x,y) is an outlier. # # pyhat=T will return estimate for each x. # # RNA=T removes missing values when applying smooth # with RNA=F, might get NA for some pyhat values. # # plotit=T, scatterplot of points x versus square of # predicted y minus y # stemming from a smooth. Then plots a line indicating # var(y|x) using bagged smooth # temp <- cbind(x, y) temp <- elimna(temp) x <- temp[, 1] y <- temp[, 2] yhat<-lplot(x, y, pyhat = T, plotit = F)$yhat.values yvar<-(y-yhat)^2 estvar<-runmbo(x,y,est=var,pyhat=T,fr=fr,plotit=F,RNA=RNA,nboot=nboot) if(plotit){ plot(c(x,x),c(yvar,estvar),type="n",xlab=xlab,ylab=ylab) points(x,yvar) sx<-sort(x) xorder<-order(x) sysm<-estvar[xorder] lines(sx,sysm) } output <- "Done" if(pyhat)output <- estvar output } locvarsm<-function(x,y,pyhat=F,pts=x,plotit=T,nboot=40,RNA=T,xlab="X", ylab="VAR(Y|X)",op=2,xout=T,eout=F,pr=T,fr=.6,scat=T,outfun=out,SEED=T){ # # For each x, estimate VAR(y|x) using bootstrap bagging. # with # op=1 uses Fan's kernel method plus bootstrap bagging. # op=2 uses running interval smoother plus bootstrap bagging # # xout=T eliminates points where there are outliers among x values # this option applies only when using op=2 and when using # running interval smoother. # eout=T eliminates outliers among cloud of all data. # if(SEED)set.seed(2) temp<-cbind(x,y) temp<-elimna(temp) x<-temp[,1] y<-temp[,2] if(op==2){ if(pr){ print("Running interval method plus bagging has been chosen") print("op=1 will use Fan's method plus bagging") }} if(op==1){ if(pr){ print("Fan's method plus bagging has been chosen (cf. 
Bjerve and Doksum)") print("op=2 will use running interval plus bagging") } mat <- matrix(NA, nrow = nboot, ncol = nrow(temp)) for(it in 1:nboot) { idat <- sample(c(1:length(y)), replace = T) xx <- temp[idat, 1] yy <- temp[idat, 2] mat[it, ] <- locvar(xx,yy,pts=x,pyhat=T,plotit=F) } rmd<-apply(mat,2,mean) if(plotit) { plot(c(x, x), c(y, rmd), type = "n", xlab = xlab, ylab= ylab) sx <- sort(x) xorder <- order(x) sysm <- rmd[xorder] lines(sx, sysm) } output<-"Done" if(pyhat)output <- rmd } if(op==2){ output<-runmbo(x,y,fr=fr,est=var,xlab=xlab,ylab=ylab,pyhat=pyhat,eout=eout, xout=xout,RNA=RNA,plotit=plotit,scat=scat,nboot=nboot,outfun=outfun,SEED=SEED) } output } mcp2atm<-function(J,K,x,tr=.2,con=0,alpha=.05,grp=NA,op=F){ # # Do all pairwise comparisons of # main effects for Factor A and B and all interactions # based on trimmed means # # The data are assumed to be stored in x in list mode or in a matrix. # If grp is unspecified, it is assumed x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second factor: level 1,2 # x[[j+1]] is the data for level 2,1, etc. # If the data are in wrong order, grp can be used to rearrange the # groups. For example, for a two by two design, grp<-c(2,4,3,1) # indicates that the second group corresponds to level 1,1; # group 4 corresponds to level 1,2; group 3 is level 2,1; # and group 1 is level 2,2. # # Missing values are automatically removed. # JK <- J * K if(is.matrix(x)) x <- listm(x) if(!is.na(grp[1])) { yy <- x x<-list() for(j in 1:length(grp)) x[[j]] <- yy[[grp[j]]] } if(!is.list(x)) stop("Data must be stored in list mode or a matrix.") for(j in 1:JK) { xx <- x[[j]] x[[j]] <- xx[!is.na(xx)] # Remove missing values } # if(JK != length(x)) warning("The number of groups does not match the number of contrast coefficients.") for(j in 1:JK){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. x[[j]]<-temp } # Create the three contrast matrices temp<-con2way(J,K) conA<-temp$conA conB<-temp$conB conAB<-temp$conAB if(!op){ Factor.A<-lincon(x,con=conA,tr=tr,alpha=alpha) Factor.B<-lincon(x,con=conB,tr=tr,alpha=alpha) Factor.AB<-lincon(x,con=conAB,tr=tr,alpha=alpha) } All.Tests<-NA if(op){ Factor.A<-NA Factor.B<-NA Factor.AB<-NA con<-cbind(conA,conB,conAB) All.Tests<-lincon(x,con=con,tr=tr,alpha=alpha) } list(Factor.A=Factor.A,Factor.B=Factor.B,Factor.AB=Factor.AB,All.Tests=All.Tests,conA=conA,conB=conB,conAB=conAB) } mcp2med<-function(J,K,x,tr=.2,con=0,alpha=.05,grp=NA,op=F,pr=T){ # # Do all pairwise comparisons of # main effects for Factor A and B and all interactions # based on medians # # The data are assumed to be stored in x in list mode or in a matrix.\ # If grp is unspecified, it is assumed x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second factor: level 1,2 # x[[j+1]] is the data for level 2,1, etc. # If the data are in wrong order, grp can be used to rearrange the # groups. For example, for a two by two design, grp<-c(2,4,3,1) # indicates that the second group corresponds to level 1,1; # group 4 corresponds to level 1,2; group 3 is level 2,1; # and group 1 is level 2,2. # # Missing values are automatically removed. 
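#
# Illustrative call (hypothetical data): for a 2 by 3 design with the
# six groups stored in a matrix or in list mode x,
# mcp2med(2,3,x)
# returns the pairwise comparisons for Factor A, Factor B and the
# interactions, based on medians.
#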
# if(pr)print("Suggest using med2mcp instead, especially with tied values") JK <- J * K if(is.matrix(x)) x <- listm(x) if(!is.na(grp[1])) { yy <- x x<-list() for(j in 1:length(grp)) x[[j]] <- yy[[grp[j]]] } if(!is.list(x)) stop("Data must be stored in list mode or a matrix.") # if(JK != length(x)){ print("Warning:") print("Number of groups does not match the number of contrast coefficients.") } for(j in 1:JK){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. x[[j]]<-temp } # Create the three contrast matrices # temp<-con2way(J,K) conA<-temp$conA conB<-temp$conB conAB<-temp$conAB if(!op){ Factor.A<-msmed(x,con=conA,alpha=alpha) Factor.B<-msmed(x,con=conB,alpha=alpha) Factor.AB<-msmed(x,con=conAB,alpha=alpha) } All.Tests<-NA if(op){ Factor.A<-NA Factor.B<-NA Factor.AB<-NA con<-cbind(conA,conB,conAB) All.Tests<-msmed(x,con=con,tr=tr,alpha=alpha) } list(Factor.A=Factor.A,Factor.B=Factor.B,Factor.AB=Factor.AB, All.Tests=All.Tests,conA=conA,conB=conB,conAB=conAB) } mdifloc<-function(x,y,est=tukmed,...){ # # Compute multivariate measure of location associated # with the distribution of x-y # # By default, use Tukey's median. # x<-as.matrix(x) y<-as.matrix(y) FLAG<-F if(ncol(x)!=ncol(y))stop("x and y should have the same number of columns") if(ncol(x)==1 && ncol(y)==1)FLAG<-T if(FLAG)val<-loc2dif(x,y,est=est,...) if(!FLAG){ J<-(ncol(x)^2-ncol(x))/2 mat<-matrix(NA,ncol=ncol(x),nrow=nrow(x)*nrow(y)) for(j in 1:ncol(x))mat[,j]<-as.vector(outer(x[,j], y[,j], FUN = "-")) val<-est(mat,...) } val } mdiflcr<-function(m1,m2,tr=.5,nullv=rep(0,ncol(m1)),plotit=T, SEED=T,pop=1,fr=.8,nboot=600){ # # For two independent groups, let D=X-Y. # Let theta_D be median of marginal distributions # Goal: Test theta_D=0 # # This is a multivariate analog of Wilcoxon-Mann-Whitney method # Only alpha=.05 can be used. # # When plotting: # pop=1 Use scatterplot # pop=2 Use expected frequency curve. # pop=3 Use adaptive kernel density # if(!is.matrix(m1))stop("m1 is not a matrix") if(!is.matrix(m2))stop("m2 is not a matrix") if(ncol(m1)!=ncol(m2))stop("number of columns for m1 and m2 are not equal") n1<-nrow(m1) n2<-nrow(m2) if(SEED)set.seed(2) data1 <- matrix(sample(n1, size = n1 * nboot, replace = T), nrow = nboot) data2 <- matrix(sample(n2, size = n2 * nboot, replace = T), nrow = nboot) bcon <- matrix(NA, ncol = ncol(m1), nrow = nboot) for(j in 1:nboot)bcon[j,]<-mdifloc(m1[data1[j,],],m2[data2[j,],],est=lloc,tr=tr) tvec<-mdifloc(m1,m2,est=lloc,tr=tr) tempcen <- apply(bcon, 1, mean) smat <- var(bcon - tempcen + tvec) temp <- bcon - tempcen + tvec bcon <- rbind(bcon, nullv) dv <- mahalanobis(bcon, tvec, smat) bplus <- nboot + 1 sig.level <- 1 - sum(dv[bplus] >= dv[1:nboot])/nboot if(plotit && ncol(m1)==2){ if(pop==2)rdplot(mdif,fr=fr) if(pop==1){ plot(mdif[,1],mdif[,2],xlab="VAR 1",ylab="VAR 2",type="n") points(mdif[,1],mdif[,2],pch=".") points(center[1],center[2],pch="o") points(0,0,pch="+") } if(pop==3)akerdmul(mdif,fr=fr) } list(p.value=sig.level,center=tvec) } mwmw<-function(m1,m2,cop=5,pr=T,plotit=T,pop=1,fr=.8,op=1,dop=1){ # # Compute measure of effect size, p, # a multivariate analog of Wilcoxon-Mann-Whitney p # # When plotting: # pop=1 Use scatterplot # pop=2 Use expected frequency curve. 
# pop=3 Use adaptive kernel density # # dop=1, use method A1 approximation of halfspace depth # dop=2, use method A2 approximation of halfspace depth # # cop determines how center of data is determined when # approximating halfspace depth # cop=1, Halfspace medina # cop=2, MCD # cop=3, marginal medians # cop=4, MVE # cop=5, skipped mean # library(akima) if(pr)print("New critical value used as of Feb. 2005") if(!is.matrix(m1))stop("m1 is not a matrix") if(!is.matrix(m2))stop("m2 is not a matrix") if(ncol(m1)!=ncol(m2))stop("number of columns for m1 and m2 are not equal") if(ncol(m1)==1)stop("Use R function cid or bmp") nn<-min(c(nrow(m1),nrow(m2))) mdif<-matrix(as.vector(outer(m1[,1],m2[,1],"-")),ncol=1) for(j in 2:ncol(m1)){ mdif<-cbind(mdif,matrix(as.vector(outer(m1[,j],m2[,j],"-")),ncol=1)) } if(op==1){ if(ncol(m1)==2)temp2<-depth2(rbind(mdif,c(rep(0,ncol(m1))))) #if(ncol(m1)==3)temp2<-depth3(rbind(mdif,c(rep(0,ncol(m1))))) if(ncol(m1)>2){ if(cop==1)center<-dmean(mdif,tr=.5,dop=dop) if(cop==2)center<-cov.mcd(mdif)$center if(cop==3)center<-apply(mdif,2,median) if(cop==4)center<-cov.mve(mdif)$center if(cop==5)center<-smean(mdif) temp2<-fdepth(rbind(mdif,c(rep(0,ncol(m1))))) }} if(op==2){ temp2<-pdis(rbind(mdif,c(rep(0,ncol(m1))))) temp2<-1/(temp2+1) } center<-dmean(mdif,tr=.5,dop=dop) phat<-temp2[nrow(mdif)+1]/max(temp2) # phat is relative depth of zero vector # Determine critical value crit<-NA alpha<-c(.1,.05,.025,.01) crit[1]<-1-1.6338/sqrt(nn) crit[2]<-1-1.8556/sqrt(nn) crit[3]<-1-2.0215/sqrt(nn) crit[4]<-1-2.1668/sqrt(nn) if(pr){ print("For alpha=.1,.05,.025,.01, the correspoding critical values are") print(crit) print("Reject if phat is less than or equal to the critical value") } if(plotit && ncol(m1)==2){ if(pop==2)rdplot(mdif,fr=fr) if(pop==1){ plot(mdif[,1],mdif[,2],xlab="VAR 1",ylab="VAR 2",type="n") points(mdif[,1],mdif[,2],pch=".") points(center[1],center[2],pch="o") points(0,0,pch="+") } if(pop==3)akerdmul(mdif,fr=fr) } list(phat=phat,center=center,crit.val=crit) } qreg<-function(x, y,qval=.5,op=1,v2=T,pr=F) { # # Compute the quantile regression line. That is, the goal is to # determine the qth (qval) quantile of Y given X using the # the Koenker-Basset approach. # # v2=T, uses the function rq in the R library quantreg # v2=F, uses an older and slower version # x<-as.matrix(x) X<-cbind(x,y) X<-elimna(X) np<-ncol(X) p<-np-1 x<-X[,1:p] x<-as.matrix(x) y<-X[,np] if(!v2){ temp<-ltareg(x,y,0,op=op) if(qval==.5){ coef<-temp$coef res<-temp$res } if(qval!=.5){ START<-temp$coef coef<-nelderv2(X,np,FN=qreg.sub,START=START,qval=qval) }} if(v2){ if(pr){ print("v2=T attempts to use a faster version by calling") print("the function rq, which is stored in the library quantreg,") print("which can be downloaded from") print("http://cran.r-project.org/src/contrib/PACKAGES.html") print("On a PC, store quantreg in the library subdirectory of R") print("On a unix machine, try the command install.packages('quantreg')") print("To avoid this message, use pr=F") print(" ") } library(quantreg) x<-as.matrix(x) temp<-rq(y~x,tau=qval) coef<-temp[1]$coefficients } res <- y - x%*%coef[2:np] - coef[1] list(coef = coef, residuals = res) } qindbt.sub<-function(isub,x,y,qval){ # # Perform regression using x[isub] to predict y[isub] # isub is a vector of length n, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # # This function is used by other functions when computing # bootstrap estimates. 
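# For each quantile in qval, the quantile regression line is refit to
# the bootstrap sample (x[isub,], y[isub]) and the resulting slope
# estimates are returned.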
# # regfun is some regression method already stored in S-PLUS # It is assumed that regfun$coef contains the intercept and slope # estimates produced by regfun. The regression methods written for # this book, plus regression functions in S-PLUS, have this property. # # x is assumed to be a matrix containing values of the predictors. # xmat<-matrix(x[isub,],nrow(x),ncol(x)) regboot<-NA for(i in 1:length(qval)){ regboot[i]<-qreg(xmat,y[isub],qval[i])$coef[2] } regboot } runmq<-function(x,y,HD=F,qval=c(.2,.5,.8),xlab="X",ylab="Y",fr=1, sm=F,nboot=40,SEED=T,eout=F,xout=F,...){ # # Plot of running interval smoother based on specified quantiles in # qval # # fr controls amount of smoothing # tr is the amount of trimming # # Missing values are automatically removed. # rmd1<-NA xx<-cbind(x,y) p<-ncol(xx)-1 xx<-elimna(xx) x<-xx[,1:p] y<-xx[,ncol(xx)] plot(x,y,xlab=xlab,ylab=ylab) sx1<-sort(x) xorder1<-order(x) for(it in 1:length(qval)){ if(!sm){ if(!HD)temp<-rungen(x,y,est=qest,fr=fr,pyhat=T,plotit=F,q=qval[it]) if(HD)temp<-rungen(x,y,est=hd,fr=fr,pyhat=T,plotit=F,q=qval[it]) rmd1<-temp[1]$output sysm1<-rmd1[xorder1] lines(sx1,sysm1) } if(sm){ if(!HD)temp<-runmbo(x,y,est=qest,fr=fr,pyhat=T,plotit=F,SEED=SEED, nboot=nboot,eout=F,xout=F,q=qval[it]) if(HD)temp<-runmbo(x,y,est=hd,fr=fr,pyhat=T,plotit=F,SEED=SEED, nboot=nboot,eout=F,xout=F,q=qval[it]) rmd1<-temp sysm1<-rmd1[xorder1] lines(sx1,sysm1) } }} ritest<-function(x,y,adfun=adrun,plotfun=lplot,eout=F,xout=T,plotit=T,flag=3, nboot=500,alpha=.05,tr=.2,...){ # # There are two methods for testing for regression interactions # using robust smooths. # The first, performed by this function, fits an additive model # and test the hypothesis that the residuals, given x, is a # horizontal plane. # # The second, which is done by function adtest, tests the hypothesis # that a generalized additive model fits the data. # # Plot used to investigate regression interaction # (the extent a generalized additive model does not fit data). # Compute additive fit, plot residuals # versus x, an n by 2 matrix. # if(!is.matrix(x))stop(" x must be a matrix") if(ncol(x)!=2)stop(" x must have two columns only") yhat<-adfun(x,y,pyhat=T,eout=eout,xout=xout,plotit=F) res<-y-yhat output<-indt(x,res,flag=flag,nboot=nboot,alpha=alpha,tr=tr) if(plotit)plotfun(x,y-yhat,eout=eout,xout=xout,expand = 0.5,scale=F,xlab="X", ylab="Y",zlab="",theta=50,phi=25,...) output } gvar2g<-function(x,y,nboot=100,DF=T,eop=1,est=skipcov, alpha=.05,cop=3,op=1,MM=F,SEED=T,pr=F,fast=F,...){ # # Two independent groups. # Test hypothesis of equal generalized variances. # # DF=T, means skipcov with MM=F is used. # # That is, W-estimator based on a projection outlier detection method # and Carling's method applied to projections. # if equal sample sizes, adjusted critical value is used where appopriate # # DF=F # no adjusted critical value is used and any robust measure of # scatter can be used. # # Choices for est include: # var # covmcd # covmve # skipcov with MM=F (boxplot) MM=T (MAD-MEDIAN), op=1 (MGV method) # op=2 (projection method for outliers) # covroc Rocke's measure of scatter, # # op, cop and eop, see skipcov # adjusted critical level should be used with # skipcov and alpha=.05 only. # fast=T, will use skipcov.for if it is available. 
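#
# Illustrative call (hypothetical data): with x and y each an n by p
# matrix, p>1,
# gvar2g(x,y)
# returns a bootstrap p-value for the hypothesis of equal generalized
# variances, plus an adjusted critical level when one is available.
#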
# if(SEED)set.seed(2) if(!is.matrix(x))stop("x should be a matrix with ncol>1") if(!is.matrix(y))stop("y should be a matrix with ncol>1") if(ncol(x)==1 || ncol(y)==1)stop("Only multivariate data are allowed") n1<-nrow(x) n2<-nrow(y) adalpha<-NA if(DF){ if(n1==n2 && alpha==.05){ p1<-ncol(x) if(p1==2){ if(n1>=20)adalpha<-1.36/n1+.05 } if(p1==3){ if(n1>=20)adalpha<-1.44/n+.05 } if(p1==4){ if(n1>=40)adalpha<-2.47/n1+.05 } if(p1==5){ if(n1>=40)adalpha<-3.43/n+.05 } if(p1==6){ if(n1>=60)adalpha<-4.01/n1+.05 }}} val<-NA for(j in 1:nboot) { data1 <- sample(n1, size = n1, replace = T) data2 <- sample(n2, size = n2, replace = T) if(!DF){ val[j]<-rgvar(as.matrix(x[data1,]),est=est,...)- rgvar(as.matrix(y[data2,]),est=est,...) } if(DF){val[j]<- if(!fast){ rgvar(as.matrix(x[data1,]),est=skipcov,op=op,outpro.cop=cop,MM=MM,...)- rgvar(as.matrix(y[data2,]),est=skipcov,op=op,outpro.cop=cop,MM=MM,...) } if(fast){ rgvar(as.matrix(x[data1,]),est=skipcov.for,op=op,outpro.cop=cop,MM=MM,...)- rgvar(as.matrix(y[data2,]),est=skipcov.for,op=op,outpro.cop=cop,MM=MM,...) } if(pr)print(c(j,val[j])) }} p.value<-sum(val<0)/nboot p.value<-2*min(p.value,1-p.value) list(p.value=p.value,adjusted.crit.level=adalpha) } grit<-function(x,y,itest=1,sm.fun=rplot,nboot=500,alpha=.05,SEED=T, fr=1,plot.fun=rplot,plotit=T,...){ # # Fit a running interval smoother using projection distances # excluding the predictor variable itest # itest=1 by default, meaning that the goal is to test # the hypothesis that the first variable does not contribute # to the regression model # # Method fits a smooth using x_1, ..., x_p, excluding variabe itest # Then x_itest and the resulting residuals are passed to indt # Alternative choices for smooth include # sm.fun=lplot, and if p>2, runpd # if(!is.matrix(x))stop("Should have two or more predictors stored in a matrix") p<-ncol(x) pp<-p+1 x<-elimna(cbind(x,y)) y<-x[,pp] x<-x[,1:p] flag<-rep(T,ncol(x)) flag[itest]<-F temp<-sm.fun(x[,flag],y,plotit=F,pyhat=T,fr=fr) res<-y-temp test.it<-indt(x[,itest],res) if(plotit)plot.fun(x[,itest],res,...) test.it } stackit<-function(x,jval){ # # Take a matrix having p columns and convert # it to a matrix having jval columns and np/jval rows # So take first jval columns, and rbind this with # next jval columns, etc. # x<-as.matrix(x) chkit<-ncol(x)%%jval if(chkit!=0)stop("ncol(x) is not a multiple of jval") xval<-x[,1:jval] xval<-as.matrix(xval) iloop<-ncol(x)/jval-1 il<-1 iu<-jval for(i in 1:iloop){ il<-il+jval iu<-iu+jval temp<-x[,il:iu] temp<-as.matrix(temp) xval<-rbind(xval,temp) } xval } ancmg<-function(x,y,pool=T,jcen=1,fr=1,depfun=fdepth,nmin=8,op=3,tr=.2,pts=NA, SEED=T,pr=T,cop=3,con=0,nboot=NA,alpha=.05,bhop=F){ # # ANCOVA # for two or more groups based on trimmed means or medians # Multiple covariates are allowed. # # op=1 use omnibus test for trimmed means, with trimming given by tr # op=2 use omnibus test for medians. # (Not recommended when there are tied values, use op=4) # op=3 multiple comparisons using trimming and percentile bootstrap. # This method seems best for general use. # op=4 multiple comparisons using medians and percentile bootstrap # # y is matrix with J columns, so have J groups. # or y can have list mode with length J # # x is a matrix with Jp columns, so first p columns # correspond to the p covariates in the first group, etc. # Or, # x can have list mode with length J and each component # being a matrix with p columns. 
# So if covariates for group 1 are in the matrix m1 # x[[1]]<-m1 will store them in x, x having list mode # # nmin is the minimum sample size allowed for any group # when testing hypotheses. # If a design point results in a sample size ncol(x))stop("jcen has an invalid value") xcen<-x[,js:jcenp] } if(is.list(x))xcen<-x[[jcen]] if(pool){ if(is.matrix(x))xval<-stackit(x,pval) if(is.list(x))xval<-stacklist(x) mval<-cov.mve(xval) pts<-ancdes(xval,depfun=depfun,cop=cop) } if(!pool){ pts<-ancdes(xcen,depfun=depfun,cop=cop) mval<-cov.mve(xcen) } nval<-matrix(NA,ncol=J,nrow=nrow(pts)) icl<-0-pval+1 icu<-0 for(j in 1:J){ icl<-icl+pval icu<-icu+pval for(i in 1:nrow(pts)){ if(is.matrix(x) && is.matrix(y)){ nval[i,j]<-length(y[near3d(x[,icl:icu],pts[i,],fr,mval),j]) } if(is.matrix(x) && is.list(y)){ tempy<-y[[j]] nval[i,j]<-length(tempy[near3d(x[,icl:icu],pts[i,],fr,mval)]) } if(is.list(x) && is.matrix(y)){ xm<-as.matrix(x[[j]]) nval[i,j]<-length(y[near3d(xm,pts[i,],fr,mval),j]) } if(is.list(x) && is.list(y)){ tempy<-y[[j]] xm<-as.matrix(x[[j]]) nval[i,j]<-length(tempy[near3d(xm,pts[i,],fr,mval)]) } # }} flag<-rep(T,nrow(pts)) for(i in 1:nrow(pts)){ if(min(nval[i,])=nmin && sum(flagr)>=nmin){ yl<-est(y[flagl],...) yr<-est(y[flagr],...) xl<-est(x[flagl],...) xr<-est(x[flagr],...) vals[i]<-(yr-yl)/(xr-xl) }} if(plotit){ plot(c(x,x[1],x[2]),c(vals,-5,5),xlab=xlab,ylab=ylab) xord<-order(x) lines(x[xord],vals[xord]) } vals } rslopesm<-function(x,y,fr=1,est=tmean,nmin=10,pts=x,plotit=F,xlab="X", ylab="Y",SEED=T,nboot=40,xout=F,RNA=T,atr=.2,scat=T,pyhat=T,...){ # # For a regression line predicting Y given X # Estimate slope at points in pts with bagging # followed by a smooth. # # pyhat=T, returns estimated slopes corresponding to the sorted # x values. # fr controls amount of smoothing # atr controls the amount of trimming. # # OUTPUT: by default, the estimated slopes at # X_1<=X_2<=...<=X_n # That is, for the x values written in ascending order, the # slope is estimated for each value. If the slope is not considered # estimable, the estimate is set to NA. # # pts is used if the goal is to estimate the slope for some # other collection of points. # # nmin controls how many points close to x are required when # deciding that the slope is estimable. # plotit=T will plot the estimates. # # The plotted points are the estimates using rslope and # the solid line gives the estimated values reported by this function # # Missing values are automatically removed. # if(SEED) set.seed(2) temp<-cbind(x,y) if(ncol(temp)!=2)stop("One predictor only is allowed") temp<-elimna(temp) # Eliminate any rows with missing values if(xout) { flag <- outfun(temp[, 1], plotit = F)$keep temp <- temp[flag, ] x<-temp[,1] y<-temp[,2] } flag<-order(x) x<-x[flag] y<-y[flag] mat<-matrix(NA,nrow=nboot,ncol=length(pts)) vals<-NA for(it in 1:nboot) { idat <- sample(c(1:length(y)), replace = T) xx <- temp[idat, 1] yy <- temp[idat, 2] # mat[it, ] <- runhat(xx, yy, pts = x, est = est, fr = fr, ...) mat[it,]<-rslope(xx,yy,fr=fr,est=est,nmin=nmin,pts=x,plotit=F) } rmd<-apply(mat,2,mean,na.rm=RNA,tr=atr) flag<-is.na(rmd) rmdsm<-lplot(x,rmd,pyhat=T,plotit=plotit) output<-"Done" if(pyhat){ temp<-rep(NA,length(x)) temp[!flag]<-rmdsm output<-temp } output } out<-function(x,cov.fun=covmve,plotit=T,SEED=T,xlab="X",ylab="Y",...){ # # Search for outliers using either the minimum volume ellipsoid method # or the minimum covariance determinant (MCD) method. # mcd=T uses the MCD method # # x is an n by p matrix or a vector of data. 
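# A point is flagged as an outlier when its robust distance exceeds
# sqrt(qchisq(.975,p)); for a vector of data the distance is
# |x-median(x)|/mad(x).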
# # The function returns the values flagged as an outlier plus # the (row) number where the data point is stored. # If x is a vector, out.id=4 indicates that the fourth observation # is an outlier and outval=123 indicates that 123 is the value. # If x is a matrix, out.id=4 indicates that the fourth row of # the matrix is an outlier and outval reports the corresponding # values. # # The function also returns the distance of the # points identified as outliers # in the variable dis. # # For bivariate data, if plotit=T, plot points and circle outliers. # # cov.fun determines how the measure of scatter is estimated. # Possible hoices are # covmve (the MVE estimate) # covmcd (the MCD estimate) # covmba2 (the MBA or median ball algorithm) # covnnve (Wang-Rafferty method, JASA, 2002) # if(SEED)set.seed(12) if(is.list(x))stop("Data cannot be stored in list mode") x<-elimna(x) # missing data are removed if(!is.matrix(x)){ dis<-(x-median(x))^2/mad(x)^2 crit<-sqrt(qchisq(.975,1)) vec<-c(1:length(x)) } if(is.matrix(x)){ mve<-cov.fun(x,...) dis<-mahalanobis(x,mve$center,mve$cov) crit<-sqrt(qchisq(.975,ncol(x))) vec<-c(1:nrow(x)) } dis<-sqrt(dis) chk<-ifelse(dis>crit,1,0) id<-vec[chk==1] keep<-vec[chk==0] if(is.matrix(x)){ if(ncol(x)==2 && plotit){ plot(x[,1],x[,2],xlab=xlab, ylab=ylab,type="n") flag<-rep(T,nrow(x)) flag[id]<-F points(x[flag,1],x[flag,2],pch="*") if(sum(!flag)>0)points(x[!flag,1],x[!flag,2],pch="o") }} if(!is.matrix(x))outval<-x[id] if(is.matrix(x))outval<-x[id,] list(out.val=outval,out.id=id,keep=keep,dis=dis,crit=crit) } m1way<-function(x,est=hd,nboot=599,SEED=T,...){ # # Test the hypothesis that J measures of location are equal # using the percentile bootstrap method. # By default, medians are compared using 599 bootstrap samples. # and the Harrell-Davis Estimator. To use the usual sample median, set # est=median # # The data are assumed to be stored in x in list mode. Thus, # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J, say. # # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in list mode or a matrix.") J<-length(x) nval<-vector("numeric",length(x)) gest<-vector("numeric",length(x)) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. bvec<-matrix(0,J,nboot) print("Taking bootstrap samples. Please wait.") for(j in 1:J){ print(paste("Working on group ",j)) nval[j]<-length(x[[j]]) gest[j]<-est(x[[j]]) xcen<-x[[j]]-est(x[[j]],...) data<-matrix(sample(xcen,size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,est,...) # A J by nboot matrix # containing the bootstrap values of est. } teststat<-wsumsq(gest,nval) testb<-apply(bvec,2,wsumsq,nval) p.value<-1 - sum(teststat >= testb)/nboot teststat<-wsumsq(gest,nval) list(teststat=teststat,p.value=p.value) } oancpb<-function(x1,y1,x2,y2,est=tmean,tr=.2,pts=NA,fr1=1,fr2=1,nboot=600, alpha=.05,plotit=T,SEED=T,PRO=F,...){ # # Compare two independent groups using an ancova method # with a percentile bootstrap combined with a running interval # smooth. # # This function performs an omnibus test using data corresponding # to K design points specified by the argument pts. If # pts=NA, K=5 points are chosen for you (see Introduction to Robust # Estimation and Hypothesis Testing.) # Null hypothesis is that conditional distribution of Y, given X for first # group, minus the conditional distribution of Y, given X for second # group is equal to zero. 
# The strategy is to choose K specific X values # and then test the hypothesis that all K differences are zero. # # If you want to choose specific X values, Use the argument # pts # Example: pts=c(1,3,5) will use X=1, 3 and 5. # # For multiple comparisons using these J points, use ancpb # # Assume data are in x1 y1 x2 and y2 # # PRO=F, means Mahalanobis distance is used. # PRO=T, projection distance is used. # # fr1 and fr2 are the spans used to fit a smooth to the data. # gv1<-vector("list") if(is.na(pts[1])){ isub<-c(1:5) # Initialize isub test<-c(1:5) xorder<-order(x1) y1<-y1[xorder] x1<-x1[xorder] xorder<-order(x2) y2<-y2[xorder] x2<-x2[xorder] n1<-1 n2<-1 vecn<-1 for(i in 1:length(x1))n1[i]<-length(y1[near(x1,x1[i],fr1)]) for(i in 1:length(x1))n2[i]<-length(y2[near(x2,x1[i],fr2)]) for(i in 1:length(x1))vecn[i]<-min(n1[i],n2[i]) sub<-c(1:length(x1)) isub[1]<-min(sub[vecn>=12]) isub[5]<-max(sub[vecn>=12]) isub[3]<-floor((isub[1]+isub[5])/2) isub[2]<-floor((isub[1]+isub[3])/2) isub[4]<-floor((isub[3]+isub[5])/2) for (i in 1:5){ j<-i+5 temp1<-y1[near(x1,x1[isub[i]],fr1)] temp2<-y2[near(x2,x1[isub[i]],fr2)] temp1<-temp1[!is.na(temp1)] temp2<-temp2[!is.na(temp2)] gv1[[i]]<-temp1 gv1[[j]]<-temp2 } # loc<-NA if(SEED)set.seed(2) bvec<-matrix(NA,nrow=nboot,ncol=5) for(j in 1:5){ k<-j+5 loc[j]<-est(gv1[[j]])-est(gv1[[k]]) xx<-matrix(sample(gv1[[j]],size=length(gv1[[j]])*nboot,replace=T), nrow=nboot) yy<-matrix(sample(gv1[[k]],size=length(gv1[[k]])*nboot,replace=T), nrow=nboot) bvec[,j]<-apply(xx,1,FUN=est,...)-apply(yy,1,FUN=est,...) } nullv<-rep(0,5) if(!PRO){ mvec<-apply(bvec,2,FUN=mean) m1<-var(t(t(bvec)-mvec+loc)) temp<-mahalanobis(rbind(bvec,nullv),loc,m1) } if(PRO){ temp<-pdis(rbind(bvec,nullv)) } sig.level<-sum(temp[nboot+1]nullval || chkit[2]nullval || chkit[2]150)fr<-.09 } } if(SEED)set.seed(2) x<-as.matrix(x) mflag<-matrix(NA,nrow=length(y),ncol=length(y)) for (j in 1:length(y)){ for (k in 1:length(y)){ mflag[j,k]<-(sum(x[j,]<=x[k,])==ncol(x)) } } yhat<-adrun(x,y,est=est,plotit=F,fr=fr,pyhat=T) regres<-y-yhat print("Taking bootstrap sample, please wait.") data<-matrix(runif(length(y)*nboot),nrow=nboot) data<-sqrt(12)*(data-.5) # standardize the random numbers. rvalb<-apply(data,1,adtests1,yhat,regres,mflag,x,fr) # An n x nboot matrix of R values rvalb<-rvalb/sqrt(length(y)) dstatb<-apply(abs(rvalb),2,max) wstatb<-apply(rvalb^2,2,mean) v<-c(rep(1,length(y))) rval<-adtests1(v,yhat,regres,mflag,x,fr) rval<-rval/sqrt(length(y)) dstat<-max(abs(rval)) wstat<-mean(rval^2) p.value.d<-1-sum(dstat>=dstatb)/nboot p.value.w<-1-sum(wstat>=wstatb)/nboot list(dstat=dstat,wstat=wstat,p.value.d=p.value.d,p.value.w=p.value.w) } rhom<-function(x,y,op=1,op2=F,tr=.2,plotit=T,xlab="X",ylab="ABS(res)", est=median,sm=F,SEED=T){ # For regression model, Y=m(X)+s(X)e, # where s(X) models heteroscedasticity, and e has median 0, # test hypothesis s(X)=1 for any X # # For p>1, method tests for each p whether residuals and x_j # have a horizontal regression line. # # op2=F, tests for homogeneity using running interval smoother # op2=T, test of independence based on Y-M(Y), M(Y) some measure # of location given by argument est. # In general, op2=T should NOT be used when the goal is to test # the hypothesis of a homoscedastic error term. # # op=1 test using regression method (function regci) # op=2 test using Winsorized correlation # tr is amount of winsorizing. 
# op=3 test using a wild boostrap method # x<-as.matrix(x) p<-ncol(x) pp<-p+1 xy<-elimna(cbind(x,y)) x<-xy[,1:p] y<-xy[,pp] x<-as.matrix(x) output<-NA if(ncol(x)==1){ if(!op2)res<-y-runhat(x,y,est=est,pts=x) if(op2)res<-y-est(y) if(op==1)output<-regci(x,abs(res),SEED=SEED,pr=F)$p.value[2] if(op==2)output<-wincor(x,abs(res),tr=tr)$siglevel if(op==3)output<-indt(x,abs(res),tr=0,SEED=SEED)$p.value.d } if(ncol(x)>1){ pv<-ncol(x)+1 if(!op2)res<-y-rung3hat(x,y,est=est,pts=x)$rmd if(op2)res<-y-est(y) if(op==1)output<-regci(x,abs(res),pr=F)$sig.level[2:pv] if(op==2)output<-winall(cbind(x,abs(res)),tr=tr)$siglevel[1:ncol(x),pv] if(op==3)output<-indt(x,abs(res),tr=0,SEED=SEED)$p.value.d } if(plotit){ if(ncol(x)==1){ if(!sm)rungen(x,abs(res),est=est,xlab=xlab,ylab=ylab) if(sm)runmbo(x,abs(res),est=est,xlab=xlab,ylab=ylab) } if(ncol(x)==2){ if(sm)rung3d(x,abs(res),est=est,xlab=xlab,ylab=ylab) if(!sm)run3bo(x,abs(res),est=est,xlab=xlab,ylab=ylab) }} list(p.value=output) } gk.sigmamu <- function(x, c1 = 4.5, c2 = 3.0, mu.too = FALSE, ...) { n <- length(x) medx <- median(x) sigma0 <- median(abs(x - medx)) w <- abs(x - medx) / sigma0 w <- ifelse(w<=c1,(1.0 - (w / c1)^2)^2,0) mu <- sum(x * w) / sum(w) x <- (x - mu) / sigma0 rho <- x^2 rho[rho > c2^2] <- c2^2 sigma2 <- sigma0^2 / n * sum(rho) if(mu.too) c(mu, sqrt(sigma2)) else sqrt(sigma2) } gk <- function(x, y, ...) { ((gk.sigmamu(x + y, ...))^2 - (gk.sigmamu(x - y, ...))^2) / 4.0 } hard.rejection <- function(distances, p, beta = 0.9, ...) { d0 <- qchisq(beta, p) * median(distances) / qchisq(0.5, p) weights <- double(length(distances)) weights[distances <= d0] <- 1.0 weights } # # # gkcov<-function(x,y,gk.sigmamu=taulc,...){ # # Compute robust covariance using the Gnanadesikan-Kettenring # estimator. # (cf. Marrona & Zomar, 2002, Technometrics # val<-.25*(gk.sigmamu(x+y,...)-gk.sigmamu(x-y,...)) val } covogk<-function(x,sigmamu=taulc,v=gkcov,n.iter=1,beta=.9,...){ # # Compute robust (weighted) covariance matrix in Maronna and Zamar # (2002, Technometrics, eq. 7). # # x is an n by p matrix # n.iter number of iterations. 1 seems to be best # sigmamu is any user supplied function having the form # sigmamu(x,mu.too=F) and which computes a robust measure of # of dispersion if mu.too=F. If mu.too=T, it returns # a robust measure of location as well. # v is any robust covariance # if(!is.matrix(x))stop("x should be a matrix") x<-elimna(x) # remove any rows with missing data temp<-ogk.pairwise(x,sigmamu=sigmamu,v=v,n.iter=n.iter,beta=beta,...)$wcovmat temp } ogk<-function(x,sigmamu=taulc,v=gkcov,n.iter=1,beta=.9,...){ # # Compute robust (weighted) covariance matrix in Maronna and Zamar # (2002, Technometrics, eq. 7). # # x is an n by p matrix # n.iter number of iterations. 1 seems to be best # sigmamu is any user supplied function having the form # sigmamu(x,mu.too=F) and which computes a robust measure of # of dispersion if mu.too=F. If mu.too=T, it returns # a robust measure of location as well. # v is any robust covariance # if(!is.matrix(x))stop("x should be a matrix") x<-elimna(x) # remove any rows with missing data temp<-ogk.pairwise(x,sigmamu=sigmamu,v=v,n.iter=n.iter,beta=beta,...) list(center=temp$wcenter,cov=temp$wcovmat) } ogk.pairwise <- function(X,n.iter=1,sigmamu=taulc,v=gkcov,beta=.9,...) #weight.fn=hard.rejection,beta=.9,...) { # Downloaded (and modified slightly) from www.stats.ox.ac.uk/~konis/pairwise.q # Corrections noted by V. 
Todorov have been incorporated # data.name <- deparse(substitute(X)) X <- as.matrix(X) n <- dim(X)[1] p <- dim(X)[2] Z <- X U <- diag(p) A <- list() # Iteration loop. for(iter in 1:n.iter) { # Compute the vector of standard deviations d and # the correlation matrix U. d <- apply(Z, 2, sigmamu, ...) Z <- sweep(Z, 2, d, '/') for(i in 1:(p - 1)) { for(j in (i + 1):p) { U[j, i] <- U[i, j] <- v(Z[ , i], Z[ , j], ...) } } # Compute the eigenvectors of U and store them in # the columns of E. E <- eigen(U, symmetric = TRUE)$vectors # Compute A, there is one A for each iteration. A[[iter]] <- d * E # Project the data onto the eigenvectors. Z <- Z %*% E } # End of orthogonalization iterations. # Compute the robust location and scale estimates for # the transformed data. # sqrt.gamma <- apply(Z, 2, sigmamu, mu.too = TRUE, ...) sqrt.gamma <- apply(Z, 2, sigmamu, mu.too = TRUE) center <- sqrt.gamma[1, ] sqrt.gamma <- sqrt.gamma[2, ] # Compute the mahalanobis distances. Z <- sweep(Z, 2, center) Z <- sweep(Z, 2, sqrt.gamma, '/') distances <- rowSums(Z^2) # From the inside out compute the robust location and # covariance matrix estimates. See equation (5). covmat <- diag(sqrt.gamma^2) for(iter in seq(n.iter, 1, -1)) { covmat <- A[[iter]] %*% covmat %*% t(A[[iter]]) center <- A[[iter]] %*% center } center <- as.vector(center) # Compute the reweighted estimate. First, compute the # weights using the user specified weight function. #weights <- weight.fn(distances, p, ...) weights <- hard.rejection(distances, p, beta=beta,...) sweights <- sum(weights) # Then compute the weighted location and covariance # matrix estimates. wcenter <- colSums(sweep(X, 1, weights, '*')) / sweights Z <- sweep(X, 2, wcenter) Z <- sweep(Z, 1, sqrt(weights), '*') wcovmat <- (t(Z) %*% Z) / sweights; list(center = center, covmat = covmat, wcenter = wcenter, wcovmat = wcovmat, distances = distances, sigmamu = deparse(substitute(sigmamu)), v = deparse(substitute(v)), data.name = data.name, data = X) } gk.sigmamu <- function(x, c1 = 4.5, c2 = 3.0, mu.too = FALSE, ...) { n <- length(x) medx <- median(x) sigma0 <- median(abs(x - medx)) # w <- (x - medx) / sigma0 # w <- (1.0 - (w / c1)^2)^2 #w[w < 0.0] <- 0.0 w <- abs(x - medx) / sigma0 w <- ifelse(w<=c1,(1.0 - (w / c1)^2)^2,0) mu <- sum(x * w) / sum(w) x <- (x - mu) / sigma0 rho <- x^2 rho[rho > c2^2] <- c2^2 sigma2 <- sigma0^2 / n * sum(rho) if(mu.too) c(mu, sqrt(sigma2)) else sqrt(sigma2) } gk <- function(x, y, ...) { ((gk.sigmamu(x + y, ...))^2 - (gk.sigmamu(x - y, ...))^2) / 4.0 } hard.rejection <- function(distances, p, beta = 0.9, ...) { d0 <- qchisq(beta, p) * median(distances) / qchisq(0.5, p) weights <- double(length(distances)) weights[distances <= d0] <- 1.0 weights } # # # outogk<-function(x,sigmamu=taulc,v=gkcov,op=T,SEED=F, beta=max(c(.95,min(c(.99,1/nrow(x)+.94)))),n.iter=1,plotit=T,...){ # # Use the ogk estimator to # determine which points are outliers # # op=T uses robust Mahalanobis distance based on # the OGK estimator with beta adjusted so that # the outside rate per observation is approximately .05 # under normality. # op=F returns the outliers based on the distances used # by the OGK estimator # (Currently, op=T seems best for detecting outliers.) # if(!is.matrix(x))stop("x should be a matrix") x<-elimna(x) if(!op){ temp<-ogk.pairwise(x,sigmamu=sigmamu,v=v,beta=beta,n.iter=n.iter,...) vals<-hard.rejection(temp$distances,p=ncol(x),beta=beta,...) 
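# hard.rejection gives weight 1 to points whose OGK distance is below the
# adjusted chi-squared cutoff; the lines that follow keep those points and
# report the remaining rows as outliers.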
flag<-(vals==1) vals<-c(1:nrow(x)) outid<-vals[!flag] keep<-vals[flag] if(is.matrix(x)){ if(ncol(x)==2 && plotit){ plot(x[,1],x[,2],xlab="X", ylab="Y",type="n") points(x[flag,1],x[flag,2]) if(sum(!flag)>0)points(x[!flag,1],x[!flag,2],pch="o") }}} if(op){ temp<-out(x,cov.fun=ogk,beta=beta,plotit=plotit,SEED=SEED) outid<-temp$out.id keep<-temp$keep } list(out.id=outid,keep=keep,distances=temp$dis) } splot<-function(x,op=T,VL=F,xlab="X",ylab="Rel. Freq."){ # # Frequency plot # x<-x[!is.na(x)] temp<-sort(unique(x)) freq<-NA for(i in 1:length(temp)){ freq[i]<-sum(x==temp[i]) } freq<-freq/length(x) tfreq<-freq tfreq[1]<-0 tfreq[2]<-max(freq) plot(temp,tfreq,xlab=xlab,ylab=ylab,type="n") points(temp,freq,pch="*") if(op) if(!VL)lines(temp,freq) if(VL){ for(i in 1:length(temp))lines(c(temp[i],temp[i]),c(0,freq[i])) } } outcov<-function(x,y=NA,outfun=outogk,plotit=F){ # # Remove outliers and compute covariances # if(!is.na(y[1]))x<-cbind(x,y) keep<-outfun(x,plotit=plotit)$keep val<-var(x[keep,]) if(ncol(val)==2)val<-val[1,2] list(cov=val) } covout<-function(x,y=NA,outfun=outogk,plotit=F){ # # Remove outliers and compute covariances # if(!is.na(y[1]))x<-cbind(x,y) keep<-outfun(x,plotit=plotit)$keep val<-var(x[keep,]) if(ncol(val)==2)val<-val[1,2] val } tbscor<-function(x,y=NA){ # # Compute a correlation coefficient using the TBS measure of scatter # if(!is.na(y[1]))x<-cbind(x,y) if(!is.matrix(x))stop("x should be a matrix") x<-elimna(x) n<-nrow(x) p<-ncol(x) temp<-tbs(x)$cov val<-matrix(NA,p,p) for(j in 1:p){ for(k in 1:p){ val[j,k]<-temp[k,j]/sqrt(temp[k,k]*temp[j,j]) }} test<-abs(val*sqrt((n-2)/(1-val^2))) if(p==2){ val<-val[1,2] p.value<-c("Greater than .1") crit<-20.20/n+1.89 if(test>=crit)p.value<-c("Less than .1") crit<-30.41/n+2.21 if(test>=crit)p.value<-c("Less than .05") crit<-39.72/n+2.5 if(test>=crit)p.value<-c("Less than .025") crit<-58.55/n+2.80 if(test>=crit)p.value<-c("Less than .01") } list(cor=val,test.stat=test,p.value=p.value) } skiptbs<-function(x,y=NA,plotit=F){ # # Remove outliers and compute correlations # if(!is.na(y[1]))x<-cbind(x,y) x<-elimna(x) n<-nrow(x) keep<-outtbs(x,plotit=plotit)$keep val<-cor(x[keep,]) p.value<-NA test<-NA crit.05<-30.41/n+2.21 vat<-val diag(vat)<-0 test<-abs(vat*sqrt((n-2)/(1-vat^2))) diag(test)<-NA if(ncol(val)==2){ p.value<-c("Greater than .1") val<-val[1,2] test<-abs(val*sqrt((n-2)/(1-val^2))) p.value<-c("Greater than .1") crit<-20.20/n+1.89 if(test>=crit)p.value<-c("Less than .1") crit<-30.41/n+2.21 if(test>=crit)p.value<-c("Less than .05") crit<-39.72/n+2.5 if(test>=crit)p.value<-c("Less than .025") crit<-58.55/n+2.80 if(test>=crit)p.value<-c("Less than .01") } list(cor=val,test.stat=test,p.value=p.value,crit.05=crit.05) } skipogk<-function(x,y=NA,plotit=F){ # # Remove outliers and compute correlations # if(!is.na(y[1]))x<-cbind(x,y) x<-elimna(x) n<-nrow(x) keep<-outogk(x,plotit=plotit)$keep val<-cor(x[keep,]) p.value<-NA test<-NA crit.05<-15.49/n+2.68 vat<-val diag(vat)<-0 test<-abs(vat*sqrt((n-2)/(1-vat^2))) diag(test)<-NA if(ncol(val)==2){ p.value<-c("Greater than .1") val<-val[1,2] test<-abs(val*sqrt((n-2)/(1-val^2))) crit<-4.8/n+2.72 if(test>=crit)p.value<-c("Less than .1") crit<-15.49/n+2.68 if(test>=crit)p.value<-c("Less than .05") crit<-14.22/n+3.26 if(test>=crit)p.value<-c("Less than .025") crit<-24.83/n+3.74 if(test>=crit)p.value<-c("Less than .01") } list(cor=val,test.stat=test,p.value=p.value,crit.05=crit.05) } rqfit<-function(x,y,qval=.5,alpha=.05,xout=F,outfun=out,res=T,...){ # # Do a quantile regression fit # 
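#
# Illustrative sketch (not run; requires the quantreg package, placeholder
# data):
# set.seed(1)
# x <- rnorm(50)
# y <- x + rnorm(50)
# rqfit(x, y, qval = .75)$coef   # coefficients of the .75 quantile line
#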
if(alpha!=.05)stop("This function only allows alpha=.05. Use qregci") library(quantreg) xx<-cbind(x,y) p<-ncol(xx)-1 xx<-elimna(xx) x<-xx[,1:p] y<-xx[,ncol(xx)] x=as.matrix(x) if(xout){ flag<-outfun(x,...)$keep x<-x[flag,] y<-y[flag] } residuals<-NA if(res)residuals<-rq(y~x)$residuals temp<-summary(rq(y~x,tau=qval,alpha=alpha)) temp0<-temp[[4]] if(is.matrix(temp[[3]]))temp0<-temp[[3]] #Newer R version temp<-temp0 coef<-temp[,1] ci<-temp[,2:3] list(coef=coef,ci=ci,residuals=residuals) } rqtest.sub<-function(isub,x,y,qval=.5){ # # Perform regression using x[isub] to predict y[isub] # isub is a vector of length n, # a bootstrap sample from the sequence of integers # 1, 2, 3, ..., n # # This function is used by other functions when computing # bootstrap estimates. # # x is assumed to be a matrix containing values of the predictors. # xmat<-matrix(x[isub,],nrow(x),ncol(x)) regboot<-rqfit(xmat,y[isub],qval=qval)$coef regboot } tbs <- function(x,eps=1e-3,maxiter=20,r=.45,alpha=.05){ # Rocke's contrained s-estimator # # r=.45 is the breakdown point # alpha=.05 is the asymptotic rejection probability. # if(!is.matrix(x))stop("x should be a matrix with two or more columns") x<-elimna(x) library(MASS) #temp<-cov.mve(x) temp<-cov.mcd(x) # The use of mcd is crucial; using mve results in # very poor outside rate per obs under normality. t1<-temp$center s<-temp$cov n <- nrow(x) p <- ncol(x) if(p==1)stop("x should be a matrix with two or more columns") c1M<-cgen.bt(n,p,r,alpha,asymp=FALSE) c1<-c1M$c1 if(c1==0)c1<-.001 #Otherwise get division by zero M<-c1M$M b0 <- erho.bt(p,c1,M) crit <- 100 iter <- 1 w1d <- rep(1,n) w2d <- w1d while ((crit > eps)&(iter <= maxiter)) { t.old <- t1 s.old <- s wt.old <- w1d v.old <- w2d d2 <- mahalanobis(x,center=t1,cov=s) d <- sqrt(d2) k <- ksolve.bt(d,p,c1,M,b0) d <- d/k w1d <- wt.bt(d,c1,M) w2d <- v.bt(d,c1,M) t1 <- (w1d %*% x)/sum(w1d) s <- s*0 for (i in 1:n) { xc <- as.vector(x[i,]-t1) s <- s + as.numeric(w1d[i])*(xc %o% xc) } s <- p*s/sum(w2d) mnorm <- sqrt(as.vector(t.old) %*% as.vector(t.old)) snorm <- eigen(s.old)$values[1] crit1 <- max(abs(t1 - t.old)) # crit <- max(crit1,crit2) crit <- max(abs(w1d-wt.old))/max(w1d) iter <- iter+1 } # mnorm <- sqrt(as.vector(t1) %*% as.vector(t1)) # snorm <- eigen(s)$values[1] # return(list(t1=t1,s=s)) list(center=t1,cov=s) } erho.bt <- function(p,c1,M) # expectation of rho(d) under chi-squared p return(chi.int(p,2,M)/2 +(M^2/2+c1*(5*c1+16*M)/30)*chi.int2(p,0,M+c1) +(M^2/2-M^2*(M^4-5*M^2*c1^2+15*c1^4)/(30*c1^4))*( chi.int(p,0,M+c1)-chi.int(p,0,M)) +(1/2+M^4/(2*c1^4)-M^2/c1^2)*(chi.int(p,2,M+c1)-chi.int(p,2,M)) +(4*M/(3*c1^2)-4*M^3/(3*c1^4))*(chi.int(p,3,M+c1)-chi.int(p,3,M)) +(3*M^2/(2*c1^4)-1/(2*c1^2))*(chi.int(p,4,M+c1)-chi.int(p,4,M)) -(4*M/(5*c1^4))*(chi.int(p,5,M+c1)-chi.int(p,5,M)) +(1/(6*c1^4))*(chi.int(p,6,M+c1)-chi.int(p,6,M))) chi.int <- function(p,a,c1) # partial expectation d in (0,c1) of d^a under chi-squared p return( exp(lgamma((p+a)/2)-lgamma(p/2))*2^{a/2}*pchisq(c1^2,p+a) ) chi.int2 <- function(p,a,c1) # partial expectation d in (c1,\infty) of d^a under chi-squared p return( exp(lgamma((p+a)/2)-lgamma(p/2))*2^{a/2}*(1-pchisq(c1^2,p+a))) cgen.bt <- function(n,p,r,alpha,asymp=FALSE){ # find constants c1 and M that gives a specified breakdown r # and rejection point alpha if (asymp == FALSE){if (r > (n-p)/(2*n) ) r <- (n-p)/(2*n)} # maximum achievable breakdown # # if rejection is not achievable, use c1=0 and best rejection # limvec <- rejpt.bt.lim(p,r) if (1-limvec[2] <= alpha) { c1 <- 0 M <- sqrt(qchisq(1-alpha,p)) 
} else { c1.plus.M <- sqrt(qchisq(1-alpha,p)) M <- sqrt(p) c1 <- c1.plus.M - M iter <- 1 crit <- 100 eps <- 1e-5 while ((crit > eps)&(iter<100)) { deps <- 1e-4 M.old <- M c1.old <- c1 er <- erho.bt(p,c1,M) fc <- er - r*(M^2/2+c1*(5*c1+16*M)/30) fcc1 <- (erho.bt(p,c1+deps,M)-er)/deps fcM <- (erho.bt(p,c1,M+deps)-er)/deps fcp <- fcM - fcc1 - r*(M-(5*c1+16*M)/30+c1*9/30) M <- M - fc/fcp if (M >= c1.plus.M ){M <- (M.old + c1.plus.M)/2} c1 <- c1.plus.M - M # if (M-c1 < 0) M <- c1.old+(M.old-c1.old)/2 crit <- abs(fc) iter <- iter+1 } } list(c1=c1,M=M,r1=r) } erho.bt.lim <- function(p,c1) # expectation of rho(d) under chi-squared p return(chi.int(p,2,c1)+c1^2*chi.int2(p,0,c1)) erho.bt.lim.p <- function(p,c1) # derivative of erho.bt.lim wrt c1 return(chi.int.p(p,2,c1)+c1^2*chi.int2.p(p,0,c1)+2*c1*chi.int2(p,0,c1)) rejpt.bt.lim <- function(p,r){ # find p-value of translated biweight limit c # that gives a specified breakdown c1 <- 2*p iter <- 1 crit <- 100 eps <- 1e-5 while ((crit > eps)&(iter<100)) { c1.old <- c1 fc <- erho.bt.lim(p,c1) - c1^2*r fcp <- erho.bt.lim.p(p,c1) - 2*c1*r c1 <- c1 - fc/fcp if (c1 < 0) c1 <- c1.old/2 crit <- abs(fc) iter <- iter+1 } return(c(c1,pchisq(c1^2,p),log10(1-pchisq(c1^2,p)))) } chi.int.p <- function(p,a,c1) return( exp(lgamma((p+a)/2)-lgamma(p/2))*2^{a/2}*dchisq(c1^2,p+a)*2*c1 ) chi.int2.p <- function(p,a,c1) return( -exp(lgamma((p+a)/2)-lgamma(p/2))*2^{a/2}*dchisq(c1^2,p+a)*2*c1 ) ksolve.bt <- function(d,p,c1,M,b0){ # find a constant k which satisfies the s-estimation constraint # for modified biweight k <- 1 iter <- 1 crit <- 100 eps <- 1e-5 while ((crit > eps)&(iter<100)) { k.old <- k fk <- mean(rho.bt(d/k,c1,M))-b0 fkp <- -mean(psi.bt(d/k,c1,M)*d/k^2) k <- k - fk/fkp if (k < k.old/2) k <- k.old/2 if (k > k.old*1.5) k <- k.old*1.5 crit <- abs(fk) iter <- iter+1 } return(k) } rho.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1*(x^2/2) +ivec2*(M^2/2+c1*(5*c1+16*M)/30) +(1-ivec1-ivec2)*(M^2/2-M^2*(M^4-5*M^2*c1^2+15*c1^4)/(30*c1^4) +(1/2+M^4/(2*c1^4)-M^2/c1^2)*x^2 +(4*M/(3*c1^2)-4*M^3/(3*c1^4))*x^3 +(3*M^2/(2*c1^4)-1/(2*c1^2))*x^4 -4*M*x^5/(5*c1^4)+x^6/(6*c1^4))) } psi.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1*x+(1-ivec1-ivec2)*x*(1-x1^2)^2) } psip.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1+(1-ivec1-ivec2)*((1-x1^2)^2+4*x*x1*(1-x1^2)/c1)) } wt.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1+(1-ivec1-ivec2)*(1-x1^2)^2) } v.bt <- function(x,c1,M) return(x*psi.bt(x,c1,M)) olstests1<-function(vstar,yhat,res,x){ ystar <- yhat + res * vstar p<-ncol(x) pp<-p+1 vals<-lsfit(x,ystar)$coef[2:pp] test<-sum(vals^2) test } kerreg<-function(x,y,pyhat=F,pts=NA,plotit=T,theta=50,phi=25,expand=.5, scale=F,zscale=F,eout=F,xout=F,outfun=out,np=100,xlab="X",ylab="Y", varfun=pbvar,e.pow=T,pr=T){ # # Compute local weighted regression with Epanechnikov kernel # # See Fan, Annals of Statistics, 1993, 21, 196-217. # cf. 
Bjerve and Doksum, Annals of Statistics, 1993, 21, 890-902 # # With a single predictor, this function calles locreg # See locreg for information about np and plotting # library(akima) x<-as.matrix(x) #library(akima) d<-ncol(x) np1<-d+1 m<-elimna(cbind(x,y)) if(xout && eout)stop("Can't have eout=xout=T") if(eout){ flag<-outfun(m,plotit=F)$keep m<-m[flag,] } if(xout){ flag<-outfun(x,plotit=F)$keep m<-m[flag,] } if(zscale){ for(j in 1:np1){ m[,j]<-(m[,j]-median(m[,j]))/mad(m[,j]) }} x<-m[,1:d] x<-as.matrix(x) y<-m[,np1] n<-nrow(x) if(d>1){ xrem<-x pi<-gamma(.5)^2 cd<-c(2,pi) if(d==2)A<-1.77 if(d==3)A<-2.78 if(d>2){ for(j in 3:d)cd[j]<-2*pi*cd[j-2]/j # p. 76 } if(d>3)A<-(8*d*(d+2)*(d+4)*(2*sqrt(pi))^d)/((2*d+1)*cd[d]) # p. 87 hval<-A*(1/n)^(1/(d+4)) # p. 86 for(j in 1:d){ sig<-sqrt(var(x[,j])) temp<-idealf(x[,j]) iqr<-(temp$qu-temp$ql)/1.34 A<-min(c(sig,iqr)) x[,j]<-x[,j]/A } xx<-cbind(rep(1,nrow(x)),x) yhat<-NA for(j in 1:n){ yhat[j]<-NA temp1<-t(t(x)-x[j,])/(hval) temp1<-temp1^2 temp1<-apply(temp1,1,FUN="sum") temp<-.5*(d+2)*(1-temp1)/cd[d] epan<-ifelse(temp1<1,temp,0) # Epanechnikov kernel, p. 76 chkit<-sum(epan!=0) if(chkit >= np1){ vals<-lsfit(x,y,wt=epan)$coef yhat[j]<-xx[j,]%*%vals }} if(plotit && d==2){ if(pr){ if(!scale){ print("scale=F is specified") print("If there is dependence, might use scale=T") }} #library(akima) m<-elimna(cbind(xrem,yhat)) xrem<-m[,1:d] yhat<-m[,np1] fitr<-yhat iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(xrem[i,]==xrem[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] mkeep<-xrem[iout>=1,] fit<-interp(mkeep[,1],mkeep[,2],fitr) persp(fit,theta=theta,phi=phi,expand=expand,xlab="x1",ylab="x2",zlab="", scale=scale) }} if(d==1){ yhat<-locreg(x[,1],y,pyhat=T,np=np,plotit=plotit,pts=pts, xlab=xlab,ylab=ylab) yhat2<-locreg(x[,1],y,pyhat=T,np=0,plotit=F) } if(d>1)yhat2<-yhat m<-NULL E.pow<-varfun(yhat2[!is.na(yhat2)])/varfun(y) if(pyhat)m<-yhat list(Explanatory.Power=E.pow,yhat=m) } attract<-function(X, Y, k = 5) { # Works in Splus but not in R. # For simple linear regression: plots k elemental starts and # their domains of attraction. Calls conc2. l1coef <- l1fit(X, Y)$coef X <- as.matrix(X) nr <- dim(X)[1] nc <- dim(X)[2] + 1 J <- 1:nc dom <- matrix(nrow = k, ncol = nc) par(mfrow = c(1, 2)) plot(X, Y) title("a) 5 Elemental Starts") for(i in 1:k) { ## get J J <- sample(nr, nc) ## get bJ, the elem fit if(abs(X[J[1]] - X[J[2]]) < 1/100000000) { slope <- 0 } else { slope <- (Y[J[1]] - Y[J[2]])/(X[J[1]] - X[J[2]]) } int <- Y[J[1]] - slope * X[J[1]] fit <- c(int, slope) yhat <- X %*% fit[2:nc] + fit[1] lines(X, yhat) ## get the domain of attraction for LTA concentration dom[i, ] <- conc2(X, Y, start = fit)$coef } plot(X, Y) for(i in 1:k) { fit <- dom[i, ] yhat <- X %*% fit[2:nc] + fit[1] lines(X, yhat) } title("b) The Corresponding Attractors") } bg2ci<-function(x, alpha = 0.05) { #gets BGse with middle n^0.8 cases for sample median and #the corresponding robust 100 (1-alpha)% CI. This is optimal #for estimating the SE but is not resistant. 
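#
# Example (not run): bg2ci(rnorm(100), alpha = 0.05) returns the robust CI
# (int), the sample median (med) and the Bloch-Gastwirth type standard
# error (se2) computed from the middle n^0.8 order statistics.
#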
n <- length(x)
up <- 1 - alpha/2
med <- median(x)
ln <- max(1,floor(n/2) - ceiling(0.5 * n^0.8))
un <- n - ln
rdf <- un - ln - 1
cut <- qt(up, rdf)
d <- sort(x)
se2 <- (d[un] - d[ln])/(2 * n^0.3)
rval <- cut * se2
rlo2 <- med - rval
rhi2 <- med + rval
#got low and high endpoints of robust CI
list(int = c(rlo2, rhi2), med = med, se2 = se2)
}
cav<-function(alpha = 0.01, k = 5) {
#gets n(asy var) for the alpha trimmed mean
#and T_(A,n)(k) if errors are Cauchy(0,1)
z <- tan(pi * (alpha - 0.5))
val <- (z - atan(z))/((1 - 2 * alpha) * atan(z))
ntmav <- val + (2 * alpha * (tan(pi * (alpha - 0.5)))^2)/(1 - 2 * alpha)^2
zj <- k
alphaj <- 0.5 + atan( - k)/pi
alphaj <- ceiling(100 * alphaj)/100
zj <- tan(pi * (alphaj - 0.5))
val <- (zj - atan(zj))/((1 - 2 * alphaj) * atan(zj))
natmav <- val + (2 * alphaj * (tan(pi * (alphaj - 0.5)))^2)/(1 - 2 * alphaj)^2
# multi-argument return() is an Splus idiom; return a list so this also runs in R
list(ntmav = ntmav, natmav = natmav)
}
cci<-function(x, alpha = 0.05) {
#gets classical 100 (1-alpha)% CI
#defaults are alpha = .05
n <- length(x)
up <- 1 - alpha/2
mn <- mean(x)
v <- var(x)
se <- sqrt(v/n)
val <- qt(up, n - 1) * se
lo <- mn - val
hi <- mn + val
list(int = c(lo, hi), mean = mn, se = se)
}
cgci<-function(x, alpha = 0.05, ks = 3.5) {
#gets T_S,n with a coarse grid
# and the corresponding robust 100 (1-alpha)% CI
n <- length(x)
up <- 1 - alpha/2
med <- median(x)
madd <- mad(x, constant = 1)
d <- sort(x)
##get robust T_S,n CI
lo <- sum(x < (med - ks * madd))
hi <- sum(x > (med + ks * madd))
tp <- max(hi, lo)/n
if(tp == 0) tp <- 0
if(tp > 0 && tp <= 0.01) tp <- 0.01
if(tp > 0.01 && tp <= 0.1) tp <- 0.1
if(tp > 0.1 && tp <= 0.25) tp <- 0.25
if(tp > 0.25 && tp <= 0.4) tp <- 0.4
if(tp > 0.4) tp <- 0.49
tstmn <- mean(x, trim = tp)
#have obtained the two stage trimmed mean
ln <- floor(n * tp)
un <- n - ln
if(ln > 0) {
d[1:ln] <- d[(ln + 1)]
d[(un + 1):n] <- d[un]
}
den <- ((un - ln)/n)^2
swv <- var(d)/den
#got the scaled Winsorized variance
rdf <- un - ln - 1
rval <- qt(up, rdf) * sqrt(swv/n)
tslo <- tstmn - rval
tshi <- tstmn + rval
##got low and high endpoints of robust T_S,n CI
list(int = c(tslo, tshi), tp = tp)
}
cisim <-function(n, runs = 500, type = 1, eps = 0.25, shift = 100, df = 1, kaa = 6, kss = 3.5) {
# simulates classical and 2 robust CI's for median as well as
# CI's based on T_A,n, T_S,n and the 25% trimmed mean.
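#
# Example (not run; the type codes are listed in the next comment, and the
# helper robci called below must also be sourced):
# cisim(n = 50, runs = 100, type = 1)   # returns coverages and sqrt(n)-scaled
#                                       # average CI lengths for each interval
#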
# type = 1: normal data, type = 2: contaminated normal data # type = 3: t(df) data, type = 4: double exponential # type = 5: exponential if(type == 1) x <- matrix(rnorm(n * runs), nrow = runs, ncol = n) if(type == 2) { x <- matrix(rnorm(n * runs), nrow = runs, ncol = n) x <- x + shift * matrix(rbinom(n * runs, 1, eps), nrow = runs, ncol = n) } if(type == 3) x <- matrix(rt(n * runs, df = df), nrow = runs, ncol = n) if(type == 4) { x <- matrix(rexp(n * runs), nrow = runs, ncol = n) w <- matrix(rbinom(n * runs, 1, 0.5), nrow = runs, ncol = n) w <- 2 * w - 1 x <- w * x } if(type == 5) x <- matrix(rexp(n * runs), nrow = runs, ncol = n) tcov <- 0 tlow <- 1:runs tup <- tlow tacov <- 0 talow <- tup taup <- tup tscov <- 0 tslow <- tup tsup <- tup bgcov <- 0 bglow <- tup bgup <- tup mcov <- 0 mlow <- tup mup <- tup trcov <- 0 trlow <- tup trup <- tup mexp <- log(2) for(i in 1:runs) { out <- robci(x[i, ], ka = kaa, ks = kss) tlow[i] <- out$tint[1] tup[i] <- out$tint[2] talow[i] <- out$taint[1] taup[i] <- out$taint[2] tslow[i] <- out$tsint[1] tsup[i] <- out$tsint[2] bglow[i] <- out$bgint[1] bgup[i] <- out$bgint[2] mlow[i] <- out$mint[1] mup[i] <- out$mint[2] trlow[i] <- out$trint[1] trup[i] <- out$trint[2] if(type == 5) { if(tlow[i] < 1 && tup[i] > 1) tcov <- tcov + 1 if(talow[i] < 0.89155 && taup[i ] > 0.89155) tacov <- tacov + 1 if(tslow[i] < 0.83071 && tsup[i ] > 0.83071) tscov <- tscov + 1 if(bglow[i] < mexp && bgup[i] > mexp) bgcov <- bgcov + 1 if(mlow[i] < mexp && mup[i] > mexp) mcov <- mcov + 1 if(trlow[i] < 0.73838 && trup[i ] > 0.73838) trcov <- trcov + 1 } else { if(tlow[i] < 0 && tup[i] > 0) tcov <- tcov + 1 if(talow[i] < 0 && taup[i] > 0 ) tacov <- tacov + 1 if(tslow[i] < 0 && tsup[i] > 0 ) tscov <- tscov + 1 if(bglow[i] < 0 && bgup[i] > 0 ) bgcov <- bgcov + 1 if(mlow[i] < 0 && mup[i] > 0) mcov <- mcov + 1 if(trlow[i] < 0 && trup[i] > 0 ) trcov <- trcov + 1 } } tcov <- tcov/runs tlen <- sqrt(n) * mean(tup - tlow) tacov <- tacov/runs talen <- sqrt(n) * mean(taup - talow) tscov <- tscov/runs tslen <- sqrt(n) * mean(tsup - tslow) bgcov <- bgcov/runs bglen <- sqrt(n) * mean(bgup - bglow) mcov <- mcov/runs mlen <- sqrt(n) * mean(mup - mlow) trcov <- trcov/runs trlen <- sqrt(n) * mean(trup - trlow) list(tcov = tcov, tlen = tlen, tacov = tacov, talen = talen, tscov = tscov, tslen = tslen, bgcov = bgcov, bglen = bglen, mcov = mcov, mlen = mlen, trcov = trcov, trlen = trlen) } cltv<- function(gam = 0.5) { # Gets asy var for lts(h) and lta(h)at Cauchy C(0,1) # where h/n -> gam. 
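#
# Example (not run): cltv(gam = 0.5) gives the asymptotic variances of the
# LTS(h) and LTA(h) estimators at the standard Cauchy model when h/n -> 0.5.
# Note that the Splus-style return(ltsv, ltav) below would need to be replaced
# by a list, e.g. list(ltsv = ltsv, ltav = ltav), to run under R.
#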
k <- tan((pi * gam)/2) num <- 2 * k - pi * gam den <- pi * (gam - (2 * k)/(pi * (1 + k^2)))^2 ltsv <- num/den num <- gam den <- 4 * (1/pi - 1/(pi * (1 + k^2)))^2 ltav <- num/den return(ltsv, ltav) } cmba2<- function(x, csteps = 5, ii = 1) { # gets the covmba estimator using 98, 95, 90, 80, 70, 60 and 50% trimming n <- dim(x)[1] p <- dim(x)[2] mds <- matrix(nrow = n, ncol = 8, 0) ##get the DGK estimator covs <- var(x) mns <- apply(x, 2, mean) cmd <- sqrt(mahalanobis(x, mns, covs)) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } mds[, 8] <- sqrt(mahalanobis(x, mns, covs)) covb <- covs mnb <- mns ##get the square root of det(covb) critb <- prod(diag(chol(covb))) ##get the resistant estimator covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) smd2 <- sort(md2) val <- p + 3 tem <- 1:7 tem[1] <- smd2[val + floor(0.02 * n)] tem[2] <- smd2[val + floor(0.05 * n)] tem[3] <- smd2[val + floor(0.1 * n)] tem[4] <- smd2[val + floor(0.2 * n)] tem[5] <- smd2[val + floor(0.3 * n)] tem[6] <- smd2[val + floor(0.4 * n)] tem[7] <- median(md2) medd2 <- tem[7] for(j in ii:7) { ## get the start val2 <- tem[j] mns <- apply(x[md2 <= val2, ], 2, mean) covs <- var(x[md2 <= val2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } mds[, j] <- sqrt(mahalanobis(x, mns, covs)) plot(cmd, mds[, j]) identify(cmd, mds[, j]) crit <- prod(diag(chol(covs))) if(crit < critb) { critb <- crit covb <- covs mnb <- mns } } pairs(mds) ##scale for better performance at MVN rd2 <- mahalanobis(x, mnb, covb) const <- median(rd2)/(qchisq(0.5, p)) covb <- const * covb list(center = mnb, cov = covb, mds = mds) } conc2<- function(x, y, start = l1fit(x, y)$coef) { #Finds that LTA attractor of the start. nc <- dim(x)[2] + 1 res <- y - (x %*% start[2:nc] + start[1]) ares <- abs(res) cov <- ceiling(length(y)/2) m <- sort(ares, partial = cov)[cov] old <- sum(ares[ares <= m]) new <- old - 1 ct <- 0 while(new < old) { ct <- ct + 1 start <- l1fit(x[ares <= m, ], y[ares <= m])$coef res <- y - (x %*% start[2:nc] + start[1 ]) ares <- abs(res) m <- sort(ares, partial = cov)[cov] new <- sum(ares[ares <= m]) #print(old) if(new < old) { old <- new new <- new - 1 } } list(coef = start, ct = ct) } concmv<- function(n = 100, csteps = 5, gam = 0.4, outliers = T, start = 2) { #Shows how concentration works when p = 2. 
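#
# Example (not run; the start options are described in the next comment):
# concmv(n = 100, csteps = 5, gam = 0.4, outliers = T, start = 2)
# Each concentration step replots the data with the currently retained half
# highlighted; left-click to identify points and right-click to advance.
#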
# Use start = 1 for DGK, start = 2 for MBA sphere, start = 3 for MBA MAD p <- 2 #A <- cbind(c(1, 0.9), c(0.9, 1)) x <- matrix(rnorm(n * p), ncol = p, nrow = n) #A <- diag(sqrt(1:p)) #if(outliers == T) { # val <- floor(gam * n) # tem <- 10 + 0 * 1:p # x[1:val, ] <- x[1:val, ] + tem #} #x <- x %*% A A <- cbind(c(1, 0.4), c(0.4, 1)) B <- cbind(c(0.5, 0), c(0, 0.5)) if(outliers == T) { val <- floor(gam * n) x[(val + 1):n, ] <- x[(val + 1):n, ] %*% A x[1:val, ] <- x[1:val, ] %*% B x[1:val, 1] <- x[1:val, 1] + 0 x[1:val, 2] <- x[1:val, 2] + 6 } else { x <- x %*% A } if(start == 1) { covs <- var(x) mns <- apply(x, 2, mean) } if(start == 2) { covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ## get the start mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } if(start >= 2) { tem <- apply(x, 2, mad)^2 covv <- diag(tem) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ## get the start mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) plot(x[, 1], x[, 2]) points(x[md2 <= medd2, 1], x[md2 <= medd2, 2], pch = 15) identify(x[, 1], x[, 2]) } } concsim<- function(n = 100, p = 2, steps = 5, gam = 0.4, runs = 20) { # This Splus function is used to determine when the DD # plot separates outliers from non-outliers for various starts. A <- sqrt(diag(1:p)) mbact <- 0 fmcdct <- 0 mbct <- 0 madct <- 0 dgkct <- 0 for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) ## outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T val <- floor(gam * n) tem <- 10 + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem x <- x %*% A #MBA out <- covmba(x, csteps = steps) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbact <- mbact + 1 #DGK covs <- var(x) mns <- apply(x, 2, mean) ## concentrate for(i in 1:steps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } rd2 <- mahalanobis(x, mns, covs) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) dgkct <- dgkct + 1 #Median Ball start covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ## get the start mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) ## concentrate for(i in 1:steps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } rd2 <- mahalanobis(x, mns, covs) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbct <- mbct + 1 #MAD start tem <- apply(x, 2, mad)^2 covv <- diag(tem) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ## get the start mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) ## concentrate for(i in 1:steps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } rd2 <- mahalanobis(x, mns, covs) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) madct <- madct + 1 #FMCD out <- cov.mcd(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) fmcdct <- fmcdct + 1 } list(mbact = mbact, fmcdct = fmcdct, dgkct = dgkct, mbct = mbct, madct = madct) } corrsim<- function(n = 100, p = 3, eps = 0.4, nruns = 100, type = 1) { 
#For R, first type "library(lqs)" before using this function # This function generates 100 n by p matrices x. # The output is the 100 sample correlations between the MDi and RDi # RDi uses covmba for type = 1, rmba for type = 2, cov.mcd for type = 3 # mahalanobis gives squared Maha distances corrs <- 1:nruns for(i in 1:nruns) { wt <- 0 * (1:n) x <- matrix(rnorm(n * p), ncol = p, nrow = n) #The following 3 commands make x elliptically contoured. #zu <- runif(n) #x[zu < eps,] <- x[zu < eps,]*5 #x <- x^2 # To make marginals of x lognormal, use #x <- exp(x) center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) if(type == 1) { out <- covmba(x) } if(type == 2) { out <- rmba(x) } if(type == 3) { out <- cov.mcd(x) } center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) # need square roots for the usual distances md <- sqrt(md2) rd <- sqrt(rd2) const <- sqrt(qchisq(0.5, p))/median(rd) rd <- const * rd # wt[rd < sqrt(qchisq(0.975, p))] <- 1 # corrs[i] <- cor(md[wt > 0], rd[wt > 0])} corrs[i] <- cor(md, rd) } cmean <- mean(corrs) cmin <- min(corrs) clt95 <- sum(corrs < 0.95) clt80 <- sum(corrs < 0.8) list(cmean = cmean, cmin = cmin, clt95 = clt95, clt80 = clt80, corrs = corrs) } covdgk<- function(x, csteps = 10) { #computes the scaled DGK multivariate estimator p <- dim(x)[2] covs <- var(x) mns <- apply(x, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } ##scale for consistency at MVN rd2 <- mahalanobis(x, mns, covs) const <- median(rd2)/(qchisq(0.5, p)) covs <- const * covs list(center = mns, cov = covs) } covmba <- function(x, csteps = 5) { # gets the MBA estimator zx <- x x <- as.matrix(x) p <- dim(x)[2] ##get the DGK estimator covs <- var(x) mns <- apply(x, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) if(p > 1){ mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } if(p == 1){ mns <- mean(x[md2 <= medd2]) covs <- var(x[md2 <= medd2]) } } covb <- covs mnb <- mns ##get the square root of det(covb) critb <- prod(diag(chol(covb))) ##get the resistant estimator covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ## get the start if(p > 1){ mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } if(p == 1){ mns <- mean(zx[md2 <= medd2]) covs <- var(zx[md2 <= medd2]) } ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) if(p > 1){ mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } if(p == 1){ mns <- mean(zx[md2 <= medd2]) covs <- var(zx[md2 <= medd2]) } } crit <- prod(diag(chol(covs))) if(crit < critb) { critb <- crit covb <- covs mnb <- mns } ##scale for better performance at MVN rd2 <- mahalanobis(x, mnb, covb) const <- median(rd2)/(qchisq(0.5, p)) covb <- const * covb list(center = mnb, cov = covb) } covmba2<- function(x, csteps = 5) { # gets the MBA estimator, use covmba2 instead of covmba if p > 1 p <- dim(x)[2] ##get the DGK estimator covs <- var(x) mns <- apply(x, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } covb <- covs mnb <- mns ##get the square root of det(covb) critb <- prod(diag(chol(covb))) ##get the resistant estimator covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, 
covv) medd2 <- median(md2) ## get the start mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } crit <- prod(diag(chol(covs))) if(crit < critb) { critb <- crit covb <- covs mnb <- mns } ##scale for better performance at MVN rd2 <- mahalanobis(x, mnb, covb) const <- median(rd2)/(qchisq(0.5, p)) covb <- const * covb list(center = mnb, cov = covb) } covsim2<- function(n=100, p = 2, steps = 5, gam = 0.4, runs = 20) { # This Splus function is used to determine when the DD # plot separates outliers from non-outliers. A <- sqrt(diag(1:p)) mbact <- 0 for(i in 1:runs) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) ## outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T val <- floor(gam * n) tem <- 10 + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem x <- x %*% A out <- covmba(x, csteps = steps) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) if(min(rd2[1:val]) > max(rd2[(val + 1):n])) mbact <- mbact + 1 } list(mbact = mbact) } ctrviews<- function(x, Y, ii = 1) { # Uses classical distances instead of robust distances. # Trimmed views for 90, 80, ... 0 percent # trimming. Allows visualization of m # and crude estimatation of c beta in models # of the form y = m(x^T beta) + e. # Workstation: activate a graphics # device with command "X11()" or "motif()." # R needs command "library(lqs)." # Advance the view with the right mouse button. # In R, highight "stop." x <- as.matrix(x) center <- apply(x, 2, mean) cov <- var(x) rd2 <- mahalanobis(x, center, cov) labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") tem <- seq(0.1, 1, 0.1) for(i in ii:10) { val <- quantile(rd2, tem[i]) bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef ESP <- x %*% bhat[-1] plot(ESP, Y) title(labs[i]) identify(ESP, Y) print(bhat) } } ddcomp<- function(x, steps = 5) { # Makes 4 DD plots using the FMCD and MBA estimators. # Click left mouse button to identify points. # Click right mouse button to end the function. # Unix systems turn on graphics device eg enter # command "X11()" or "motif()" before using. # R users need to type "library(lqs)" before using. p <- dim(x)[2] par(mfrow = c(2, 2)) center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) # MD is the classical and RD the robust distance MD <- sqrt(md2) #DGK start md2 <- mahalanobis(x, center, cov) medd2 <- median(md2) ## get the start mns <- center covs <- cov ## concentrate for(i in 1:steps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } rd2 <- mahalanobis(x, mns, covs) rd <- sqrt(rd2) #Scale the RD so the plot follows the 0-1 line #if the data is multivariate normal. const <- sqrt(qchisq(0.5, p))/median(rd) RDdgk <- const * rd plot(MD, RDdgk) abline(0, 1) identify(MD, RDdgk) title("DGK DD Plot") #MBA out <- covmba(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) rd <- sqrt(rd2) #Scale the RD so the plot follows the identity line #if the data is multivariate normal. const <- sqrt(qchisq(0.5, p))/median(rd) RDm <- const * rd plot(MD, RDm) abline(0, 1) identify(MD, RDm) title("MBA DD Plot") #FMCD out <- cov.mcd(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) rd <- sqrt(rd2) #Scale the RD so the plot follows the 0-1 line #if the data is multivariate normal. 
const <- sqrt(qchisq(0.5, p))/median(rd) RDf <- const * rd plot(MD, RDf) abline(0, 1) identify(MD, RDf) title("FMCD DD Plot") #Median Ball start covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ## get the start mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) ## concentrate for(i in 1:steps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } rd2 <- mahalanobis(x, mns, covs) rd <- sqrt(rd2) #Scale the RD so the plot follows the 0-1 line #if the data is multivariate normal. const <- sqrt(qchisq(0.5, p))/median(rd) RDmb <- const * rd plot(MD, RDmb) abline(0, 1) identify(MD, RDmb) title("Med Ball DD Plot") } ddmv<- function(n = 100, p = 2, steps = 5, gam = 0.4, outtype = 2, est = 1) { # This Splus function is used to determine when the DD # plot separates outliers from non-outliers for various starts. # Workstation needs to activate a graphics # device with the command "X11()" or "motif()." # Advance the view with the right mouse button. ## est = 1 for DGK, 2 for median ball, 3 for MAD A <- sqrt(diag(1:p)) x <- matrix(rnorm(n * p), ncol = p, nrow = n) val <- floor(gam * n) tem <- 10 + 0 * 1:p x[1:val, ] <- x[1:val, ] + tem #if outtype = 1, outliers are Np(10 1, Ip) nonoutliers Np(0,Ip) if(outtype == 2) x <- x %*% A ## outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T ## get the start if(est == 1) { #DGK classical start covs <- var(x) mns <- apply(x, 2, mean) } if(est == 2) { #Median Ball high breakdown start covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ## get the start mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } if(est == 3) { #MAD high breakdown start tem <- apply(x, 2, mad)^2 covv <- diag(tem) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ## get the start mns <- apply(x[md2 <= medd2, ], 2, mean) covs <- var(x[md2 <= medd2, ]) } ## concentrate and plot, highlighting outliers MD <- sqrt(mahalanobis(x, mns, covs)) for(i in 1:steps) { md <- sqrt(mahalanobis(x, mns, covs)) medd <- median(md) mns <- apply(x[md <= medd, ], 2, mean) covs <- var(x[md <= medd, ]) rd <- sqrt(mahalanobis(x, mns, covs)) plot(MD, rd) points(MD[1:val], rd[1:val], pch = 15) identify(MD, rd) } } ddplot<- function(x) { # Makes a DD plot. cov.mcd is used for the RDi. # Click left mouse button to identify points. # Click right mouse button to end the function. # Unix systems turn on graphics device eg enter # command "X11()" or "motif()" before using. # R users need to type "library(lqs)" before using. p <- dim(x)[2] center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) out <- cov.mcd(x) # or use out <- cov.mve(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) # md is the classical and rd the robust distance MD <- sqrt(md2) rd <- sqrt(rd2) #Scale the RD so the plot follows the 0-1 line #if the data is multivariate normal. const <- sqrt(qchisq(0.5, p))/median(rd) RD <- const * rd plot(MD, RD) abline(0, 1) identify(MD, RD) # list(MD = MD, RD = RD) } ddsim<- function(n = 100, p = 3, eps = 0.4, type = 1) { # R: type "library(lqs)" before using if type = 3. # Rapidly plots 20 DD plots in a row. # Unix: type "X11()" or "motif()" to # turn on a graphics device. 
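#
# Example (not run; the type codes are described in the next comment):
# ddsim(n = 100, p = 3, type = 1)   # draws 20 DD plots and returns the
#                                   # median classical distance from each
#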
# RDi uses covmba for type = 1, rmba for type = 2, cov.mcd for type = 3 med <- 1:20 for(i in 1:20) { x <- matrix(rnorm(n * p), ncol = p, nrow = n) ## For elliptically contoured data, use: #zu <- runif(n) #x[zu < eps,] <- x[zu < eps,]*5 #x <- x^2 ##For lognormal marginals, add: #x <- exp(x) center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) if(type == 1) { out <- covmba(x) } if(type == 2) { out <- rmba(x) } if(type == 3) { out <- cov.mcd(x) } center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) md <- sqrt(md2) rd <- sqrt(rd2) #Scale the RDi so plot follows 0-1 line #if the data is multivariate normal. const <- sqrt(qchisq(0.5, p))/median(rd) rd <- const * rd plot(md, rd) abline(0, 1) med[i] <- median(md) #The following command can be inserted #to slow down the plots "identify(md,rd)" } list(med = med) } deav<- function(alpha = 0.01, k = 5) { #gets n(asy var) for the alpha trimmed mean #and T_(A,n)(k) if errors are DE(0,1) z <- - log(2 * alpha) num <- 2 - (2 + 2 * z + z^2) * exp( - z) den <- (1 - exp( - z)) * (1 - 2 * alpha) val1 <- num/den num <- 2 * alpha * z^2 den <- (1 - 2 * alpha)^2 ntmav <- val1 + num/den zj <- k * log(2) alphaj <- 0.5 * exp( - zj) alphaj <- ceiling(100 * alphaj)/100 zj <- - log(2 * alphaj) num <- 2 - (2 + 2 * zj + zj^2) * exp( - zj) den <- (1 - exp( - zj)) * (1 - 2 * alphaj) val1 <- num/den num <- 2 * alphaj * zj^2 den <- (1 - 2 * alphaj)^2 natmav <- val1 + num/den return(ntmav, natmav) } deltv<- function(gam = 0.5) { # Gets asy var for lts(h) and lta(h) at standard double exp # where h/n -> gam. k <- -1 * log(1 - gam) num <- 2 - (2 + 2 * k + k^2) * exp( - k) den <- (gam - k * exp( - k))^2 ltsv <- num/den ltav <- 1/gam return(ltsv, ltav) } diagplot<- function(x, Y) { # Scatterplot matrix of OLS diagnostics. # Workstation need to activate a graphics # device with command "X11()" or "motif()." n <- length(Y) rmat <- matrix(nrow = n, ncol = 7) out <- lsfit(x, Y) tem <- ls.diag(out) rmat[, 1] <- tem$cooks rmat[, 2] <- tem$hat rmat[, 3] <- tem$std.res rmat[, 4] <- tem$stud.res rmat[, 5] <- tem$dfits rmat[, 6] <- Y - out$resid rmat[, 7] <- Y pairs(rmat, labels = c("Cook's CD", "leverages", "stand resid", "stud resid", "DFFITS", "YHAT", "Y")) } ellipse<- function(x, center = apply(x, 2, mean), cov = var(x), alph = 0.95) {# Makes a covering interval. The x should have 2 columns. mu1 <- center[1] mu2 <- center[2] w <- solve(cov) w11 <- w[1, 1] w12 <- w[1, 2] w22 <- w[2, 2] tem <- x[, 2] - mu2 y2 <- seq(min(tem), max(tem), length = 100) xc <- qchisq(alph, 2) el <- matrix(0, 2, 2) ind <- 0 for(i in 1:100) { j1 <- (y2[i] * w12)^2 j2 <- w11 * ((y2[i])^2 * w22 - xc) # print(i) # print(j1 - j2) if((j1 - j2) >= 0) { ind <- ind + 2 tem <- (y2[i] * w12)^2 tem <- tem - w11 * ((y2[i])^2 * w22 - xc) tem <- sqrt(tem) term <- ( - y2[i] * w12 + tem)/ w11 el <- rbind(el, c((term + mu1), ( y2[i] + mu2))) term <- ( - y2[i] * w12 - tem)/ w11 el <- rbind(el, c((term + mu1), ( y2[i] + mu2))) } } el <- el[3:ind, ] nn <- dim(x)[1] if((ind - 2) > nn) { tem <- sample((ind - 2), nn) el <- el[tem, ] } xt <- cbind(x[, 1], el[, 1]) yt <- cbind(x[, 2], el[, 2]) matplot(xt, yt) } essp<- function(x, Y, M = 50) { # Trimmed view or ESSP for M percent # trimming. Allows visualization of g # and crude estimation of c beta in models # of the form y = g(x^T beta,e). # Workstation need to activate a graphics # device with command "X11()" or "motif()." # R needs command "library(lqs)." # Click on the right mouse button to finish. # In R, highlight "stop." 
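#
# Illustrative sketch (not run; placeholder single-index data, and cov.mcd
# from the lqs/MASS package must be available):
# set.seed(1)
# x <- matrix(rnorm(300), ncol = 3)
# y <- (x %*% c(1, 2, 3))^2 + rnorm(100)
# bhat <- essp(x, y, M = 50)   # plots Y against the estimated sufficient
#                              # predictor and returns the slopes of the
#                              # 50% trimmed LS fit
#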
x <- as.matrix(x) tval <- M/100 out <- cov.mcd(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) val <- quantile(rd2, (1 - tval)) bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$ coef ESP <- x %*% bhat[-1] plot(ESP, Y) identify(ESP, Y) return(bhat[-1]) } ffL<- function(x, y) { # for unix, use X11() to turn on the graphics device before using this function # this function makes a FF lambda plot where the competing models are Y^L n <- length(y) rmat <- matrix(nrow = n, ncol = 5) rmat[, 1] <- y - lsfit(x, y)$resid ytem <- (y^(0.5) - 1)/0.5 rmat[, 2] <- ytem - lsfit(x, ytem)$resid rmat[, 3] <- log(y) - lsfit(x, log(y))$resid ytem <- (y^(-0.5) - 1)/-0.5 rmat[, 4] <- ytem - lsfit(x, ytem)$resid ytem <- (y^(-1) - 1)/-1 rmat[, 5] <- ytem - lsfit(x, ytem)$resid pairs(rmat, labels = c("YHAT", "YHAT^(0.5)", "YHAT^(0)", "YHAT^(-0.5)", "YHAT^(-1)")) min(cor(rmat)) } fflynx<-function(){ # R users need to type library(ts) and data(lynx) Y <- log10(lynx) FAR2 <- 1:114 FAR11 <- 1:114 FAR12 <- 1:114 SETAR272 <- 1:114 SETAR252 <- 1:114 for(i in 3:114){ FAR2[i ] <- 1.05 + 1.41*Y[i-1] -0.77*Y[i-2]} for(i in 12:114){ FAR11[i ] <- 1.13*Y[i-1] -0.51*Y[i-2] + .23*Y[i-3] -0.29*Y[i-4] + .14*Y[i-5] -0.14*Y[i-6] + 0.08*Y[i-7] -0.04*Y[i-8] + .13*Y[i-9] + 0.19*Y[i-10] - .31*Y[i-11] } for(i in 13:114){ FAR12[i ] <- 1.123 + 1.084*Y[i-1] -0.477*Y[i-2] + .265*Y[i-3] -0.218*Y[i-4] + .180*Y[i-9] - .224*Y[i-12] } for(i in 13:114){ if( Y[i-2] <= 3.116){ SETAR272[i ] <- 0.546 + 1.032*Y[i-1] -0.173*Y[i-2] + .171*Y[i-3] -0.431*Y[i-4] + .332*Y[i-5] - .284*Y[i-6] + .210*Y[i-7]} else {SETAR272[i ] <- 2.632 + 1.492*Y[i-1] -1.324*Y[i-2]} } for(i in 13:114){ if( Y[i-2] <= 3.05){ SETAR252[i ] <- 0.768 + 1.064*Y[i-1] -0.200*Y[i-2] + .164*Y[i-3] -0.428*Y[i-4] + .181*Y[i-5] } else {SETAR252[i ] <- 2.254 + 1.474*Y[i-1] -1.202*Y[i-2]} } x <- cbind(Y,FAR2,FAR11,FAR12,SETAR272,SETAR252) x <- x[13:114,] print(cor(x)) pairs(x) } ffplot<- function(x, y, nsamps = 7) { # For Unix, use X11() to turn on the graphics device before # using this function. For R, first type library(lqs). # Makes an FF plot with several resistant estimators. # Need the program mbareg.. n <- length(y) rmat <- matrix(nrow = n, ncol = 6) lsfit <- y - lsfit(x, y)$residuals print("got OLS") l1fit <- y - l1fit(x, y)$residuals print("got L1") almsfit <- y - lmsreg(x, y)$resid print("got ALMS") altsfit <- y - ltsreg(x, y)$residuals print("got ALTS") mbacoef <- mbareg(x, y, nsamp = nsamps)$coef MBAFIT <- mbacoef[1] + x %*% mbacoef[-1] print("got MBA") rmat[, 1] <- y rmat[, 2] <- lsfit rmat[, 3] <- l1fit rmat[, 4] <- almsfit rmat[, 5] <- altsfit rmat[, 6] <- MBAFIT pairs(rmat, labels = c("Y", "OLS Fit", "L1 Fit", "ALMS Fit", "ALTS Fit", "MBAREG Fit")) } ffplot2<- function(x, y, nsamps = 7) { # For Unix, use X11() to turn on the graphics device before # using this function. For R, first type library(lqs). # Makes an FF plot with several resistiant estimators. # Need the program mbareg. 
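#
# Illustrative sketch (not run; mbareg and the lqs/MASS fitters lmsreg and
# ltsreg must be available; placeholder data):
# set.seed(1)
# x <- matrix(rnorm(200), ncol = 2)
# y <- x %*% c(1, -1) + rnorm(100)
# ffplot2(x, y)   # scatterplot matrix of Y against the OLS, ALMS, ALTS and
#                 # MBAREG fitted values
#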
n <- length(y) rmat <- matrix(nrow = n, ncol = 5) lsfit <- y - lsfit(x, y)$residuals print("got OLS") almsfit <- y - lmsreg(x, y)$resid print("got ALMS") altsfit <- y - ltsreg(x, y)$residuals print("got ALTS") mbacoef <- mbareg(x, y, nsamp = nsamps)$coef MBAFIT <- mbacoef[1] + x %*% mbacoef[-1] print("got MBA") rmat[, 1] <- y rmat[, 2] <- lsfit rmat[, 3] <- almsfit rmat[, 4] <- altsfit rmat[, 5] <- MBAFIT pairs(rmat, labels = c("Y", "OLS Fit", "ALMS Fit", "ALTS Fit", "MBAREG Fit")) } fysim<-function( runs = 20) { # 20 FY plots for simulated AR(2) time series data fycorr <- 1:runs for(i in 1: runs){ Y <- ardata()$arts out <- ar.yw(Y) Yts <- Y[10:200] FIT <- Yts - out$resid[10:200] plot(FIT,Yts) abline(0,1) fycorr[i] <- cor(FIT,Yts) } list(fycorr=fycorr) } gamper<- function(h, k=500) { n <- 10000 c <- 5000 gam0 <- min((n - c)/n, (1 - (1 - 0.2^(1/k))^(1/ h))) * 100 print(gam0) } gamper2<- function(p, k = 500) { ##estimates the amount of contamination fmcd can tolerate n <- 10000 c <- 5000 h <- p + 1 gam0 <- min((n - c)/n, (1 - (1 - 0.2^(1/k))^(1/h))) * 100 print(gam0) } llrdata <- function(n = 100, q=5) { # Generates data for loglinear regression. # y <- 0 * 1:n beta <- 0 * 1:q beta[1:3] <- 1 alpha <- -2.5 x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- 0.5*x + 1 SP <- alpha + x%*%beta y <- rpois(n,lambda=exp(SP)) list(x=x,y=y) } llressp <- function(x,y) { # Makes the ESSP for loglinear regression. # Workstation: need to activate a graphics # device with command "X11()" or "motif()." # # If q is changed, change the formula in the glm statement. q <- 5 # change formula to x[,1]+ ... + x[,q] with q out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[,5], family = poisson) ESP <- x %*% out$coef[-1] + out$coef[1] Y <- y plot(ESP,Y) abline(mean(y),0) fit <- y fit <- exp(ESP) indx <- sort.list(ESP) lines(ESP[indx],fit[indx]) lines(lowess(ESP,y),type="s") } llrplot<- function(x, y) { # Makes ESSP, the weighted forward response and residual plots for loglinear regression. # Workstation: need to activate a graphics # device with command "X11()" or "motif()." # # If q is changed, change the formula in the glm statement. q <- 5 # change formula to x[,1]+ ... + x[,q] with q out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[, 5], family = poisson) ESP <- x %*% out$coef[-1] + out$coef[1] Y <- y par(mfrow = c(2, 2)) plot(ESP, Y) abline(mean(y), 0) Ehat <- exp(ESP) indx <- sort.list(ESP) lines(ESP[indx], Ehat[indx]) lines(lowess(ESP, y), type = "s") title("a) ESSP") Vhat <- (y - Ehat)^2 plot(Ehat, Vhat) abline(0, 1) #abline(lsfit(Ehat, Vhat)$coef) title("b)") Z <- y Z[y < 1] <- Z[y < 1] + 0.5 MWRES <- sqrt(Z) * (log(Z) - x %*% out$coef[-1] - out$coef[1]) MWFIT <- sqrt(Z) * log(Z) - MWRES plot(MWFIT, sqrt(Z) * log(Z)) abline(0, 1) #abline(lsfit(MWFIT, sqrt(Z) * log(Z))$coef) title("c) WFRP Based on MLE") plot(MWFIT, MWRES) title("d) WRP Based on MLE") } llrsim<- function(n = 100, nruns = 1, type = 1) { # Runs llrpot 10 times on simulated LLR. # Type = 1 for Poisson data, Type = 2 for negative binomial data # Calls llrdata, oddata, llrplot. q <- 5 for(i in 1:nruns) { if(type == 1) out <- llrdata(n, q) else out <- oddata(n, q) x <- out$x y <- out$y llrplot(x, y) #identify(MWFIT, MWRES) } } llrwtfrp <- function(x,y) { # Makes the weighted forward response and residual plots for loglinear regression. # Workstation: need to activate a graphics # device with command "X11()" or "motif()." # # If q is changed, change the formula in the glm statement. q <- 5 # change formula to x[,1]+ ... 
+ x[,q] with q out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[,5], family = poisson) ESP <- x %*% out$coef[-1] + out$coef[1] Z <- y Z[y<1] <- Z[y<1] + 0.5 out2<-lsfit(x,y=log(Z),wt=Z) #WRES <- sqrt(Z)*(log(Z) - x%*%out2$coef[-1] - out2$coef[1]) WRES <- out2$res WFIT <- sqrt(Z)*log(Z) - WRES MWRES <- sqrt(Z)*(log(Z) - x%*%out$coef[-1] - out$coef[1]) MWFIT <- sqrt(Z)*log(Z) - MWRES par(mfrow=c(2,2)) plot(WFIT,sqrt(Z)*log(Z)) abline(0,1) title("a) Weighted Forward Response Plot") plot(WFIT,WRES) title("b) Weighted Residual Plot") plot(MWFIT,sqrt(Z)*log(Z)) abline(0,1) title("c) WFRP Based on MLE") plot(MWFIT,MWRES) title("d) WRP Based on MLE") } lmsviews<- function(x, Y, ii = 1) { # Trimmed views using lmsreg for 90, 80, ... 0 percent # trimming. Allows visualization of m # and crudely estimation of c beta in models # of the form y = m(x^T beta) + e. # Workstation: activate a graphics device # with commands "X11()" or "motif()." # R needs command "library(lqs)." # Advance the view with the right mouse button and # in R, highight "stop." x <- as.matrix(x) out <- cov.mcd(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") tem <- seq(0.1, 1, 0.1) for(i in ii:10) { val <- quantile(rd2, tem[i]) b <- lmsreg(x[rd2 <= val, ], Y[rd2 <= val])$coef ESP <- x %*% b[-1] plot(ESP, Y) title(labs[i]) identify(ESP, Y) print(b) } } lrdata <- function(n = 200, type = 3) { # Generates data for logistic regression. # If X|y=1 ~ N(mu_1,I) and X|Y=0 ~ N(0,I) then beta = mu_1 and alpha = -0.5 ||mu_1||^2. # # If q is changed, change the formula in the glm statement. q <- 5 y <- 0 * 1:n y[(n/2 + 1):n] <- y[(n/2 + 1):n] + 1 beta <- 0 * 1:q if(type == 1) { beta[1] <- 1 alpha <- -0.5 } if(type == 2) { beta <- beta + 1 alpha <- -q/2 } if(type == 3) { beta[1:3] <- 1 alpha <- -1.5 } x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { x[(n/2 + 1):n, 1] <- x[(n/2 + 1 ):n, 1] + 1 } if(type == 2) { x[(n/2 + 1):n, ] <- x[(n/2 + 1 ):n, ] + 1 } if(type == 3) { x[(n/2 + 1):n, 1:3 ] <- x[(n/2 + 1 ):n, 1:3 ] + 1 } #X|y=0 ~ N(0, I) and X|y=1 ~ N(beta,I) # change formula to x[,1]+ ... + x[,q] with q out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[,5], family = binomial) list(alpha = alpha, beta = beta, lrcoef = out$coef,x=x,y=y) } lressp <- function(x,y,slices=10) { # Makes the ESSP for logistic regression. # If X|y=1 ~ N(mu_1,I) and X|Y=0 ~ N(0,I) then beta = mu_1 and alpha = ||mu_1||^2. # Workstation need to activate a graphics # device with command "X11()" or "motif()." # R needs command "library(lqs)." # Advance the view with the right mouse button. # In R, highlight "stop." # # If q is changed, change the formula in the glm statement. q <- 5 # change formula to x[,1]+ ... + x[,q] with q out <- glm(y ~ x[, 1] + x[, 2] + x[, 3] + x[, 4] + x[,5], family = binomial) ESP <- x %*% out$coef[-1] + out$coef[1] Y <- y plot(ESP,Y) abline(mean(y),0) fit <- y fit <- exp(ESP)/(1 + exp(ESP)) # lines(sort(ESP),sort(fit)) indx <- sort.list(ESP) lines(ESP[indx],fit[indx]) fit2 <- fit n <- length(y) val <- as.integer(n/slices) for(i in 1: (slices-1)){ fit2[((i-1)*val+1):(i*val)] <- mean(y[indx[((i-1)*val+1):(i*val)]]) } fit2[((slices-1)*val+1):n] <- mean(y[indx[((slices-1)*val+1):n]]) # fit2 is already sorted in order corresponding to indx lines(ESP[indx],fit2) #list(fit2=fit2,n=n,slices=slices,val=val) } lsviews<- function(x, Y, ii = 1) { # This function is the same as tvreg except that the untrimmed # cases are highlighted. 
It compares the LS fits for 90, 80, # ..., 0 percent trimming. Used to visualize g if y = g(beta^T x,e). # Workstation: activate a graphics # device with command "X11()" or "motif()." # R needs command "library(lqs)." # Advance the view with the right mouse button. # In R, highlight ``stop." x <- as.matrix(x) out <- cov.mcd(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") tem <- seq(0.1, 1, 0.1) for(i in ii:10) { val <- quantile(rd2, tem[i]) bhat <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef ESP <- bhat[1] + x %*% bhat[-1] plot(ESP, Y) points(ESP[rd2 <= val], Y[rd2 <= val], pch = 15, cex = 1.4) abline(0, 1) title(labs[i]) identify(ESP, Y) print(bhat) } } maha<- function(x) { # Generates the classical mahalanobis distances. center <- apply(x, 2, mean) cov <- var(x) return(sqrt(mahalanobis(x, center, cov))) } mbalata<- function(x, y, k=6, nsamp = 7) { #gets the median ball fit with 7 centers, med resid crit, 7 ball sizes x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] # q + 1 is number of predictors including intercept vals <- c(q + 3 + floor(n/100), q + 3 + floor(n/40), q + 3 + floor(n/20), q + 3 + floor(n/10), q + 3 + floor(n/5), q + 3 + floor(n/3), q + 3 + floor(n/2)) covv <- diag(q) centers <- sample(n, nsamp) temp <- lsfit(x, y) mbaf <- temp$coef ## get LATA criterion res <- temp$residuals crit <- k^2*median(res^2) cn <- sum(res^2 <= crit) absres <- sort(abs(res)) critf <- sum(absres[1:cn]) ## for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) smd2 <- sort(md2) for(j in 1:7) { temp <- lsfit(x[md2 <= smd2[vals[j]], ], y[md2 <= smd2[vals[j]]]) #Use OLS on rows with md2 <= cutoff = smd2[vals[j]] res <- y - temp$coef[1] - x %*% temp$coef[-1] ## get LATA criterion crit <- k^2*median(res^2) cn <- sum(res^2 <= crit) absres <- sort(abs(res)) crit <- sum(absres[1:cn]) ## if(crit < critf) { critf <- crit mbaf <- temp$coef } } } list(coef = mbaf, critf = critf) } mbamv<- function(x, y, nsamp = 7) { # This function is for simple linear regression. The # highlighted boxes get weight 1. Click on right # mouse button to advance plot. Only uses 50% trimming. x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] covv <- diag(q) centers <- sample(n, nsamp) for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) med <- median(md2) plot(x, y) points(x[md2 < med], y[md2 < med], pch = 15) abline(lsfit(x[md2 < med],y[md2 < med])) identify(x, y) } } mbamv2<- function(x, Y, nsamp = 7) { # This function is for multiple linear regression. The # highlighted boxes get weight 1. Click on right # mouse button to advance plot. Only uses 50% trimming. 
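# Usage sketch (added for illustration; the data below are simulated and the calls
# are hypothetical):
#   set.seed(1)
#   x <- matrix(rnorm(200), ncol = 2)
#   Y <- 1 + x %*% c(1, -1) + rnorm(100)
#   mbamv2(x, Y)   # advance each of the nsamp pairs of plots with the mouse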
x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] covv <- diag(q) centers <- sample(n, nsamp) for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) med <- median(md2) if(q ==1){out <- lsfit(x[md2 < med],Y[md2 < med])} else{out <- lsfit(x[md2 < med,],Y[md2 < med])} FIT <- out$coef[1] + x%*%out$coef[-1] RES <- Y - FIT par(mfrow=c(2,1)) plot(FIT,Y) points(FIT[md2 < med], Y[md2 < med], pch = 15) abline(0,1) identify(FIT, Y) plot(FIT,RES) points(FIT[md2 < med], RES[md2 < med], pch = 15) abline(0,0) identify(FIT, RES) } } mbareg<- function(x, y, nsamp = 7) { #gets the mbareg fit with 7 centers, med resid crit, 7 ball sizes x <- as.matrix(x) n <- dim(x)[1] q <- dim(x)[2] # q + 1 is number of predictors including intercept vals <- c(q + 3 + floor(n/100), q + 3 + floor(n/40), q + 3 + floor(n/20 ), q + 3 + floor(n/10), q + 3 + floor(n/5), q + 3 + floor(n/3), q + 3 + floor(n/2)) covv <- diag(q) centers <- sample(n, nsamp) temp <- lsfit(x, y) mbaf <- temp$coef critf <- median(temp$residuals^2) for(i in 1:nsamp) { md2 <- mahalanobis(x, center = x[centers[i], ], covv) smd2 <- sort(md2) for(j in 1:7) { temp <- lsfit(x[md2 <= smd2[vals[j]], ], y[md2 <= smd2[ vals[j]]]) #Use OLS on rows with md2 <= cutoff = smd2[vals[j]] res <- y - temp$coef[1] - x %*% temp$coef[-1] crit <- median(res^2) if(crit < critf) { critf <- crit mbaf <- temp$coef } } } list(coef = mbaf, critf = critf) } med2ci<- function(x, cc = 4, alpha = 0.05) { #gets ~ 50% trimmed mean se for sample median and the corresponding robust 100 (1-alpha)% CI #defaults are alpha = .05, cc = 5 may be better than the default up <- 1 - alpha/2 n <- length(x) med <- median(x) ln <- floor(n/2) - ceiling(sqrt(n/cc)) un <- n - ln low <- ln + 1 d <- sort(x) if(ln > 0) { d[1:ln] <- d[(low)] d[(un + 1):n] <- d[un] } den <- ((un - ln)/n)^2 swv <- var(d)/den #got the scaled Winsorized variance rdf <- un - low rval <- qt(up, rdf) * sqrt(swv/n) rlo <- med - rval rhi <- med + rval list(int = c(rlo, rhi), med = med, swv = swv) } medci<- function(x, alpha = 0.05) { #gets Bloch and Gastwirth SE for sample median and the corresponding resistant 100 (1-alpha)% CI #defaults are alpha = .05 n <- length(x) up <- 1 - alpha/2 med <- median(x) ln <- floor(n/2) - ceiling(sqrt(n/4)) un <- n - ln d <- sort(x) rdf <- un - ln - 1 cut <- qt(up, rdf) sebg <- 0.5 * (d[un] - d[ln + 1]) rval <- cut * sebg rlo <- med - rval rhi <- med + rval list(int = c(rlo, rhi), med = med, sebg = sebg) } MLRplot<-function(x, Y) { # Forward response plot and residual plot. # Workstation need to activate a graphics # device with command "X11()" or "motif()." # R needs command "library(lqs)" if a robust estimator replaces lsfit. # Advance the view with the right mouse button. x <- as.matrix(x) out <- lsfit(x, Y) cook <- ls.diag(out)$cooks n <- dim(x)[1] p <- dim(x)[2] + 1 tem <- cook > min(0.5, (2 * p)/n) bhat <- out$coef FIT <- bhat[1] + x %*% bhat[-1] par(mfrow = c(2, 1)) plot(FIT, Y) abline(0, 1) points(FIT[tem], Y[tem], pch = 15) identify(FIT, Y) title("Forward Response Plot") RES <- Y - FIT plot(FIT, RES) points(FIT[tem], RES[tem], pch = 15) identify(FIT, RES) title("Residual Plot") } mlrplot2 <- function(x, Y) { # Forward response plot and residual plot for two mbareg estimators. # Workstation need to activate a graphics # device with command "X11()" or "motif()." # R needs command "library(lqs)" if a robust estimator replaces lsfit. # Advance the view with the right mouse button. 
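# Usage sketch (illustrative; simulated data, hypothetical call; mbareg and mbalata
# above must be sourced):
#   x <- matrix(rnorm(300), ncol = 3)
#   Y <- 1 + x %*% rep(1, 3) + rnorm(100)
#   mlrplot2(x, Y)   # 2 by 2 display comparing the MBA and MBALATA fits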
x <- as.matrix(x) out <- mbareg(x, Y) bhat <- out$coef FIT <- bhat[1] + x %*% bhat[-1] par(mfrow = c(2, 2)) plot(FIT, Y) abline(0, 1) identify(FIT, Y) title("MBA Forward Response Plot") RES <- Y - FIT plot(FIT, RES) identify(FIT, RES) title("MBA Residual Plot") # out <- mbalata(x, Y) bhat <- out$coef FIT <- bhat[1] + x %*% bhat[-1] plot(FIT, Y) abline(0, 1) identify(FIT, Y) title("MBALATA Forward Response Plot") RES <- Y - FIT plot(FIT, RES) identify(FIT, RES) title("MBALATA Residual Plot") } mplot<- function(x) { # Makes a DD plot only using the MDi, the RDi are not used. p <- dim(x)[2] center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) md <- sqrt(md2) rd <- md const <- sqrt(qchisq(0.5, p))/median(rd) rd <- const * rd plot(md, rd) abline(0, 1) identify(md, rd) } nav<- function(alpha = 0.01, k = 5) { #gets n(asy var) for the alpha trimmed mean #and T_(A,n)(k) if errors are N(0,1) z <- - qnorm(alpha) den <- 1 - (2 * z * dnorm(z))/(2 * pnorm(z) - 1 ) val <- den/(1 - 2 * alpha) ntmav <- val + (2 * alpha * z^2)/(1 - 2 * alpha )^2 zj <- k * qnorm(0.75) alphaj <- pnorm( - zj) alphaj <- ceiling(100 * alphaj)/100 zj <- - qnorm(alphaj) den <- 1 - (2 * zj * dnorm(zj))/(2 * pnorm(zj) - 1) val <- den/(1 - 2 * alphaj) natmav <- val + (2 * alphaj * zj^2)/(1 - 2 * alphaj)^2 return(ntmav, natmav) } nltv<- function(gam = 0.5) { # Gets asy var for lts(h) and lta(h) at standard normal # where h/n -> gam. k <- qnorm(0.5 + gam/2) den <- gam - 2 * k * dnorm(k) ltsv <- 1/den tem <- (1 - exp( - (k^2)/2))^2 ltav <- (2 * pi * gam)/(4 * tem) return(ltsv, ltav) } oddata<- function(n = 100, q = 5, theta = 1) { # Generates overdispersion (negative binomial) data for loglinear regression. # y <- 1:n pr <- 1/(1 + theta) beta <- 0 * 1:q beta[1:3] <- 1 alpha <- -2.5 x <- matrix(rnorm(n * q), nrow = n, ncol = q) x <- 0.5 * x + 1 SP <- alpha + x %*% beta y <- rnbinom(n, size = ceiling(exp(SP)), pr) list(x = x, y = y) } pifclean<- function(k, gam) { p <- floor(log(3/k)/log(1 - gam)) list(p = p) } piplot<-function(x, y, alpha = 0.05) { # For Unix, use X11() to turn on the graphics device before # using this function. # Makes an FY plot with prediction limits added. 
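# Usage sketch (illustrative; simulated data, hypothetical call):
#   x <- matrix(rnorm(200), ncol = 2)
#   y <- 1 + x %*% c(1, 1) + rnorm(100)
#   piplot(x, y, alpha = 0.05)   # triangles mark the lower and upper PI limits for each case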
x <- as.matrix(x) p <- dim(x)[2] + 1 n <- length(y) up <- 1:n low <- up out <- lsfit(x, y) tem <- ls.diag(out) lev <- tem$hat res <- out$residuals FIT <- y - res Y <- y corfac <- (1 + 15/n)*sqrt(n/(n - p)) val2 <- quantile(res, c(alpha/2, 1 - alpha/2)) #get lower and upper PI limits for each case for(i in 1:n) { val <- sqrt(1 + lev[i]) val3 <- as.single(corfac * val2[1] * val) val4 <- as.single(corfac * val2[2] * val) up[i] <- FIT[i] + val4 low[i] <- FIT[i] + val3 } zy <- c(min(low), Y, max(up)) zx <- c(min(FIT), FIT, max(FIT)) #change labels so plot labels are good ff <- FIT yy <- Y Y <- zy FIT <- zx plot(FIT, Y, type = "n") points(ff, yy) abline(0, 1) points(ff, up, pch = 17) points(ff, low, pch = 17) } pisim<-function(n = 100, q = 7, nruns = 100, alpha = 0.05, eps = 0.1, shift = 9, type = 1) { # compares new and classical PIs for multiple linear regression # if type = 1 for N(0,1) errors, 2 for t3 errors, 3 for exp(1) - 1 errors # 4 for uniform(-1,1) errors, 5 for (1-eps) N(0,1) + eps N(0,(1+shift)^2) errors # constant = 1 so there are p = q+1 coefficients b <- 0 * 1:q + 1 cpicov <- 0 npicov <- 0 acpicov <- 0 opicov <- 0 val3 <- 1:nruns val4 <- val3 val5 <- val3 pilen <- matrix(0, nrow = nruns, ncol = 4) coef <- matrix(0, nrow = nruns, ncol = q + 1) corfac <- (1 + 15/n) * sqrt(n/(n - q - 1)) corfac2 <- sqrt(n/(n - q - 1)) for(i in 1:nruns) { x <- matrix(rnorm(n * q), nrow = n, ncol = q) if(type == 1) { y <- 1 + x %*% b + rnorm(n) xf <- rnorm(q) yf <- 1 + xf %*% b + rnorm(1) } if(type == 2) { y <- 1 + x %*% b + rt(n, df = 3) xf <- rnorm(q) yf <- 1 + xf %*% b + rt(1, df = 3) } if(type == 3) { y <- 1 + x %*% b + rexp(n) - 1 xf <- rnorm(q) yf <- 1 + xf %*% b + rexp(1) - 1 } if(type == 4) { y <- 1 + x %*% b + runif(n, min = -1, max = 1) xf <- rnorm(q) yf <- 1 + xf %*% b + runif(1, min = -1, max = 1) } if(type == 5) { err <- rnorm(n, sd = 1 + rbinom(n, 1, eps) * shift) y <- 1 + x %*% b + err xf <- rnorm(q) yf <- 1 + xf %*% b + rnorm(1, sd = 1 + rbinom(1, 1, eps ) * shift) } out <- lsfit(x, y) fres <- out$resid coef[i, ] <- out$coef yfhat <- out$coef[1] + xf %*% out$coef[-1] w <- cbind(1, x) xtxinv <- solve(t(w) %*% w) xf <- c(1, xf) hf <- xf %*% xtxinv hf <- hf %*% xf val <- sqrt(1 + hf) #get classical PI mse <- sum(fres^2)/(n - q - 1) val2 <- qt(1 - alpha/2, n - q - 1) * sqrt(mse) * val up <- yfhat + val2 low <- yfhat - val2 pilen[i, 1] <- up - low if(low < yf && up > yf) cpicov <- cpicov + 1 #get semiparametric PI val2 <- quantile(fres, c(alpha/2, 1 - alpha/2)) val3[i] <- as.single(corfac * val2[1] * val) val4[i] <- as.single(corfac * val2[2] * val) up <- yfhat + val4[i] low <- yfhat + val3[i] pilen[i, 2] <- up - low if(low < yf && up > yf) npicov <- npicov + 1 # asymptotically conservative PI val6 <- corfac2 * max(abs(val2)) val5[i] <- val6 * val up <- yfhat + val5[i] low <- yfhat - val5[i] pilen[i, 3] <- up - low if(low < yf && up > yf) acpicov <- acpicov + 1 # asymptotically optimal PI sres <- sort(fres) cc <- ceiling(n * (1 - alpha)) rup <- sres[cc] rlow <- sres[1] olen <- rup - rlow if(cc < n) { for(j in (cc + 1):n) { zlen <- sres[j] - sres[j - cc + 1] if(zlen < olen) { olen <- zlen rup <- sres[j] rlow <- sres[j - cc + 1] } } } up <- yfhat + corfac * val * rup low <- yfhat + corfac * val * rlow pilen[i, 4] <- up - low if(low < yf && up > yf) opicov <- opicov + 1 } pimnlen <- apply(pilen, 2, mean) mnbhat <- apply(coef, 2, mean) lcut <- mean(val3) hcut <- mean(val4) accut <- mean(val5) cpicov <- cpicov/nruns npicov <- npicov/nruns acpicov <- acpicov/nruns opicov <- opicov/nruns 
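# The four columns of pilen hold the PI lengths for each run: classical,
# semiparametric, asymptotically conservative and asymptotically optimal;
# cpicov, npicov, acpicov and opicov are the corresponding coverage proportions.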
list(mnbhat = mnbhat, pimnlen = pimnlen, cpicov = cpicov, npicov = npicov,
 acpicov = acpicov, opicov = opicov, lcut = lcut, hcut = hcut, accut = accut)
}
ratmn<- function(x, k1 = 6, k2 = 6)
{
#robust 2 stage asymmetrically trimmed mean
	madd <- mad(x, constant = 1)
	med <- median(x)
	LM <- sum(x < (med - k1 * madd))
	nmUM <- sum(x > (med + k2 * madd))
	n <- length(x)
# ll (hh) is the percentage to be trimmed to the left (right)
	ll <- ceiling((100 * LM)/n)
	hh <- ceiling((100 * (nmUM))/n)
	tem <- sort(x)
	ln <- floor((ll * n)/100)
	un <- floor((n * (100 - hh))/100)
	low <- ln + 1
	val1 <- tem[low]
	val2 <- tem[un]
	rtmn <- mean(x[(x >= val1) & (x <= val2)])
	rtmn
}
rcisim<- function(n, runs = 500, type = 1, eps = 0.25, shift = 100, df = 1, kaa = 6, kss = 3.5)
{
# Used to simulate one CI at a time.
# type = 1: normal data, type = 2: contaminated normal data
# type = 3: t(df) data, type = 4: double exponential
# type = 5: exponential
	if(type == 1) x <- matrix(rnorm(n * runs), nrow = runs, ncol = n)
	if(type == 2) {
		x <- matrix(rnorm(n * runs), nrow = runs, ncol = n)
		x <- x + shift * matrix(rbinom(n * runs, 1, eps), nrow = runs, ncol = n)
	}
	if(type == 3) x <- matrix(rt(n * runs, df = df), nrow = runs, ncol = n)
	if(type == 4) {
		x <- matrix(rexp(n * runs), nrow = runs, ncol = n)
		w <- matrix(rbinom(n * runs, 1, 0.5), nrow = runs, ncol = n)
		w <- 2 * w - 1
		x <- w * x
	}
	if(type == 5) x <- matrix(rexp(n * runs), nrow = runs, ncol = n)
	cov <- 0
## change this value with each interval
	epar <- 1.0
	low <- 1:runs
	up <- low
	for(i in 1:runs) {
## change the following line with each interval
		out <- cci(x[i, ])
		low[i] <- out$int[1]
		up[i] <- out$int[2]
		if(type == 5) {
			if(low[i] < epar && up[i] > epar) cov <- cov + 1
		}
		else {
			if(low[i] < 0 && up[i] > 0) cov <- cov + 1
		}
	}
	cov <- cov/runs
	len <- sqrt(n) * mean(up - low)
	list(cov = cov, len = len)
}
rcovsim<- function(n, p = 2, steps = 5, gam = 0.4, runs = 20, outliers = T)
{
# This Splus function demonstrates that covmba estimates mu
# but is slightly biased for sigma. The function rmba is better for small n.
	A <- sqrt(diag(1:p))
	cloc <- 0 * (1:p)
	csig <- 0 * A
	mbaloc <- cloc
	mbasig <- csig
	rmbaloc <- cloc
	rmbasig <- csig
	fmcdloc <- cloc
	fmcdsig <- csig
	for(i in 1:runs) {
		x <- matrix(rnorm(n * p), ncol = p, nrow = n)
# code below would give mean = (10, ..., 10)^T to the outliers
#		if(outliers == T) {
#			val <- floor(gam * n)
#			tem <- 10 + 0 * 1:p
#			x <- x %*% A
#			x[1:val, ] <- x[1:val, ] + tem
#		}
#		else {
#			x <- x %*% A
#		}
## code below: outliers have mean (10, 10 sqrt(2), ..., 10 sqrt(p))^T
		if(outliers == T) {
			val <- floor(gam * n)
			tem <- 10 + 0 * 1:p
			x[1:val, ] <- x[1:val, ] + tem
		}
		x <- x %*% A
		out <- covmba(x, csteps = steps)
		mbaloc <- mbaloc + out$center
		mbasig <- mbasig + out$cov
		out <- rmba(x)
		rmbaloc <- rmbaloc + out$center
		rmbasig <- rmbasig + out$cov
		cloc <- cloc + apply(x, 2, mean)
		csig <- csig + var(x)
		out <- cov.mcd(x)
		fmcdloc <- fmcdloc + out$center
		fmcdsig <- fmcdsig + out$cov
	}
	mbaloc <- mbaloc/runs
	mbasig <- mbasig/runs
	rmbaloc <- rmbaloc/runs
	rmbasig <- rmbasig/runs
	cloc <- cloc/runs
	csig <- csig/runs
	fmcdloc <- fmcdloc/runs
	fmcdsig <- fmcdsig/runs
	list(mbaloc = mbaloc, rmbaloc = rmbaloc, cloc = cloc, fmcdloc = fmcdloc,
	 mbasig = mbasig, rmbasig = rmbasig, csig = csig, fmcdsig = fmcdsig)
}
rcplot <- function(x, Y)
{
# RC plot: plots Cook's distances against the OLS residuals.
# Workstation need to activate a graphics
# device with command "X11()" or "motif()."
# Advance the view with the right mouse button.
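# Usage sketch for rcovsim above (illustrative; assumes covmba, rmba and cov.mcd
# are available in the current session):
#   rcovsim(n = 100, p = 2, outliers = T)   # compare MBA, RMBA, classical and FMCD estimates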
x <- as.matrix(x) out <- lsfit(x, Y) #zz <- ls.diag(out) CD <- zz$cooks n <- dim(x)[1] p <- dim(x)[2] + 1 RES <- out$resid #tem <- zz$std.dev #tem <- p * tem * (n - p)^2 #sortr <- sort(RES) #quad <- (n * (sortr^2))/tem plot(RES, CD) #lines(sortr, quad) identify(RES,CD) title("RC Plot") } rmaha<- function(x) { # Produces robust Mahalanobis distances (scaled for normal data). p <- dim(x)[2] out <- cov.mcd(x) center <- out$center cov <- out$cov rd <- mahalanobis(x, center, cov) const <- sqrt(qchisq(0.5, p))/median(rd) return(const * sqrt(rd)) } robci <- function(x, alpha = 0.05, trmp = 0.25, ka = 6, ks = 3.5 ) { #Gets several robust 100 (1-alpha)% CI's for data x. #defaults are alpha = .05 n <- length(x) up <- 1 - alpha/2 med <- median(x) madd <- mad(x, constant = 1) d <- sort(x) dtem <- d ## get the CI for T_A, LM <- sum(x < (med - ka * madd)) nmUM <- sum(x > (med + ka * madd)) # ll (hh) is the percentage to be trimmed to the left (right) ll <- ceiling((100 * LM)/n) hh <- ceiling((100 * (nmUM))/n) ln <- floor((ll * n)/100) un <- floor((n * (100 - hh))/100) low <- ln + 1 val1 <- dtem[low] val2 <- dtem[un] tstmn <- mean(x[(x >= val1) & (x <= val2)]) #have obtained the two stage asymmetrically trimmed mean if(ln > 0) { d[1:ln] <- d[low] } if(un < n) { d[(un + 1):n] <- d[un] } den <- ((un - ln)/n)^2 swv <- var(d)/den #got the scaled Winsorized variance rdf <- un - low rval <- qt(up, rdf) * sqrt(swv/n) talo <- tstmn - rval tahi <- tstmn + rval ##got low and high endpoints of robust T_A,n CI ##get robust T_S,n CI d <- dtem lo <- sum(x < (med - ks * madd)) hi <- sum(x > (med + ks * madd)) low <- ceiling((100 * lo)/n) high <- ceiling((100 * hi)/n) tp <- min(max(low, high)/100, 0.5) tstmn <- mean(x, trim = tp) #have obtained the two stage symetrically trimmed mean ln <- floor(n * tp) un <- n - ln if(ln > 0) { d[1:ln] <- d[(ln + 1)] } if(un < n) { d[(un + 1):n] <- d[un] } den <- ((un - ln)/n)^2 swv <- var(d)/den #got the scaled Winsorized variance rdf <- un - ln - 1 rval <- qt(up, rdf) * sqrt(swv/n) tslo <- tstmn - rval tshi <- tstmn + rval ##got low and high endpoints of robust T_S,n CI ##get median CI that uses a scaled Winsorized variance d <- dtem lnbg <- floor(n/2) - ceiling(sqrt(n/4)) unbg <- n - lnbg lowbg <- lnbg + 1 if(lnbg > 0) { d[1:lnbg] <- d[(lowbg)] } if(unbg < n) { d[(unbg + 1):n] <- d[unbg] } den <- ((unbg - lnbg)/n)^2 swv <- var(d)/den #got the scaled Winsorized variance rdf <- unbg - lnbg - 1 cut <- qt(up, rdf) rval <- cut * sqrt(swv/n) rlo <- med - rval rhi <- med + rval ##got median CI that uses a scaled Winsorized variance ##get BG CI se2 <- 0.5 * (d[unbg] - d[lowbg]) rval <- cut * se2 rlo2 <- med - rval rhi2 <- med + rval #got low and high endpoints of BG CI ## get classical CI mn <- mean(x) v <- var(x) se <- sqrt(v/n) val <- qt(up, n - 1) * se lo <- mn - val hi <- mn + val ##got classical CI endpoints ## get trimmed mean CI d <- dtem ln <- floor(n * trmp) un <- n - ln trmn <- mean(x, trim = trmp) if(ln > 0) { d[1:ln] <- d[(ln + 1)] } if(un < n) { d[(un + 1):n] <- d[un] } den <- ((un - ln)/n)^2 swv <- var(d)/den #got the scaled Winsorized variance rdf <- un - ln - 1 rval <- qt(up, rdf) * sqrt(swv/n) trlo <- trmn - rval trhi <- trmn + rval ##got trimmed mean CI endpoints list(tint = c(lo, hi), taint = c(talo, tahi), tsint = c(tslo, tshi), bgint = c(rlo2, rhi2), mint = c(rlo, rhi), trint = c( trlo, trhi)) } rrplot<- function(x, y, nsamps = 7) { # In Unix, use X11() to turn on the graphics device before # using this function. For R, first type library(lqs). # Makes an RR plot. 
Needs the mbareg function. n <- length(y) rmat <- matrix(nrow = n, ncol = 5) lsres <- lsfit(x, y)$residuals print("got OLS") l1res <- l1fit(x, y)$residuals print("got L1") almsres <- lmsreg(x, y)$resid print("got ALMS") altsres <- ltsreg(x, y)$residuals print("got ALTS") out <- mba$coef mbacoef <- mbareg(x, y, nsamp = nsamps)$coef MBARES <- y - mbacoef[1] - x %*% mbacoef[-1] print("got MBA") rmat[, 1] <- lsres rmat[, 2] <- l1res rmat[, 3] <- almsres rmat[, 4] <- altsres rmat[, 5] <- MBARES pairs(rmat, labels = c("OLS residuals", "L1 residuals", "ALMS residuals", "ALTS residuals", "MBA residuals")) } rrplot2<- function(x, y, nsamps = 7) { # In Unix, use X11() to turn on the graphics device before # using this function. For R, first type library(lqs). # Makes an RR plot. Needs the mbareg function. n <- length(y) rmat <- matrix(nrow = n, ncol = 4) lsres <- lsfit(x, y)$residuals print("got OLS") almsres <- lmsreg(x, y)$resid print("got ALMS") altsres <- ltsreg(x, y)$residuals print("got ALTS") out <- mba$coef mbacoef <- mbareg(x, y, nsamp = nsamps)$coef MBARES <- y - mbacoef[1] - x %*% mbacoef[-1] print("got MBA") rmat[, 1] <- lsres rmat[, 2] <- almsres rmat[, 3] <- altsres rmat[, 4] <- MBARES pairs(rmat, labels = c("OLS residuals", "ALMS residuals", "ALTS residuals", "MBA residuals")) } rstmn<- function(x, k1 = 5, k2=5) { #robust symmetically trimmed 2 stage mean #truncates too many cases when the contamination is asymmetric madd <- mad(x, constant = 1) med <- median(x) LM <- sum(x < (med - k1 * madd)) nmUM <- sum(x > (med + k2 * madd)) n <- length(x) #ll (hh) is the percentage trimmed to the left (right) # tp is the trimming proportion ll <- ceiling((100 * LM)/n) hh <- ceiling((100 * nmUM)/n) tp <- min(max(ll, hh)/100, 0.5) mean(x, trim = tp) } sir<- function(x, y, h) { # Obtained from STATLIB. Contributed by Thomas Koetter. # Calculates the effective dimension-reduction (e.d.r.) # directions by Sliced Inverse Regression (K.C. Li 1991, JASA 86, 316-327) # # Input: x n x p matrix, explanatory variable # y n x 1 vector, dependent variable # h scalar: if h >= 2 number of slices # if h <= -2 number of elements within a slice # 0 < h < 1 width of a slice: h = slicewidth / # range # # Output: list(edr, evalues) # edr p x p matrix, estimates for the e.d.r. directions # evalues p x 1 vector, the eigenvalues to the directions # # written by Thomas Koetter (thomas@wiwi.hu-berlin.de) 1995 # last modification: 7/18/95 # based on the implementation in XploRe # a full description of the XploRe program can be found in (chapter 11) # 'XploRe: An interactive statistical computing environment', # W. Haerdle, S. Klinke, B.A. Turlach, Springer, 1995 # # This software can be freely used for non-commercial purposes and freely # distributed. #+-----------------------------------------------------------------------------+ #| Thomas Koetter | #| Institut fuer Statistik und Oekonometrie | #| Fakultaet Wirtschaftswissenschaften | #| Humboldt-Universitaet zu Berlin, 10178 Berlin, GERMANY | #+-----------------------------------------------------------------------------+ #| Tel. voice: +49 30 2468-321 | #| Tel. FAX: +49 30 2468-249 | #| E-mail: thomas@wiwi.hu-berlin.de | #+-----------------------------------------------------------------------------+ n <- nrow(x) ndim <- ncol(x) if(n != length(c(y))) { stop("length of y doesn't match to number of rows of x !!") } if( - h > n) { stop("Number of elements within slices can't exceed number of data !!" 
) } # stanardize the x variable to z (mean 0 and cov I) xb <- apply(x, 2, mean) si2 <- solve(chol(var(x))) xt <- (x - matrix(xb, nrow(x), ncol(x), byrow = T)) %*% si2 # sort the data regarding y. x values are now packed into slices ord1 <- order(y) data <- cbind(y[ord1], xt[ord1, ]) # determine slicing strategy if(h <= -2) { # abs(h) is number of elements per slice h <- abs(h) ns <- floor(n/h) condit <- 1:n choice <- (1:ns) * h # if there are observations left, add them to the first and last slice if(h * ns != n) { hk <- floor((n - h * ns)/2) choice <- choice + hk choice[ns] <- n # to aviod numerical problems } } else if(h >= 2) { # h is number of slices ns <- h slwidth <- (data[n, 1] - data[1, 1])/ns slend <- seq(data[1, 1] + slwidth, length = ns, by = slwidth) slend[ns] <- data[n, 1] condit <- c(data[, 1]) choice <- slend } else if((0 < h) && (h < 1)) { # h is widht of a slice divides by the range of y ns <- floor(1/h) slwidth <- (data[n, 1] - data[1, 1]) * h slend <- seq(data[1, 1] + slwidth, length = ns, by = slwidth) slend[ns] <- data[n, 1] # to aviod numerical problems condit <- c(data[, 1]) choice <- slend } else stop("values of third parameter not valid") v <- matrix(0, ndim, ndim) # estimate for Cov(E[z|y]) ind <- rep(T, n) # index for already sliced elements ndim <- ndim + 1 j <- 1 # loop counter while(j <= ns) { sborder <- (condit <= choice[j]) & ind # index of slice j if(any(sborder)) { # are there elements in slice j ? ind <- ind - sborder xslice <- data[sborder, 2:ndim] if(sum(sborder) == 1) { # xslice is a vector ! xmean <- xslice v <- v + outer(xmean, xmean, "*") } else { xmean <- apply(xslice, 2, mean) v <- v + outer(xmean, xmean, "*") * nrow(xslice ) } } j <- j + 1 } if(any(ind)) { print("Error: elements unused !!") print(ind) } v <- (v + t(v))/(2 * n) # to prevent numerical errors (v is symmetric) eig <- eigen(v) b <- si2 %*% eig$vectors # estimates for e.d.r. directions data <- sqrt(apply(b * b, 2, sum)) b <- t(b)/data return(list(edr = t(b), evalues = eig$values)) } sirviews<- function(x, Y, ii = 1) { # Uses the function "sir" from STATLIB. # Trimmed views for 90, 80, ... 0 percent # trimming. Allows visualization of m # and crude estimation of c beta in models # of the form y = m(x^T beta) + e. # beta is obtained from SIR. # Workstation need to activate a graphics # device with command "X11()" or "motif()." # R needs command "library(lqs)." # Advance the view with the right mouse button. # In R, highlight "stop." 
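# Usage sketch (illustrative; simulated single-index data, hypothetical call;
# needs the sir function above and, in R, library(lqs) for cov.mcd):
#   x <- matrix(rnorm(500), ncol = 5)
#   Y <- (x %*% c(1, 1, 0, 0, 0))^2 + rnorm(100)
#   sirviews(x, Y)   # advance each trimmed view with the mouse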
x <- as.matrix(x) q <- dim(x)[2] out <- cov.mcd(x) # or use out <- cov.mve(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") tem <- seq(0.1, 1, 0.1) h <- q + 7 for(i in ii:10) { val <- quantile(rd2, tem[i]) b <- sir(x[rd2 <= val, ], Y[rd2 <= val], h)$edr[, 1] ESP <- x %*% b plot(ESP, Y) title(labs[i]) identify(ESP, Y) print(b) } } stmci<- function(x, alpha = 0.05, ks = 3.5) { #gets se for sample median and the corresponding robust 100 (1-alpha)% CI #defaults are alpha = .05 n <- length(x) up <- 1 - alpha/2 med <- median(x) madd <- mad(x, constant = 1) lo <- sum(x < (med - ks * madd)) hi <- sum(x > (med + ks * madd)) low <- ceiling((100 * lo)/n) high <- ceiling((100 * hi)/n) tp <- min(max(low, high)/100, 0.5) tstmn <- mean(x, trim = tp) #have obtained the two stage symetrically trimmed mean ln <- floor(n * tp) un <- n - ln d <- sort(x) if(ln > 0) { d[1:ln] <- d[(ln + 1)] d[(un + 1):n] <- d[un] } den <- ((un - ln)/n)^2 swv <- var(d)/den #got the scaled Winsorized variance rdf <- un - ln - 1 rval <- qt(up, rdf) * sqrt(swv/n) tslo <- tstmn - rval tshi <- tstmn + rval list(int = c(tslo, tshi), tp = tp) } symviews<- function(x, Y) { # Makes trimmed views for 90, 80, ..., 0 # percent trimming and sometimes works even if m # is symmetric about E(x^t beta) where # y = m(x^T beta ) + e. # For work stations, activate a graphics # device with command "X11()" or "motif()." # For R, use "library(lqs)." # Use the rightmost mouse button to advance # the view. In R, highlight ``stop." x <- as.matrix(x) tem <- seq(0.1, 1, 0.1) bols <- lsfit(x, Y)$coef fit <- x %*% bols[-1] temx <- x[fit > median(fit), ] temy <- Y[fit > median(fit)] out <- cov.mcd(temx) # or use out <- cov.mve(temx) center <- out$center cov <- out$cov rd2 <- mahalanobis(temx, center, cov) for(i in 1:10) { val <- quantile(rd2, tem[i]) bhat <- lsfit(temx[rd2 <= val, ], temy[rd2 <= val])$coef ESP <- x %*% bhat[-1] plot(ESP, Y) identify(ESP, Y) print(bhat) } } tmci<- function(x, alpha = 0.05, tp = 0.25) { #gets se for the tp trimmed mean and the corresponding robust 100 (1-alpha)% CI #defaults are alpha = .05 n <- length(x) up <- 1 - alpha/2 tmn <- mean(x, trim = tp) ln <- floor(n * tp) un <- n - ln d <- sort(x) if(ln > 0) { d[1:ln] <- d[(ln + 1)] d[(un + 1):n] <- d[un] } den <- ((un - ln)/n)^2 swv <- var(d)/den #got the scaled Winsorized variance rdf <- un - ln - 1 rval <- qt(up, rdf) * sqrt(swv/n) tmlo <- tmn - rval tmhi <- tmn + rval list(int = c(tmlo, tmhi), tp = tp) } Tplt<- function(x, y) { # For Unix, use X11() to turn on the graphics device before using this function. # This function plots y^L vs OLS fit. If plot is linear for L, use y^L instead of y. # This is a graphical method for a response transform. olsfit <- y - lsfit(x, y)$resid lam <- c(-1, -2/3, -1/2, -1/3, -1/4, 0, 1/4, 1/ 3, 1/2, 2/3, 1) xl <- c("Y**(-1)", "Y**(-2/3)", "Y**(-0.5)", "Y**(-1/3)", "Y**(-1/4)", "LOG(Y)", "Y**(1/4)", "Y**(1/3)", "Y**(1/2)", "Y**(2/3)", "Y") for(i in 1:length(lam)) { if(lam[i] == 0) ytem <- log(y) else if(lam[i] == 1) ytem <- y else ytem <- (y^lam[i] - 1)/lam[i] plot(olsfit, ytem, xlab = "YHAT", ylab = xl[i]) abline(lsfit(olsfit, ytem)$coef) identify(olsfit, ytem) } } trviews<- function(x, Y, ii = 1) { # Trimmed views for 90, 80, ... 0 percent # trimming. Increase ii if 90% trimming is too harsh. # Allows visualization of m and crudely estimation of # c beta in models of the form y = m(x^T beta) + e. 
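# Usage sketch (illustrative; simulated data, hypothetical call):
#   x <- matrix(rnorm(400), ncol = 4)
#   Y <- (x %*% c(1, 1, 0, 0) + rnorm(100))^3
#   trviews(x, Y)   # OLS is refit to the cases kept at each trimming level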
# Workstation: activate a graphics device # with commands "X11()" or "motif()." # R needs command "library(lqs)." # Advance the view with the right mouse button and # in R, highight "stop." x <- as.matrix(x) out <- cov.mcd(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%","10%","0%") tem <- seq(0.1, 1, 0.1) for(i in ii:10) { val <- quantile(rd2, tem[i]) b <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef ESP <- x %*% b[-1] plot(ESP, Y) title(labs[i]) identify(ESP, Y) print(b) } } tvreg<- function(x, Y, ii = 1) { # Trimmed views (TV) regression for 90, 80, ..., 0 percent # trimming. Increase ii if 90% trimming is too harsh. # Workstation: activate a graphics device # with commands "X11()" or "motif()." # R needs command "library(lqs)." # Advance the view with the right mouse button and # in R, highight "stop." x <- as.matrix(x) out <- cov.mcd(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) labs <- c("90%", "80%", "70%", "60%", "50%", "40%", "30%", "20%", "10%", "0%") tem <- seq(0.1, 1, 0.1) for(i in ii:10) { val <- quantile(rd2, tem[i]) b <- lsfit(x[rd2 <= val, ], Y[rd2 <= val])$coef FIT <- x %*% b[-1] + b[1] plot(FIT, Y) abline(0, 1) title(labs[i]) identify(FIT, Y) print(b) } } tvreg2<- function(X, Y, M = 0) { # Trimmed views regression for M percent trimming. # Workstation: activate a graphics device # with commands "X11()" or "motif()." # R needs command "library(lqs)." X <- as.matrix(X) out <- cov.mcd(X) center <- out$center cov <- out$cov rd2 <- mahalanobis(X, center, cov) tem <- (100 - M)/100 val <- quantile(rd2, tem) b <- lsfit(X[rd2 <= val, ], Y[rd2 <= val])$coef FIT <- X %*% b[-1] + b[1] plot(FIT, Y) abline(0, 1) identify(FIT, Y) list(coef = b) } wddplot<- function(x) {# Shows the southwest corner of the DD plot. n <- dim(x)[1] wt <- 0 * (1:n) p <- dim(x)[2] center <- apply(x, 2, mean) cov <- var(x) md2 <- mahalanobis(x, center, cov) out <- cov.mcd(x) center <- out$center cov <- out$cov rd2 <- mahalanobis(x, center, cov) md <- sqrt(md2) rd <- sqrt(rd2) const <- sqrt(qchisq(0.5, p))/median(rd) rd <- const * rd wt[rd < sqrt(qchisq(0.975, p))] <- 1 MD <- md[wt > 0] RD <- rd[wt > 0] plot(MD, RD) } hc4test<-function(x,y,pval=c(1:ncol(x))){ # # Perform omnibus test using OLS and HC4 estimator # # recommended by Cribari-Neto (2004). 
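# The HC4 weights used below are (1 - h_i)^(-delta_i) with delta_i = min(4, h_i/mean(h)),
# applied to the squared OLS residuals.
# Usage sketch (illustrative; simulated data, hypothetical call; elimna must be sourced):
#   x <- matrix(rnorm(200), ncol = 2)
#   y <- 1 + x[, 1] + rnorm(100)
#   hc4test(x, y)$p.value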
# Seems to work well with p=1 but can be unsatisfactory wit p>4 predictors, # Unknown how large n must be when p>1 # x<-as.matrix(x) if(ncol(x)>1)print("WARNING: more than 1 predictor, olstest might be better") if(nrow(x) != length(y))stop("Length of y does not match number of x values") m<-cbind(x,y) m<-elimna(m) y<-m[,ncol(x)+1] n<-length(y) pvalp1<-pval+1 temp<-lsfit(x,y) # unrestricted #print(temp$coef) x<-cbind(rep(1,nrow(x)),m[,1:ncol(x)]) #hval<-hat(x) hval<-x%*%solve(t(x)%*%x)%*%t(x) hval<-diag(hval) hbar<-mean(hval) delt<-cbind(rep(4,n),hval/hbar) delt<-apply(delt,1,min) aval<-(1-hval)^(0-delt) x2<-x[,pvalp1] pval<-0-pvalp1 x1<-x[,pval] #uval<-temp$residuals # unrestricted residuals df<-length(pval) x1<-as.matrix(x1) imat<-diag(1,n) M1<-imat-x1%*%solve(t(x1)%*%x1)%*%t(x1) M<-imat-x%*%solve(t(x)%*%x)%*%t(x) uval<-as.vector(M%*%y) R2<-M1%*%x2 rtr<-solve(t(R2)%*%R2) temp2<-aval*uval^2 S<-diag(aval*uval^2) V<-n*rtr%*%t(R2)%*%S%*%R2%*%rtr nvec<-as.matrix(temp$coef[pvalp1]) test<-n*t(nvec)%*%solve(V)%*%nvec test<-test[1,1] p.value<-1-pchisq(test,df) list(test=test,p.value=p.value) } smean<-function(m,cop=6,MM=F,op=1,outfun=outogk,cov.fun=rmba,...){ # # m is an n by p matrix # # Compute a multivariate skipped measure of location # # op=1: # Eliminate outliers using a projection method # That is, first determine center of data using: # # cop=1 uses Donoho-Gasko median # cop=2 uses MCD center # cop=3 uses median of the marginal distributions. # cop=4 uses MVE center # cop=5 uses TBS # cop=6 uses rmba (Olive's median ball algorithm) # # For each point # consider the line between it and the center, # project all points onto this line, and # check for outliers using # # MM=F, a boxplot rule. # MM=T, rule based on MAD and median # # Repeat this for all points. A point is declared # an outlier if for any projection it is an outlier # using a modification of the usual boxplot rule. # # op=2 use mgv (function outmgv) method to eliminate outliers # with the initial center given by the function cov.fun # that defaults to MBA. # # op=3 use outlier method indicated by outfun # # Eliminate any outliers and compute means # using remaining data. # m<-elimna(m) if(op==1)temp<-outpro(m,plotit=F,cop=cop,MM=MM)$keep if(op==2)temp<-outmgv(m,plotit=F,cov.fun=cov.fun)$keep if(op==3)temp<-outfun(m,plotit=F,...)$keep val<-apply(m[temp,],2,mean) val } mgvreg<-function(x,y,regfun=tsreg,cov.fun=rmba,se=T){ # # Do regression on points not labled outliers # by the MGV method. # (This function replaces an older version of mgvreg as of 11/6/06) # # In contrast to the old version, # when calling outmgv, center of data is determined via # the measure of location corresponding to cov.fun, which defaults # to the median ball algorithm (MBA) # m<-cbind(x,y) m<-elimna(m) # eliminate any rows with missing data ivec<-outmgv(m,plotit=F,cov.fun=cov.fun)$keep np1<-ncol(x)+1 coef<-regfun(m[ivec,1:ncol(x)],m[ivec,np1])$coef vec<-rep(1,length(y)) residuals<-y-cbind(vec,x)%*%coef list(coef=coef,residuals=residuals) } skipcov<-function(m,cop=6,MM=F,op=1,mgv.op=0,outpro.cop=3){ # # m is an n by p matrix # # Compute skipped covariance matrix # # op=1: # Eliminate outliers using a projection method # That is, first determine center of data using: # # cop=1 Donoho-Gasko median, # cop=2 MCD, # cop=3 marginal medians. 
# cop=4 uses MVE center # cop=5 uses TBS # cop=6 uses rmba (Olive's median ball algorithm) # # For each point # consider the line between it and the center, # project all points onto this line, and # check for outliers using # # MM=F, a boxplot rule. # MM=T, rule based on MAD and median # # Repeat this for all points. A point is declared # an outlier if for any projection it is an outlier # # op=2 use mgv (function outmgv) method to eliminate outliers # # Eliminate any outliers and compute means # using remaining data. # mgv.op=0, mgv uses all pairwise distances to determine center of the data # mgv.op=1 uses MVE # mgv.op=2 uses MCD # temp<-NA m<-elimna(m) m<-as.matrix(m) if(op==2)temp<-outmgv(m,plotit=F,op=mgv.op)$keep if(op==1)temp<-outpro(m,plotit=F,MM=MM,cop=outpro.cop)$keep val<-var(m[temp,]) val } hc4wtest<-function(x,y,nboot=500,SEED=T,RAD=T){ # # Test the hypothesis that all OLS slopes are zero # using HC4 wild bootstrap using wald test. # # This function calls the functions # olshc4 and # lstest4 # if(SEED)set.seed(2) x<-as.matrix(x) # First, eliminate any rows of data with missing values. temp <- cbind(x, y) temp <- elimna(temp) pval<-ncol(temp)-1 x <- temp[,1:pval] y <- temp[, pval+1] x<-as.matrix(x) p<-ncol(x) pp<-p+1 temp<-lsfit(x,y) yhat<-mean(y) res<-y-yhat #s<-lsfitNci4(x, y)$cov[-1, -1] s<-olshc4(x, y)$cov[-1, -1] si<-solve(s) b<-temp$coef[2:pp] wtest<-t(b)%*%si%*%b print("Taking boostrap samples. Please wait.") if(RAD)data<-matrix(ifelse(rbinom(length(y)*nboot,1,0.5)==1,-1,1),nrow=nboot) if(!RAD){ data<-matrix(runif(length(y)*nboot),nrow=nboot) data<-(data-.5)*sqrt(12) # standardize the random numbers. } rvalb<-apply(data,1,lstest4,yhat,res,x) sum<-sum(rvalb>= wtest[1,1]) p.val<-sum/nboot list(p.value=p.val) } lscale<-function(x,m,q) { # # Compute the L-scale as used by Marrona # Technometrics, 2005, 47, 264-273 # # so it is assumed that values in x have been centered # (a measure of location has been subtracted from each value) # and the results squared. # # q is defined in Marrona. For principal components, want to reduce # to p dimensional data, q=ncol(x)-p # hval<-floor((length(x)+m-q+2)/2) flag<-(x<0) if(sum(flag)>0)stop("For lscale, all values must be nonnegative") x<-sort(x) val<-sum(x[1:hval]) val } ortho<-function(x){ # Orthnormalize x # y<-qr(x) y<-qr.Q(y) y } regpca<-function(x,cor=F,loadings=T,covlist=NULL, SCORES=F,pval=ncol(x)){ # # regular PCA, calls princomp # x<-elimna(x) # removes any rows having missing values temp<-princomp(x,cor=cor,scores=T,covlist=covlist) if(!SCORES)temp<-summary(temp,loadings=loadings) if(SCORES){ chkit<-order(temp[1]$sdev) temp<-temp$scores pv<-ncol(x)+1-pval temp<-temp[,chkit[pv:ncol(x)]] } temp } robpca<-function(x,pval=ncol(x) , kmax=10, alpha=0.75, h, mcd=1, plots=1, labsd=3, labod=3, classic=0,plotit=T,pr=T) { # # This is a slightly modified version of the code in robpca.SSC that # was downloade from M, Hubert's web page. # x<-elimna(x) k<-pval # k=0 generates an error when using the original code. if(pr)print(paste("Number of principal components specified is",pval)) # # ROBPCA is a 'ROBust method for Principal Components Analysis'. # It is resistant to outliers in the data. The robust loadings are computed using # projection-pursuit techniques and the MCD method. Therefore ROBPCA can be applied # to both low and high-dimensional data sets.In low dimensions, the MCD method is applied (see cov.mcd). # The ROBPCA method is described in # Hubert, M., Rousseeuw, P.J., Vanden Branden K. 
(2005), # "ROBPCA: a new approach to robust principal components analysis", # to appear in Technometrics. # For the up-to-date reference, please consult the website: # www.wis.kuleuven.ac.be/stat/robust.html # Required input arguments: # x : Data matrix (observations in the rows, variables in the # columns) # # Optional input arguments: # k : Number of principal components to compute. If k is missing, # or k = 0, a screeplot is drawn which allows you to select # the number of principal components. If k = 0 and plots = 0, # the algorithm itself will determine the number of components. # This is not recommended. # kmax : Maximal number of principal components to compute (default = 10). # If k is provided, kmax does not need to be specified, unless k is larger # than 10. # alpha : (1-alpha) measures the fraction of outliers the algorithm should # resist. Any value between 0.5 and 1 may be specified (default = 0.75). # h : (n-h+1) measures the number of outliers the algorithm should # resist. Any value between n/2 and n may be specified. (default = 0.75*n) # Alpha and h may not both be specified. # mcd : If equal to one: when the number of variables is sufficiently small, # the loadings are computed as the eigenvectors of the MCD covariance matrix, # hence the function 'cov.mcd' is automatically called. The number of # principal components is then taken as k = rank(x). (default) # If equal to zero, the robpca algorithm is always applied. # plots : If equal to one, a scree plot, and a robust score outlier map are # drawn (default). If the input argument 'classic' is equal to one, # the classical plots are drawn as well. # If 'plots' is equal to zero, all plots are suppressed. # labsd : The 'labsd' observations with largest score distance are # labeled on the outlier map. (default = 3) # labod : The 'labod' observations with largest orthogonal distance are # labeled on the outlier map. default = 3) # classic : If equal to one, the classical PCA analysis will be performed. (default = 0) # # I/O: result<-robpca(x,k=2,kmax=10,alpha=0.75,h=50,mcd=1,plots=1,labsd=3,labod=3,classic=0) # The user should only give the input arguments that have to change their default value. # The name of the input arguments needs to be followed by their value. # The order of the input arguments is of no importance. # # Examples: # result<-robpca(x,k=3,alpha=0.65,plots=0) # result<-robpca(x,alpha=0.80,kmax=15,labsd=5) # plotit=F, is the same as using plots=0 # # The output of ROBPCA is a structure containing # # result$P : Robust loadings (eigenvectors) # result$L : Robust eigenvalues # result$M : Robust center of the data # result$T : Robust scores # result$k : Number of (chosen) principal components # result$h : The quantile h used throughout the algorithm # result$sd : Robust score distances within the robust PCA subspace # result$od : Orthogonal distances to the robust PCA subspace # result$cutoff : Cutoff values for the robust score and orthogonal distances # result$flag : The observations whose score distance is larger than result.cutoff.sd # or whose orthogonal distance is larger than result$cutoff$od # can be considered as outliers and receive a flag equal to zero. # The regular observations receive a flag 1. # result$class : 'ROBPCA' # result$classic : If the input argument 'classic' is equal to one, this structure # contains results of the classical PCA analysis. 
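# Example of inspecting the output (illustrative; 'out' is hypothetical):
#   out <- robpca(x, pval = 2)
#   out$P      # robust loadings
#   out$L      # robust eigenvalues
#   out$flag   # 0 flags cases with a large score or orthogonal distance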
# Short description of the method: Let n denote the number of observations, and p the number of original variables, # then ROBPCA finds a robust center (p x 1) of the data M and a loading matrix P which is (p x k) dimensional. # Its columns are orthogonal and define a new coordinate system. The scores (n x k) are the coordinates of the centered # observations with respect to the loadings: T=(X-M)*P. Note that ROBPCA also yields a robust covariance matrix (often singular) # which can be computed as cov<-out$P*out$L*t(out$P). The scree plot shows the eigenvalues and is helpful to select the number # of principal components. The outlier map visualizes the observations by plotting their orthogonal distance to the robust PCA subspace # versus their robust distances within the PCA subspace. This allows to classify the data points into 4 types: regular observations, # good leverage points, bad leverage points and orthogonal outliers. # # robpca.ssc was written by Jan Wijfels # adapted by Karlien Vanden Branden. # Last Update: 14/01/2005 if(!plotit)plots<-0 library(MASS) if(missing(x)){ stop("Error in robpca: You have to provide at least some data") } data <- as.matrix(x) n <- nrow(data) p <- ncol(data) if(n < p) { X.svd <- kernelEVD(data) } else { X.svd <- classSVD(data) } if(X.svd$rank == 0) { stop("All data points collapse!") } kmax <- max(min(floor(kmax), floor(n/2), X.svd$rank),1) k <- floor(k) if(k < 0) { k <- 0 } else if(k > kmax) { warning("Attention robpca: The number of principal components k = ", k, " is larger then kmax = ", kmax, "; k is set to ", kmax,".") k <- kmax } if(!missing(h) & !missing(alpha)) { stop("Error in robpca: Both inputarguments alpha and h are provided. Only one is required.") } if(missing(h) & missing(alpha)) { h <- min(floor(2*floor((n+kmax+1)/2)-n+2*(n-floor((n+kmax+1)/2))*alpha),n) } if(!missing(h) & missing(alpha)) { alpha <- h/n if(k==0) { if(h < floor((n+kmax+1)/2)) { h <- floor((n+kmax+1)/2) alpha <- h/n warning("Attention robpca: h should be larger than (n+kmax+1)/2. It is set to its minimum value ", h, ".") } } else { if(h < floor((n+k+1)/2)) { h <- floor((n+k+1)/2) alpha <- h/n warning("Attention robpca: h should be larger than (n+k+1)/2. It is set to its minimum value ", h, ".") } } if(h > n) { alpha <- 0.75 if(k==0) { h <- floor(2*floor((n+kmax+1)/2)-n+2*(n-floor((n+kmax+1)/2))*alpha) } else { h <- floor(2*floor((n+k+1)/2)-n+2*(n-floor((n+k+1)/2))*alpha) } warning("Attention robpca: h should be smaller than n = ", n, ". It is set to its default value ", h, ".") } } if(missing(h) & !missing(alpha)) { if(alpha < 0.5) { alpha <- 0.5 warning("Attention robpca: Alpha should be larger then 0.5. It is set to 0.5.") } if(alpha >= 1) { alpha <- 0.75 warning("Attention robpca: Alpha should be smaller then 1. It is set to its default value 0.75.") } if(k==0) { h <- floor(2*floor((n+kmax+1)/2)-n+2*(n-floor((n+kmax+1)/2))*alpha) } else { h <- floor(2*floor((n+k+1)/2)-n+2*(n-floor((n+k+1)/2))*alpha) } } labsd <- floor(max(0,min(labsd,n))) labod <- floor(max(0,min(labod,n))) out <- list() Xa <- X.svd$scores center <- X.svd$centerofX rot <- X.svd$loadings p1 <- ncol(Xa) if( (p1 <= min(floor(n/5), kmax)) & (mcd == 1 ) ) { if(k != 0) { k <- min(k, p1) } else { k <- p1 # cat("Message from robpca: The number of principal # components is defined by the algorithm. 
It is set to ", k,".\n", sep="") } if(h < floor((nrow(Xa) + ncol(Xa) +1)/2)) { h <- floor((nrow(Xa) + ncol(Xa) +1)/2) cat("Message from robpca: The number of non-outlying observations h is set to ", h," in order to make the mcd algorithm function.\n", sep="") } # Xa.mcd <- cov.mcd(as.data.frame(Xa), quan=h, print=F) Xa.mcd <- cov.mcd(as.data.frame(Xa), quan=h) # R version #print(Xa.mcd$method) #if(length(grep("equation", Xa.mcd$method)) == 1) { # print(Xa.mcd$method) # stop("The ROBPCA algorithm can not deal with this # result from the FAST-MCD algorithm. The algorithm is aborted.") # } #print("OUT") Xa.mcd.svd <- svd(Xa.mcd$cov) scores <- (Xa - matrix(data=rep(Xa.mcd$center, times=nrow(Xa)), nrow=nrow(Xa), ncol=ncol(Xa), byrow=T)) %*% Xa.mcd.svd$u out$M <- center + as.vector(Xa.mcd$center %*% t(rot)) out$L <- Xa.mcd.svd$d[1:k] out$P <- X.svd$loadings %*% Xa.mcd.svd$u[,1:k] out$T <- as.matrix(scores[,1:k]) if(is.list(dimnames(data))) { dimnames(out$T)[[1]] <- dimnames(data)[[1]] } out$h <- h out$k <- k out$alpha <- alpha } else { directions <- choose(n,2) ndirect <- min(250, directions) all <- (ndirect == directions) seed <- 0 B <- extradir(Xa, ndirect, seed, all) Bnorm <- vector(mode="numeric", length=nrow(B)) Bnorm<-apply(B,1,vecnorm) Bnormr <- Bnorm[Bnorm > 1.E-12] B <- B[Bnorm > 1.E-12,] A <- diag(1/Bnormr) %*% B Y <- Xa %*% t(A) Z <- matrix(data=0, nrow=n, ncol=length(Bnormr)) for(i in 1:ncol(Z)) { univ <- unimcd(Y[,i],quan = h) if(univ$smcd < 1.E-12) { r2 <- qr(data[univ$weights==1,])$rank if(r2 == 1) { stop("Error in robpca: At least ", sum(univ$weights), " observations are identical.") } } else { Z[,i] <- abs(Y[,i] - univ$tmcd) / univ$smcd } } H0 <- order(apply(Z, 1, max)) Xh <- Xa[H0[1:h],] Xh.svd <- classSVD(Xh) kmax <- min(Xh.svd$rank, kmax) if( (k == 0) & (plots == 0) ) { test <- which((Xh.svd$eigenvalues/Xh.svd$eigenvalues[1]) <= 1.E-3) if(length(test) != 0) { k <- min(min(Xh.svd$rank, test[1]), kmax) } else { k <- min(Xh.svd$rank, kmax) } cumulative <- cumsum(Xh.svd$eigenvalues[1:k]) / sum(Xh.svd$eigenvalues) if(cumulative[k] > 0.8) { k <- which(cumulative >= 0.8)[1] } cat("Message from robpca: The number of principal components is set by the algorithm. 
It is set to ", k, ".\n", sep="") } else { if( (k==0) & (plots != 0) ) { loc <- 1:kmax plot(loc, Xh.svd$eigenvalues[1:kmax], type='b', axes= F, xlab="Component", ylab="Eigenvalue") axis(2) axis(1, at=loc) cumv <- cumsum(Xh.svd$eigenvalues)/sum(Xh.svd$eigenvalues) text(loc, Xh.svd$eigenvalues[1:kmax] + par("cxy")[2], as.character(signif(cumv[1:kmax], 2))) box <- dialogbox(title="ROBPCA", controls=list(),buttons = c("OK")) box <- dialogbox.add.control(box, where=1, statictext.control(paste("How many principal components would you like to retain?\nMaximum = ", kmax, sep=""), size=c(200,20))) box <- dialogbox.add.control(box, where=2, editfield.control(label="Your choice:", size=c(30,10))) input <- as.integer(dialogbox.display(box)$values$"Your choice:") k <- max(min(min(Xh.svd$rank, input), kmax), 1) } else { k <- min(min(Xh.svd$rank, k), kmax) } } if(k!=X.svd$rank){ XRc <- Xa-matrix(data=rep(Xh.svd$centerofX, times=nrow(Xa)), nrow=nrow(Xa), ncol=ncol(Xa), byrow=T) Xtilde <- XRc%*%Xh.svd$loadings[,1:k]%*%t(Xh.svd$loadings[,1:k]) Rdiff <- XRc-Xtilde odh <- apply(Rdiff,1,vecnorm) ms <- unimcd(odh^(2/3),h) cutoffodh <- sqrt(qnorm(0.975,ms$tmcd,ms$smcd)^3) indexset <- (odh<=cutoffodh) Xh.svd <- classSVD(Xa[indexset,]) kmax <- min(Xh.svd$rank, kmax) } center <- center + Xh.svd$centerofX %*% t(rot) rot <- rot %*% Xh.svd$loadings Xstar<- (Xa - matrix(data=rep(Xh.svd$centerofX, times=nrow(Xa)), nrow=nrow(Xa), ncol=ncol(Xa), byrow=T)) %*% Xh.svd$loadings Xstar <- as.matrix(Xstar[,1:k]) rot <- as.matrix(rot[,1:k]) mah <- mahalanobis(Xstar, center=rep(0, ncol(Xstar)), cov=diag(Xh.svd$eigenvalues[1:k], nrow=k)) oldobj <- prod(Xh.svd$eigenvalues[1:k]) niter <- 100 for(j in 1:niter) { mah.order <- order(mah) Xh <- as.matrix(Xstar[mah.order[1:h],]) Xh.svd <- classSVD(Xh) obj <- prod(Xh.svd$eigenvalues) Xstar <- (Xstar - matrix(data=rep(Xh.svd$centerofX, times=nrow(Xstar)), nrow=nrow(Xstar), ncol=ncol(Xstar), byrow=T)) %*% Xh.svd$loadings center <- center + Xh.svd$centerofX %*% t(rot) rot <- rot %*% Xh.svd$loadings mah <- mahalanobis(Xstar, center=rep(0, ncol(Xstar)), cov=diag(x=Xh.svd$eigenvalues, nrow=length(Xh.svd$eigenvalues))) if( (Xh.svd$rank == k) & ( abs(oldobj - obj) < 1.E-12) ) { break } else { oldobj <- obj if(Xh.svd$rank < k) { j <- 1 k <- Xh.svd$rank } } } Xstar.mcd <- cov.mcd(as.data.frame(Xstar), quan=h) # R version #if(length(grep("equation", Xstar.mcd$method)) == 1) { # print(Xstar.mcd$method) #stop("The ROBPCA algorithm can not deal with this result from the #FAST-MCD algorithm. 
The algorithm is aborted.") # } # if(Xstar.mcd$raw.objective < obj) { covf <- Xstar.mcd$cov centerf <- Xstar.mcd$center # } # else { # consistencyfactor <- median(mah)/qchisq(0.5,k) # mah <- mah/consistencyfactor # weights <- ifelse(mah <= qchisq(0.975, k), T, F) # noMCD <- weightmecov(Xstar, weights, n, k) # centerf <- noMCD$center # covf <- noMCD$cov # } covf.eigen <- eigen(covf) covf.eigen.values.sort <- greatsort(covf.eigen$values) P6 <- covf.eigen$vectors P6 <- covf.eigen$vectors[,covf.eigen.values.sort$index] out$T <- (Xstar - matrix(data=rep(centerf, times=n), nrow=n, ncol=ncol(Xstar), byrow=T)) %*% covf.eigen$vectors[,covf.eigen.values.sort$index] if(is.list(dimnames(data))) { dimnames(out$T)[[1]] <- dimnames(data)[[1]] } out$P <- rot %*% covf.eigen$vectors[,covf.eigen.values.sort$index] out$M <- as.vector(center + centerf %*% t(rot)) out$L <- as.vector(covf.eigen$values) out$k <- k out$h <- h out$alpha <- alpha } oldClass(out) <- "robpca" out <- CompRobustDist(data, X.svd$rank, out, classic) if(classic == 1) { out <- CompClassicDist(X.svd, out) } if(plots == 1) { plot(out, classic, labod=labod, labsd=labsd) } return(out) } "greatsort"<-function(vec){ x <- vec * (-1) index <- order(x) return(list(sortedvector=rev(sort(vec)), index=index)) } "classSVD"<-function(x){ if(!is.matrix(x)) { stop("The function classSVD requires input of type 'matrix'.") } n <- nrow(x) p <- ncol(x) if(n == 1) { stop("The sample size is 1. No singular value decomposition can be performed.") } if(p < 5) { tolerance <- 1E-12 } else { if(p <= 8) { tolerance <- 1E-14 } else { tolerance <- 1E-16 } } centerofX <- apply(x, 2, mean) Xcentered <- scale(x, center=T, scale=F) XcenteredSVD <- svd(Xcentered/sqrt(n-1)) rank <- sum(XcenteredSVD$d > tolerance) eigenvalues <- (XcenteredSVD$d[1:rank])^2 loadings <- XcenteredSVD$v[,1:rank] scores <- Xcentered %*% loadings return(list(loadings=as.matrix(loadings), scores=as.matrix(scores), eigenvalues=as.vector(eigenvalues), rank=rank, Xcentered=as.matrix(Xcentered), centerofX=as.vector(centerofX))) } "kernelEVD"<-function(x){ if(!is.matrix(x)) { stop("The function kernelEVD requires input of type 'matrix'.") } n <- nrow(x) p <- ncol(x) if(n > p) { return(classSVD(x)) } else { centerofX <- apply(x, 2, mean) Xcentered <- scale(x, center=T, scale=F) if(n == 1) { stop("The sample size is 1. 
No singular value decomposition can be performed.") } eigen <- eigen(Xcentered %*% t(Xcentered)/(n-1)) eigen.descending <- greatsort(eigen$values) loadings <- eigen$vectors[,eigen.descending$index] tolerance <- n * max(eigen$values) * .Machine$double.eps rank <- sum(eigen.descending$sortedvector > tolerance) eigenvalues <- eigen.descending$sortedvector[1:rank] loadings <- t((Xcentered/sqrt(n-1))) %*% loadings[,1:rank] %*% diag(1/sqrt(eigenvalues), nrow=length(eigenvalues), ncol=length(eigenvalues)) scores <- Xcentered %*% loadings return(list(loadings=as.matrix(loadings), scores=as.matrix(scores), eigenvalues=as.vector(eigenvalues), rank=rank, Xcentered=as.matrix(Xcentered), centerofX=as.vector(centerofX))) } } "extradir"<-function(data, ndirect, seed=0, all=T){ n <- nrow(data) p <- ncol(data) B2 <- matrix(data=0, nrow = ndirect, ncol = p) rowindex <- 1 i <- 1 if(all == T) { while( (i < n) & (rowindex <= ndirect) ) { j <- i + 1 while( (j <= n) & (rowindex <= ndirect) ) { B2[rowindex,] <- data[i,] - data[j,] j <- j + 1 rowindex <- rowindex + 1 } i <- i + 1 } } else { while(rowindex <= ndirect) { sseed<-randomset(n,2,seed) seed<-sseed$seed B2[rowindex,] <- data[sseed$ranset[1],] - data[sseed$ranset[2],] rowindex <- rowindex + 1 } } return(B2) } "randomset"<-function(tot,nel,seed){ out<-list() for(j in 1:nel){ randseed<-uniran(seed) seed<-randseed$seed num<-floor(randseed$random*tot)+1 if(j > 1){ while(any(out$ranset==num)){ randseed<-uniran(seed) seed<-randseed$seed num<-floor(randseed$random*tot)+1 } } out$ranset[j]<-num } out$seed<-seed return(out) } "uniran"<-function(seed = 0){ out <- list() seed<-floor(seed*5761)+999 quot<-floor(seed/65536) out$seed<-floor(seed)-floor(quot*65536) out$random<-out$seed/65536 return(out) } "unimcd"<-function(y,quan){ out<-list() ncas<-length(y) len<-ncas-quan+1 if(len==1){ out$tmcd<-mean(y) out$smcd<-sqrt(var(y)) } else { ay<-c() I<-order(y) y<-y[I] ay[1]<-sum(y[1:quan]) for(samp in 2:len){ ay[samp]<-ay[samp-1]-y[samp-1]+y[samp+quan-1] } ay2<-ay^2/quan sq<-c() sq[1]<-sum(y[1:quan]^2)-ay2[1] for(samp in 2:len){ sq[samp]<-sq[samp-1]-y[samp-1]^2+y[samp+quan-1]^2-ay2[samp]+ay2[samp-1] } sqmin<-min(sq) Isq<-order(sq) ndup<-sum(sq == sqmin) ii<-Isq[1:ndup] slutn<-c() slutn[1:ndup]<-ay[ii] initmean<-slutn[floor((ndup+1)/2)]/quan initcov<-sqmin/(quan-1) res<-(y-initmean)^2/initcov sortres<-sort(res) factor<-sortres[quan]/qchisq(quan/ncas,1) initcov<-factor*initcov res<-(y-initmean)^2/initcov quantile<-qchisq(0.975,1) out$weights<-(resop.pro){ temp<-marpca(x,p=it,N1=N1,N2=N2,tol=tol,N2p=N2p,Nran=Nran,Nkeep=Nkeep, SEED=SEED) rat<-temp$var.op/bot ratval[it,2]<-rat }}} if(!is.null(pval)){ if(pval>=m)stop("This method assumes pval1)temp$points(x[outid,],col="red") } if(reg.plane){ vals<-regfun(x[,1:2],x[,3])$coef temp$plane(vals,col="blue") } } rmba<-function(x, csteps = 5) { # gets the reweighted MBA estimator, assume p > 1 # Code supplied by David Olive # p <- dim(x)[2] n <- dim(x)[1] ##get the DGK estimator covs <- var(x) mns <- apply(x, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) mns <- apply(as.matrix(x[md2 <= medd2, ]), 2, mean) covs <- var(x[md2 <= medd2, ]) } covb <- covs mnb <- mns ##get the square root of det(covb) critb <- prod(diag(chol(covb))) ##get the resistant estimator covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ## get the start mns <- apply(as.matrix(x[md2 <= medd2, ]), 2, mean) covs <- var(x[md2 <= medd2, ]) ## concentrate for(i in 
1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) # mns <- apply(as.matrix(x[md2 <= medd2, ]), 2, mean) mns <- apply(as.matrix(x[md2 <= medd2, ]), 2, mean) covs <- var(x[md2 <= medd2, ]) } crit <- prod(diag(chol(covs))) if(crit < critb) { critb <- crit covb <- covs mnb <- mns } ##scale for better performance at MVN rd2 <- mahalanobis(x, mnb, covb) const <- median(rd2)/(qchisq(0.5, p)) covb <- const * covb ##reweight the above MBA estimator (mnb,covb) for efficiency rd2 <- mahalanobis(x, mnb, covb) up <- qchisq(0.975, p) rmnb <- apply(as.matrix(x[rd2 <= up, ]), 2, mean) rcovb <- var(x[rd2 <= up, ]) rd2 <- mahalanobis(x, rmnb, rcovb) const <- median(rd2)/(qchisq(0.5, p)) rcovb <- const * rcovb ## reweight again rd2 <- mahalanobis(x, rmnb, rcovb) up <- qchisq(0.975, p) rmnb <- apply(as.matrix(x[rd2 <= up, ]), 2, mean) rcovb <- var(x[rd2 <= up, ]) rd2 <- mahalanobis(x, rmnb, rcovb) const <- median(rd2)/(qchisq(0.5, p)) rcovb <- const * rcovb list(center = rmnb, cov = rcovb) } tbscov <- function(x,eps=1e-3,maxiter=20,r=.45,alpha=.05){ # Rocke's contrained s-estimator # returns covariance matrix only. For both locatiion and scatter, use tbs # # r=.45 is the breakdown point # alpha=.05 is the asymptotic rejection probability. # if(!is.matrix(x))stop("x should be a matrix with two or more columns") x<-elimna(x) library(MASS) temp<-cov.mve(x) t1<-temp$center s<-temp$cov n <- nrow(x) p <- ncol(x) if(p==1)stop("x should be a matrix with two or more columns") c1M<-cgen.bt(n,p,r,alpha,asymp=FALSE) c1<-c1M$c1 if(c1==0)c1<-.001 #Otherwise get division by zero M<-c1M$M b0 <- erho.bt(p,c1,M) crit <- 100 iter <- 1 w1d <- rep(1,n) w2d <- w1d while ((crit > eps)&(iter <= maxiter)) { t.old <- t1 s.old <- s wt.old <- w1d v.old <- w2d d2 <- mahalanobis(x,center=t1,cov=s) d <- sqrt(d2) k <- ksolve.bt(d,p,c1,M,b0) d <- d/k w1d <- wt.bt(d,c1,M) w2d <- v.bt(d,c1,M) t1 <- (w1d %*% x)/sum(w1d) s <- s*0 for (i in 1:n) { xc <- as.vector(x[i,]-t1) s <- s + as.numeric(w1d[i])*(xc %o% xc) } s <- p*s/sum(w2d) mnorm <- sqrt(as.vector(t.old) %*% as.vector(t.old)) snorm <- eigen(s.old)$values[1] crit1 <- max(abs(t1 - t.old)) # crit <- max(crit1,crit2) crit <- max(abs(w1d-wt.old))/max(w1d) iter <- iter+1 } # mnorm <- sqrt(as.vector(t1) %*% as.vector(t1)) # snorm <- eigen(s)$values[1] # return(list(t1=t1,s=s)) s } erho.bt <- function(p,c1,M) # expectation of rho(d) under chi-squared p return(chi.int(p,2,M)/2 +(M^2/2+c1*(5*c1+16*M)/30)*chi.int2(p,0,M+c1) +(M^2/2-M^2*(M^4-5*M^2*c1^2+15*c1^4)/(30*c1^4))*( chi.int(p,0,M+c1)-chi.int(p,0,M)) +(1/2+M^4/(2*c1^4)-M^2/c1^2)*(chi.int(p,2,M+c1)-chi.int(p,2,M)) +(4*M/(3*c1^2)-4*M^3/(3*c1^4))*(chi.int(p,3,M+c1)-chi.int(p,3,M)) +(3*M^2/(2*c1^4)-1/(2*c1^2))*(chi.int(p,4,M+c1)-chi.int(p,4,M)) -(4*M/(5*c1^4))*(chi.int(p,5,M+c1)-chi.int(p,5,M)) +(1/(6*c1^4))*(chi.int(p,6,M+c1)-chi.int(p,6,M))) chi.int <- function(p,a,c1) # partial expectation d in (0,c1) of d^a under chi-squared p return( exp(lgamma((p+a)/2)-lgamma(p/2))*2^{a/2}*pchisq(c1^2,p+a) ) chi.int2 <- function(p,a,c1) # partial expectation d in (c1,\infty) of d^a under chi-squared p return( exp(lgamma((p+a)/2)-lgamma(p/2))*2^{a/2}*(1-pchisq(c1^2,p+a))) cgen.bt <- function(n,p,r,alpha,asymp=FALSE){ # find constants c1 and M that gives a specified breakdown r # and rejection point alpha if (asymp == FALSE){if (r > (n-p)/(2*n) ) r <- (n-p)/(2*n)} # maximum achievable breakdown # # if rejection is not achievable, use c1=0 and best rejection # limvec <- rejpt.bt.lim(p,r) if (1-limvec[2] <= alpha) { c1 <- 0 M <- 
sqrt(qchisq(1-alpha,p)) } else { c1.plus.M <- sqrt(qchisq(1-alpha,p)) M <- sqrt(p) c1 <- c1.plus.M - M iter <- 1 crit <- 100 eps <- 1e-5 while ((crit > eps)&(iter<100)) { deps <- 1e-4 M.old <- M c1.old <- c1 er <- erho.bt(p,c1,M) fc <- er - r*(M^2/2+c1*(5*c1+16*M)/30) fcc1 <- (erho.bt(p,c1+deps,M)-er)/deps fcM <- (erho.bt(p,c1,M+deps)-er)/deps fcp <- fcM - fcc1 - r*(M-(5*c1+16*M)/30+c1*9/30) M <- M - fc/fcp if (M >= c1.plus.M ){M <- (M.old + c1.plus.M)/2} c1 <- c1.plus.M - M # if (M-c1 < 0) M <- c1.old+(M.old-c1.old)/2 crit <- abs(fc) iter <- iter+1 } } list(c1=c1,M=M,r1=r) } erho.bt.lim <- function(p,c1) # expectation of rho(d) under chi-squared p return(chi.int(p,2,c1)+c1^2*chi.int2(p,0,c1)) erho.bt.lim.p <- function(p,c1) # derivative of erho.bt.lim wrt c1 return(chi.int.p(p,2,c1)+c1^2*chi.int2.p(p,0,c1)+2*c1*chi.int2(p,0,c1)) rejpt.bt.lim <- function(p,r){ # find p-value of translated biweight limit c # that gives a specified breakdown c1 <- 2*p iter <- 1 crit <- 100 eps <- 1e-5 while ((crit > eps)&(iter<100)) { c1.old <- c1 fc <- erho.bt.lim(p,c1) - c1^2*r fcp <- erho.bt.lim.p(p,c1) - 2*c1*r c1 <- c1 - fc/fcp if (c1 < 0) c1 <- c1.old/2 crit <- abs(fc) iter <- iter+1 } return(c(c1,pchisq(c1^2,p),log10(1-pchisq(c1^2,p)))) } chi.int.p <- function(p,a,c1) return( exp(lgamma((p+a)/2)-lgamma(p/2))*2^{a/2}*dchisq(c1^2,p+a)*2*c1 ) chi.int2.p <- function(p,a,c1) return( -exp(lgamma((p+a)/2)-lgamma(p/2))*2^{a/2}*dchisq(c1^2,p+a)*2*c1 ) ksolve.bt <- function(d,p,c1,M,b0){ # find a constant k which satisfies the s-estimation constraint # for modified biweight k <- 1 iter <- 1 crit <- 100 eps <- 1e-5 while ((crit > eps)&(iter<100)) { k.old <- k fk <- mean(rho.bt(d/k,c1,M))-b0 fkp <- -mean(psi.bt(d/k,c1,M)*d/k^2) k <- k - fk/fkp if (k < k.old/2) k <- k.old/2 if (k > k.old*1.5) k <- k.old*1.5 crit <- abs(fk) iter <- iter+1 } return(k) } rho.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1*(x^2/2) +ivec2*(M^2/2+c1*(5*c1+16*M)/30) +(1-ivec1-ivec2)*(M^2/2-M^2*(M^4-5*M^2*c1^2+15*c1^4)/(30*c1^4) +(1/2+M^4/(2*c1^4)-M^2/c1^2)*x^2 +(4*M/(3*c1^2)-4*M^3/(3*c1^4))*x^3 +(3*M^2/(2*c1^4)-1/(2*c1^2))*x^4 -4*M*x^5/(5*c1^4)+x^6/(6*c1^4))) } psi.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1*x+(1-ivec1-ivec2)*x*(1-x1^2)^2) } psip.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1+(1-ivec1-ivec2)*((1-x1^2)^2+4*x*x1*(1-x1^2)/c1)) } wt.bt <- function(x,c1,M) { x1 <- (x-M)/c1 ivec1 <- (x1 < 0) ivec2 <- (x1 > 1) return(ivec1+(1-ivec1-ivec2)*(1-x1^2)^2) } v.bt <- function(x,c1,M) return(x*psi.bt(x,c1,M)) gvarg<-function(m,var.fun=cov.mba){ # # Compute the generalized variance of a matrix m # It is assumed that var.fun returns a covariance matrix only # # (Some functions return a a covariance matrix in list mode: $cov # These functions do not work here.) 
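# Illustrative use (hypothetical data, not part of the original code):
#   set.seed(2); z<-matrix(rnorm(300),ncol=3)
#   gvarg(z)                 # generalized variance based on cov.mba
#   gvarg(z,var.fun=tbscov)  # same, using Rocke's constrained S-estimator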
# # other possible choices for var.fun: # skipcov # tbscov # covout # covogk # mgvcov # mvecov # mcdcov # m<-elimna(m) m<-as.matrix(m) temp<-var.fun(m) gvar<-prod(eigen(temp)$values) gvar } marpca<-function(x,p=ncol(x)-1,N1=3,N2=2,tol=.001,N2p=10,Nran=50, Nkeep=10,SEED=T,LSCALE=T,SCORES=F){ # # Marrona (2005, Technometrics, 47, 264-273) robust PCA # # x is an n by m matrix, pNran)stop("Must have Nkeep<=Nran") if(SEED)set.seed(2) n<-nrow(x) m<-ncol(x) q<-m-p if(q<0)stop("p should have value between 0 and ncol(x)") if(q>0){ bkeep<-array(dim=c(q,m,Nran)) akeep<-matrix(nrow=Nran,ncol=q) sig.val<-NA for(it in 1:Nran){ temp<-marpca.sub(x,p,N1=N1,N2=N2,tol=tol,LSCALE=LSCALE) bkeep[,,it]<-temp$B akeep[it,]<-temp$a sig.val[it]<-temp$var.op } ord<-order(sig.val) bkeep2<-array(dim=c(q,m,Nkeep)) cmatkeep<-array(dim=c(m,m,Nkeep)) akeep2<-matrix(nrow=Nkeep,ncol=q) sig.val2<-NA for(it in 1:Nkeep){ temp<-marpca.sub(x,p,N1=0,N2=N2p,tol=tol,B=bkeep[,,ord[it]],a=akeep[ord[it],], LSCALE=LSCALE) bkeep2[,,it]<-temp$B akeep2[it,]<-temp$a sig.val2[it]<-temp$var.op cmatkeep[,,it]<-temp$wt.cov } ord<-order(sig.val2) B<-bkeep2[,,ord[1]] a<-akeep2[ord[1],] var.op<-sig.val2[ord[1]] Cmat<-cmatkeep[,,ord[1]] } wt.mu<-NULL if(q==0){ output<-marpca.sub(x,0,LSCALE=LSCALE) B<-output$B a<-output$a var.op<-output$var.op wt.mu<-output$mu Cmat<-output$wt.cov } scores<-NULL if(SCORES){ ev<-eigen(Cmat) ord.val<-order(ev$values) mn1<-m-p+1 wt.mu<-marpca.sub(x,p=p)$mu Bp<-ev$vectors[,ord.val[mn1:m]] #m by m xmmu<-x for(j in 1:m)xmmu[,j]<-x[,j]-wt.mu[j] scores<-matrix(ncol=p,nrow=n) for(i in 1:n)scores[i,]<-t(Bp)%*%as.matrix(xmmu[i,]) } list(B=B,a=a,var.op=var.op,wt.cov=Cmat,wt.mu=wt.mu,scores=scores) } marpca.sub<-function(x,p=ncol(x)-1,N1=3,N2=2,tol=.001,B=NULL,a=NULL, LSCALE=T){ # # Marrona (2005, Technometrics, 47, 264-273) robust PCA # # Note: setting # p=0 causes B to be the identity matrix, which is used in the case # p=ncol(x) to estimate proportion of unexplained variance. # wt.cov<-NULL if(!is.null(B)){ B<-as.matrix(B) if(ncol(B)==1)B<-t(B) } n<-nrow(x) m<-ncol(x) q<-m-p if(q<0)stop("p and q should have values between 1 and ncol(x)") hval<-floor((n + m - q + 2)/2) DEL<-Inf sig0<-Inf if(is.null(B)){ if(p>0 && ptol){ r<-NA for(i in 1:n)r[i]<-sum(Bx[i,]-a)^2 if(LSCALE)sig<-lscale(r,m,q) if(!LSCALE){ delta<-delta<-(n-m+q-1)/(2*n) sig<-mscale(r,delta) } DEL<-1-sig/sig0 sig0<-sig ord.r<-order(r) w<-rep(0,n) w[ord.r[1:hval]]<-1 xx<-x for(i in 1:n)xx[i,]<-x[i,]*w[i] mu<-apply(xx,2,FUN="sum")/sum(w) #m by 1 locations Cmat<-matrix(0,nrow=m,ncol=m) for(i in 1:n){ temp<-w[i]*as.matrix(x[i,]-mu)%*%t(as.matrix(x[i,]-mu)) Cmat<-Cmat+temp } wt.cov<-Cmat/sum(w) if(it>N1){ temp<-eigen(wt.cov) ord.eig<-order(temp$values) for(iq in 1:q)B[iq,]<-temp$vectors[,ord.eig[iq]] } a<-B%*%mu it<-it+1 } list(B=B,a=a,var.op=sig,mu=mu,wt.cov=wt.cov) } bwimcp<-function(J,K,x,tr=.2,JK=J*K,grp=c(1:JK),alpha=.05){ # # Multiple comparisons for interactions # in a split-plot design. # The analysis is done by taking difference scores # among all pairs of dependent groups and # determining which of # these differences differ across levels of Factor A # using trimmed means. # # For MOM or M-estimators, use spmcpi which uses a bootstrap method # # The s-plus variable x is assumed to contain the raw # data stored in list mode or in a matrix. # If in list mode, x[[1]] contains the data # for the first level of both factors: level 1,1. 
# x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second: level 1,2 # x[[K]] is the data for level 1,K # x[[K+1]] is the data for level 2,1, x[[2K]] is level 2,K, etc. # # If the data are in a matrix, column 1 is assumed to # correspond to x[[1]], column 2 to x[[2]], etc. # # When in list mode x is assumed to have length JK, the total number of # groups being tested, but a subset of the data can be analyzed # using grp # if(is.matrix(x)) { y <- list() for(j in 1:ncol(x)) y[[j]] <- x[, j] x <- y } JK<-J*K if(JK!=length(x))stop("Something is wrong. Expected ",JK," groups but x contains ", length(x), "groups instead.") MJ<-(J^2-J)/2 MK<-(K^2-K)/2 JMK<-J*MK MJMK<-MJ*MK Jm<-J-1 data<-list() for(j in 1:length(x)){ data[[j]]<-x[[grp[j]]] # Now have the groups in proper order. } x<-data output<-matrix(0,MJMK,7) dimnames(output)<-list(NULL,c("A","A","B","B","psihat","p.value","p.crit")) jp<-1-K kv<-0 kv2<-0 test<-NA for(j in 1:J){ jp<-jp+K xmat<-matrix(NA,ncol=K,nrow=length(x[[jp]])) for(k in 1:K){ kv<-kv+1 xmat[,k]<-x[[kv]] } xmat<-elimna(xmat) for(k in 1:K){ kv2<-kv2+1 x[[kv2]]<-xmat[,k] }} m<-matrix(c(1:JK),J,K,byrow=T) ic<-0 for(j in 1:J){ for(jj in 1:J){ if(j 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.005,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01){ dvec<-alpha/c(1:ncon) dvec[1]<-alpha/2 } temp2<-order(0-test) zvec<-dvec[1:ncon] sigvec<-(test[temp2]>=zvec) output[temp2,7]<-zvec output[,7]<-2*output[,7] output } qregsm<-function(x, y,est=hd,qval=.5,sm=T,plotit=T,pyhat=F,fr=0.8,nboot=40,xlab="X", ylab="Y") { # # Do a smooth of x versus the quantiles of y # # qval indicates quantiles of interest. # Example: qval=c(.2,.8) will create two smooths, one for the # .2 quantile and the other for the .8 quantile. # # est can be any quantile estimator having the argument qval, indicating # the quantile to be used. # # est = hd uses Harrel Davis estimator, # est = qest uses a single order statistic. # # sm=T, bagging will be used. # pyhat=T returns the estimates # x<-as.matrix(x) X<-cbind(x,y) X<-elimna(X) np<-ncol(X) p<-np-1 x<-X[,1:p] x<-as.matrix(x) y<-X[,np] vals<-matrix(NA,ncol=length(y),nrow=length(qval)) for(i in 1:length(qval)){ if(sm)vals[i,]<-rplotsm(x,y,est=est,q=qval[i],pyhat=T,plotit=F,fr=fr,nboot=nboot)$yhat if(!sm)vals[i,]<-rungen(x,y,est=est,q=qval[i],pyhat=T,plotit=F,fr=fr)$output } if(p==1){ if(plotit){ plot(x,y,xlab=xlab,ylab=ylab) for(i in 1:length(qval)){ sx <- sort(x) xorder <- order(x) sysm <- vals[i,] #lines(sx, sysm) lines(sx, sysm[xorder]) }}} output <- "Done" if(pyhat)output <- vals output } L1median <- function(X, tol = 1e-08, maxit = 200, m.init = apply(X, 2, median), trace = FALSE) { ## L1MEDIAN calculates the multivariate L1 median ## I/O: mX=L1median(X,tol); ## ## X : the data matrix ## tol: the convergence criterium: ## the iterative process stops when ||m_k - m_{k+1}|| < tol. ## maxit: maximum number of iterations ## init.m: starting value for m; typically coordinatewise median ## ## Ref: Hossjer and Croux (1995) ## "Generalizing Univariate Signed Rank Statistics for Testing ## and Estimating a Multivariate Location Parameter"; ## Non-parametric Statistics, 4, 293-308. ## ## Implemented by Kristel Joossens ## Many thanks to Martin Maechler for improving the program! 
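## Illustrative call (not part of the original code; X is hypothetical):
##   X <- matrix(rnorm(60), ncol = 3)
##   L1median(X)            # spatial (L1) median of the three columns
##   L1median(X, trace = 1) # also report iteration and step-halving counts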
## slightly faster version of 'sweep(x, 2, m)': centr <- function(X,m) X - rep(m, each = n) ## computes objective function in m based on X and a: mrobj <- function(X,m) sum(sqrt(rowSums(centr(X,m)^2))) d <- dim(X); n <- d[1]; p <- d[2] m <- m.init if(!is.numeric(m) || length(m) != p) stop("'m.init' must be numeric of length p =", p) k <- 1 if(trace) nstps <- 0 while (k <= maxit) { mold <- m obj.old <- if(k == 1) mrobj(X,mold) else obj X. <- centr(X, m) Xnorms <- sqrt(rowSums(X. ^ 2)) inorms <- order(Xnorms) dx <- Xnorms[inorms] # smallest first, i.e., 0's if there are X <- X [inorms,] X. <- X.[inorms,] ## using 1/x weighting {MM: should this be generalized?} w <- ## (0 norm -> 0 weight) : if (all(dn0 <- dx != 0)) 1/dx else c(rep.int(0, length(dx)- sum(dn0)), 1/dx[dn0]) delta <- colSums(X. * rep(w,p)) / sum(w) nd <- sqrt(sum(delta^2)) maxhalf <- if (nd < tol) 0 else ceiling(log2(nd/tol)) m <- mold + delta # computation of a new estimate ## If step 'delta' is too far, we try halving the stepsize nstep <- 0 while ((obj <- mrobj(X, m)) >= obj.old && nstep <= maxhalf) { nstep <- nstep+1 m <- mold + delta/(2^nstep) } if(trace) { if(trace >= 2) cat(sprintf("k=%3d obj=%19.12g m=(",k,obj), paste(formatC(m),collapse=","), ")", if(nstep) sprintf(" nstep=%2d halvings",nstep) else "", "\n", sep="") nstps[k] <- nstep } if (nstep > maxhalf) { ## step halving failed; keep old m <- mold ## warning("step halving failed in ", maxhalf, " steps") break } k <- k+1 } if (k > maxit) warning("iterations did not converge in ", maxit, " steps") if(trace == 1) cat("needed", k, "iterations with a total of", sum(nstps), "stepsize halvings\n") return(m) } outpca<-function(x,cor=F,loadings=T,covlist=NULL, SCORES=F,ALL=T,pval=NULL,cop=3,...){ # # Remove outliers with outpro # (using projection method) # apply standard principle compoenents to remaining data # # ALL=T, when computing scores, all of the data are used, not just # the data left after outliers are removed. # x<-elimna(x) # removes any rows having missing values m<-ncol(x) #flag<-outfun(x,...)$keep flag<-outpro(x,cop=cop)$keep remx<-x temp2<-princomp(remx) x<-x[flag,] loc<-apply(x,2,mean) temp<-princomp(x,cor=cor,scores=T,covlist=covlist) if(!SCORES)temp<-summary(temp,loadings=loadings) if(SCORES){ if(is.null(pval)) stop("When computing scores, specify pval, number of components") if (!ALL)temp<-temp$scores[,1:pval] if(ALL){ temp<-summary(temp,loadings=T) B<-temp[2]$loadings[1:m,1:m] # Use robust loadings #loc<-temp[3]$center # used skipped measure of location z<-remx for(i in 1:nrow(z))z[i,]<-z[i,]-loc temp<-t(B)%*%t(z) temp<-t(temp) temp<-temp[,1:pval] }} temp } llocv2<-function(x,est=median,...){ if(!is.list(x))val<-est(x,...) if(is.list(x)){ val<-NA for(i in 1:length(x))val[i]<-est(x[[i]],...) } if(is.matrix(x))val<-apply(x,2,est,...) list(center=val) } mcppb<-function(x,crit=NA,con=0,tr=.2,alpha=.05,nboot=2000,grp=NA,WIN=F, win=.1){ # # Compute a 1-alpha confidence interval for a set of d linear contrasts # involving trimmed means using the percentile bootstrap method. # Independent groups are assumed. # # The data are assumed to be stored in x in list mode. Thus, # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. Length(x)=the number of groups = J, say. # # Or the data can be stored in a matrix with J columns # # By default, all pairwise comparisons are performed, but contrasts # can be specified with the argument con. # The columns of con indicate the contrast coefficients. 
# Con should have J rows, J=number of groups. # For example, con[,1]=c(1,1,-1,-1,0,0) and con[,2]=c(,1,-1,0,0,1,-1) # will test two contrasts: (1) the sum of the first two trimmed means is # equal to the sum of the second two, and (2) the difference between # the first two is equal to the difference between the trimmed means of # groups 5 and 6. # # The default number of bootstrap samples is nboot=2000 # # con<-as.matrix(con) if(is.matrix(x)){ xx<-list() for(i in 1:ncol(x)){ xx[[i]]<-x[,i] } x<-xx } if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") if(!is.na(sum(grp))){ # Only analyze specified groups. xx<-list() for(i in 1:length(grp))xx[[i]]<-x[[grp[1]]] x<-xx } J<-length(x) tempn<-0 for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. tempn[j]<-length(temp) x[[j]]<-temp } Jm<-J-1 d<-ifelse(sum(con^2)==0,(J^2-J)/2,ncol(con)) if(is.na(crit) && tr != .2)stop("A critical value must be specified when the amount of trimming differs from .2") if(WIN){ if(tr < .2)warning("When Winsorizing, the amount of trimming should be at least .2") if(win > tr)stop("Amount of Winsorizing must <= amount of trimming") if(min(tempn) < 15){warning("Winsorizing with sample sizes less than 15 can") warning(" result in poor control over the probability of a Type I error") } for (j in 1:J){ x[[j]]<-winval(x[[j]],win) } } if(is.na(crit)){ if(d==1)crit<-alpha/2 if(d==2 && alpha==.05 && nboot==1000)crit<-.014 if(d==2 && alpha==.05 && nboot==2000)crit<-.014 if(d==3 && alpha==.05 && nboot==1000)crit<-.009 if(d==3 && alpha==.05 && nboot==2000)crit<-.0085 if(d==3 && alpha==.025 && nboot==1000)crit<-.004 if(d==3 && alpha==.025 && nboot==2000)crit<-.004 if(d==3 && alpha==.01 && nboot==1000)crit<-.001 if(d==3 && alpha==.01 && nboot==2000)crit<-.001 if(d==4 && alpha==.05 && nboot==2000)crit<-.007 if(d==5 && alpha==.05 && nboot==2000)crit<-.006 if(d==6 && alpha==.05 && nboot==1000)crit<-.004 if(d==6 && alpha==.05 && nboot==2000)crit<-.0045 if(d==6 && alpha==.025 && nboot==1000)crit<-.002 if(d==6 && alpha==.025 && nboot==2000)crit<-.0015 if(d==6 && alpha==.01 && nboot==2000)crit<-.0005 if(d==10 && alpha==.05 && nboot<=2000)crit<-.002 if(d==10 && alpha==.05 && nboot==3000)crit<-.0023 if(d==10 && alpha==.025 && nboot<=2000)crit<-.0005 if(d==10 && alpha==.025 && nboot==3000)crit<-.001 if(d==15 && alpha==.05 && nboot==2000)crit<-.0016 if(d==15 && alpha==.025 && nboot==2000)crit<-.0005 if(d==15 && alpha==.05 && nboot==5000)crit<-.0026 if(d==15 && alpha==.025 && nboot==5000)crit<-.0006 } if(is.na(crit) && alpha==.05)crit<-0.0268660714*(1/d)-0.0003321429 if(is.na(crit))crit<-alpha/(2*d) if(d> 10 && nboot <5000)warning("Suggest using nboot=5000 when the number of contrasts exceeds 10.") icl<-round(crit*nboot)+1 icu<-round((1-crit)*nboot) if(sum(con^2)==0){ con<-matrix(0,J,d) id<-0 for (j in 1:Jm){ jp<-j+1 for (k in jp:J){ id<-id+1 con[j,id]<-1 con[k,id]<-0-1 }}} psihat<-matrix(0,ncol(con),6) dimnames(psihat)<-list(NULL,c("con.num","psihat","se","ci.lower", "ci.upper","p.value")) if(nrow(con)!=length(x))stop("The number of groups does not match the number of contrast coefficients.") bvec<-matrix(NA,nrow=J,ncol=nboot) set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. 
Please wait.") for(j in 1:J){ print(paste("Working on group ",j)) data<-matrix(sample(x[[j]],size=length(x[[j]])*nboot,replace=T),nrow=nboot) bvec[j,]<-apply(data,1,mean,tr) # Bootstrapped trimmed means for jth group } test<-NA for (d in 1:ncol(con)){ top<-0 for (i in 1:J){ top<-top+con[i,d]*bvec[i,] } test[d]<-sum((top>0))/nboot test[d]<-min(test[d],1-test[d]) top<-sort(top) psihat[d,4]<-top[icl] psihat[d,5]<-top[icu] } for (d in 1:ncol(con)){ psihat[d,1]<-d testit<-lincon(x,con[,d],tr,pr=F) psihat[d,6]<-test[d] psihat[d,2]<-testit$psihat[1,2] psihat[d,3]<-testit$test[1,4] } print("Reminder: To control FWE, reject if the p-value is less than") print("the crit.p.value listed in the output.") list(psihat=psihat,crit.p.value=crit,con=con) } llocv2<-function(x,est=median,...){ if(!is.list(x))val<-est(x,...) if(is.list(x)){ val<-NA for(i in 1:length(x))val[i]<-est(x[[i]],...) } if(is.matrix(x))val<-apply(x,2,est,...) list(center=val) } NMpca<-function(x,B,...){ # # Robust PCA using random orthogonal matrices and # robust generalized variance method # This function is used by Ppca # n<-x[1] m<-x[2] p=x[3] x=matrix(x[4:length(x)],ncol=m) B=matrix(B,ncol=m) vals<-NA z<-matrix(nrow=n,ncol=p) B <- t(ortho(t(B))) # so rows are orthogonal for(i in 1:n)z[i,]<-B%*%as.matrix(x[i,]) vals<-0-gvarg(z) vals } ancbbpb<-function(x1,y1,x2,y2,fr1=1,fr2=1,nboot=200,pts=NA,plotit=T, SEED=T,alpha=.05,RNA=T){ # # Compare two independent groups using an ancova method. # A running-interval smooth is used to estimate the regression lines and is # based in part on bootstrap bagging. # # This function is limited to two groups and one covariate. # # No assumption is made about the parametric form of the regression # lines. # Confidence intervals are computed using a percentile bootstrap # method. Comparisons are made at five empirically chosen design points when # pts=NA. To compare groups at specified x values, use pts. # Example: pts=c(60,70,80) will compare groups at the three design points # 60, 70 and 80. # # Assume data are in x1 y1 x2 and y2 # # fr1 and fr2 are the spans used by the smooth. # # RNA=F, when computing bagged estimate, NA values are not removed # resulting in no estimate of Y at the specified design point, # RNA=T, missing values are removed and the remaining values are used. 
# if(SEED)set.seed(2) if(is.na(pts[1])){ isub<-c(1:5) # Initialize isub test<-c(1:5) xorder<-order(x1) y1<-y1[xorder] x1<-x1[xorder] xorder<-order(x2) y2<-y2[xorder] x2<-x2[xorder] n1<-1 n2<-1 vecn<-1 for(i in 1:length(x1))n1[i]<-length(y1[near(x1,x1[i],fr1)]) for(i in 1:length(x1))n2[i]<-length(y2[near(x2,x1[i],fr2)]) for(i in 1:length(x1))vecn[i]<-min(n1[i],n2[i]) sub<-c(1:length(x1)) isub[1]<-min(sub[vecn>=12]) isub[5]<-max(sub[vecn>=12]) isub[3]<-floor((isub[1]+isub[5])/2) isub[2]<-floor((isub[1]+isub[3])/2) isub[4]<-floor((isub[3]+isub[5])/2) mat<-matrix(NA,5,7) dimnames(mat)<-list(NULL,c("X","n1","n2","DIF","ci.low","ci.hi","p.value")) gv1<-vector("list") for (i in 1:5){ j<-i+5 temp1<-y1[near(x1,x1[isub[i]],fr1)] temp2<-y2[near(x2,x1[isub[i]],fr2)] temp1<-temp1[!is.na(temp1)] temp2<-temp2[!is.na(temp2)] mat[i,1]<-x1[isub[i]] mat[i,2]<-length(temp1) mat[i,3]<-length(temp2) mat[,4]<-runmbo(x1,y1,pts=x1[isub],pyhat=T,plotit=F,SEED=F,est=tmean)- runmbo(x2,y2,pts=x1[isub],pyhat=T,plotit=F,SEED=F,est=tmean) gv1[[i]]<-temp1 gv1[[j]]<-temp2 } I1<-diag(5) I2<-0-I1 con<-rbind(I1,I2) estmat1<-matrix(nrow=nboot,ncol=length(isub)) estmat2<-matrix(nrow=nboot,ncol=length(isub)) data1<-matrix(sample(length(y1),size=length(y1)*nboot,replace=T),nrow=nboot) data2<-matrix(sample(length(y2),size=length(y2)*nboot,replace=T),nrow=nboot) # for(ib in 1:nboot){ estmat1[ib,]=runmbo(x1[data1[ib,]],y1[data1[ib,]],pts=x1[isub], pyhat=T,plotit=F,SEED=F,est=tmean) estmat2[ib,]=runmbo(x2[data2[ib,]],y2[data2[ib,]],pts=x1[isub], pyhat=T,plotit=F,SEED=F,est=tmean) } dif<-(estmat1 maxhalf) { ## step halving failed; keep old m <- mold ## warning("step halving failed in ", maxhalf, " steps") break } k <- k+1 } if (k > maxit) warning("iterations did not converge in ", maxit, " steps") if(trace == 1) cat("needed", k, "iterations with a total of", sum(nstps), "stepsize halvings\n") # return(m) list(center=m) } Ppca<-function(x,p=ncol(x)-1,locfun=L1medcen,loc.val=NULL,SCORES=F, gvar.fun=cov.mba,...){ # # Robust PCA aimed at finding scores that maximize a # robust generalized variance given the goal of reducing data from # m dimensions to # p, which defaults to m-1 # # locfun, location used to center design space. # by default, use the spatial median # alternatives are mcd, tauloc, ... # # Output: the projection matrix. If # SCORES=T, the projected scores are returned. # x<-elimna(x) n<-nrow(x) m<-ncol(x) xdat=c(n,m,p,as.vector(x)) if(is.null(loc.val))info<-locfun(x,...)$center if(!is.null(loc.val))info<-loc.val for(i in 1:n)x[i,]<-x[i,]-info vals<-NA z<-matrix(nrow=n,ncol=p) np=p*m B=robpca(x,pval=p,plotit=F)$P B=t(B) Bs=nelderv2(xdat,np,NMpca,START=B) Bop=matrix(Bs,nrow=p,ncol=m) Bop=t(ortho(t(Bop))) z<-matrix(nrow=n,ncol=p) zval<-NULL for(i in 1:n)z[i,]<-Bop%*%as.matrix(x[i,]) if(SCORES)zval<-z val=gvarg(z) list(B=Bop,gen.var=val,scores=zval) } matl<-function(x){ # # take data in list mode and store it in a matrix # J=length(x) nval=NA for(j in 1:J)nval[j]=length(x[[j]]) #temp<-matrix(NA,ncol=length(x),nrow=length(x[[1]])) temp<-matrix(NA,ncol=J,nrow=max(nval)) for(j in 1:J)temp[1:nval[j],j]<-x[[j]] temp } Aband<-function(x,alpha=.05,plotit=T,sm=T,SEED=T,nboot=500,grp=c(1:4), xlab="X (First Factor)",ylab="Delta",crit=NA,print.all=F,plot.op=F){ # # Apply the shift function when analyzing main effect in a # 2 by 2 design. # # For variables x1, x2, x3 and x4, # In effect, this function applies a shift function to the distributions # d1=(x1+x2)/2 and d2=(x3+x4)/2 # That is, focus on first factor. 
# For second factor, use Bband. # # grp indicates the groups to be compared. By default grp=c(1,2,3,4) # meaning that the first level of factor A consists of groups 1 and 2 # and the 2nd level of factor A consists of groups 3 and 4. # (So level 1 of factor B consists of groups 1 and 3 # # print.all=F, # returns number sig, meaning number of confidence intervals that do not # contain zero, # the critical value used as well as the KS test statistics. # print.all=T reports all confidence intervals, the number of which can # be large. # if(!is.list(x) && !is.matrix(x))stop("store data in list mode or a matrix") if(SEED)set.seed(2) if(is.matrix(x))x<-listm(x) for(j in 1:length(x))x[[j]]=elimna(x[[j]])/2 if(length(grp)<4)stop("There must be at least 4 groups") if(length(x)!=4)stop("The argument grp must have 4 values") x<-x[grp] n<-c(length(x[[1]]),length(x[[2]]),length(x[[3]]),length(x[[4]])) # Approximate the critical value # vals<-NA y<-list() if(is.na(crit)){ print("Approximating critical value. Please wait.") for(i in 1:nboot){ for(j in 1:4) y[[j]]<-rnorm(n[j]) temp<-ks.test(outer(y[[1]],y[[2]],FUN="+"),outer(y[[3]],y[[4]],FUN="+")) vals[i]<-temp[1]$statistic } vals<-sort(vals) ic<-(1-alpha)*nboot crit<-vals[ic] } if(plot.op){ plotit<-F g2plot(v1,v2) } output<-sband(outer(x[[1]],x[[2]],FUN="+"),outer(x[[3]],x[[4]],FUN="+"), plotit=plotit,crit=crit,flag=F,sm=sm,xlab=xlab,ylab=ylab) if(!print.all){ numsig<-output$numsig ks.test.stat<-ks.test(outer(x[[1]],x[[2]],FUN="+"), outer(x[[3]],x[[4]],FUN="+"))$statistic output<-matrix(c(numsig,crit,ks.test.stat),ncol=1) dimnames(output)<-list(c("number sig","critical value","KS test statistics"), NULL) } output } Bband<-function(x,alpha=.05,plotit=T,sm=T,SEED=T,nboot=500,grp=c(1:4), xlab="X (First Level)",ylab="Delta",crit=NA,print.all=F,plot.op=F){ # # Apply the shift function when analyzing main effect in a # 2 by 2 design. # # For variables x1, x2, x3 and x4, # In effect, this function applies a shift function to the distributions # d1=(x1+x3)/2 and d2=(x2+x4)/2. # That is, focus on main effects of Factor B. # # grp indicates the groups to be compared. By default grp=c(1,2,3,4) # meaning that the first level of factor A consists of groups 1 and 2 # and the 2nd level of factor A consists of groups 3 and 4. # (So level 1 of factor B consists of groups 1 and 3 # # print.all=F, # returns number sig, meaning number of confidence intervals that do not # contain zero, # the critical value used as well as the KS test statistics. # print.all=T reports all confidence intervals, the number of which can # be large. # if(!is.list(x) && !is.matrix(x))stop("store data in list mode or a matrix") if(SEED)set.seed(2) if(is.matrix(x))x<-listm(x) for(j in 1:length(x))x[[j]]=elimna(x[[j]])/2 if(length(x)<4)stop("There must be at least 4 groups") if(length(grp)!=4)stop("The argument grp must have 4 values") x<-x[grp] grp=c(1,3,2,4) x<-x[grp] # Arrange groups for main effects on factor B n<-c(length(x[[1]]),length(x[[2]]),length(x[[3]]),length(x[[4]])) # Approximate the critical value # vals<-NA y<-list() if(is.na(crit)){ print("Approximating critical value. 
Please wait.") for(i in 1:nboot){ for(j in 1:4) y[[j]]<-rnorm(n[j]) temp<-ks.test(outer(y[[1]],y[[2]],FUN="+"),outer(y[[3]],y[[4]],FUN="+")) vals[i]<-temp[1]$statistic } vals<-sort(vals) ic<-(1-alpha)*nboot crit<-vals[ic] } if(plot.op){ plotit<-F g2plot(v1,v2) } output<-sband(outer(x[[1]],x[[2]],FUN="+"),outer(x[[3]],x[[4]],FUN="+"), plotit=plotit,crit=crit,flag=F,sm=sm,xlab=xlab,ylab=ylab) if(!print.all){ numsig<-output$numsig ks.test.stat<-ks.test(outer(x[[1]],x[[2]],FUN="+"), outer(x[[3]],x[[4]],FUN="+"))$statistic output<-matrix(c(numsig,crit,ks.test.stat),ncol=1) dimnames(output)<-list(c("number sig","critical value","KS test statistics"), NULL) } output } iband<-function(x,alpha=.05,plotit=T,sm=T,SEED=T,nboot=500,grp=c(1:4), xlab="First Difference",ylab="Delta",crit=NA,print.all=F,plot.op=F){ # # Apply the shift function when analyzing interactions in a # 2 by 2 design. # # For variables x1, x2, x3 and x4, # In effect, this function applies a shift function to the distributions # d1=x1-x2 and d2=x3-x4 # # grp indicates the groups to be compared. By default grp=c(1,2,3,4) # meaning that the first four groups are used with the difference between # the first two compared to the difference between the second two. # (Rows are being compared in a 2 by 2 design # To compare difference between groups 1 and 3 versus 2 and 4 (columns in # a 2 by 2 design), set grp=c(1,3,2,4). # # print.all=F, # returns number sig, meaning number of confidence intervals that do not # contain zero, # the critical value used as well as the KS test statistics. # print.all=T reports all confidence intervals, the number of which can # be large. # if(!is.list(x) && !is.matrix(x))stop("store data in list mode or a matrix") if(SEED)set.seed(2) if(is.matrix(x))x<-listm(x) if(length(x)<4)stop("There must be at least 4 groups") for(j in 1:length(x))x[[j]]=elimna(x[[j]]) if(length(grp)!=4)stop("The argument grp must have 4 values") x<-x[grp] n<-c(length(x[[1]]),length(x[[2]]),length(x[[3]]),length(x[[4]])) # Approximate the critical value # vals<-NA y<-list() if(is.na(crit)){ print("Approximating critical value. Please wait.") for(i in 1:nboot){ for(j in 1:4) y[[j]]<-rnorm(n[j]) temp<-ks.test(outer(y[[1]],y[[2]],FUN="-"),outer(y[[3]],y[[4]],FUN="-")) vals[i]<-temp[1]$statistic } vals<-sort(vals) ic<-(1-alpha)*nboot crit<-vals[ic] } if(plot.op){ plotit<-F g2plot(v1,v2) } output<-sband(outer(x[[1]],x[[2]],FUN="-"),outer(x[[3]],x[[4]],FUN="-"), plotit=plotit,crit=crit,flag=F,sm=sm,xlab=xlab,ylab=ylab) if(!print.all){ numsig<-output$numsig ks.test.stat<-ks.test(outer(x[[1]],x[[2]],FUN="-"), outer(x[[3]],x[[4]],FUN="-"))$statistic output<-matrix(c(numsig,crit,ks.test.stat),ncol=1) dimnames(output)<-list(c("number sig","critical value","KS test statistics"), NULL) } output } disband<-function(x,sm=T,op=1,grp=c(1:4),xlab="First Group", ylab="Delta"){ # # When plotting, the median of x is marked with a + and the two # quaratiles are marked with o. # # sm=T, shift function is smoothed using: # op!=1, running interval smoother, # otherwise use lowess. 
# if(is.matrix(x))x=listm(x) if(length(grp)!=4)stop("The argument grp must have 4 values") x=x[grp] for(j in 1:4)x[[j]]=elimna(x[[j]]) pc<-NA crit= 1.36 * sqrt((length(x[[1]]) + length(x[[2]]))/(length(x[[1]]) * length(x[[2]]))) remx=x for(iloop in 1:2){ if(iloop==1){ x=remx[[1]] y=remx[[2]] } if(iloop==2){ x=remx[[3]] y=remx[[4]] } xsort<-sort(x) ysort<-c(NA,sort(y)) l<-0 u<-0 ysort[length(y)+1+1]<-NA for(ivec in 1:length(x)) { isub<-max(0,ceiling(length(y)*(ivec/length(x)-crit))) l[ivec]<-ysort[isub+1]-xsort[ivec] isub<-min(length(y)+1,floor(length(y)*(ivec/length(x)+crit))+1) u[ivec]<-ysort[isub+1]-xsort[ivec] } num<-length(l[l>0 & !is.na(l)])+length(u[u<0 & !is.na(u)]) qhat<-c(1:length(x))/length(x) m<-matrix(c(qhat,l,u),length(x),3) dimnames(m)<-list(NULL,c("qhat","lower","upper")) xsort<-sort(x) ysort<-sort(y) del<-0 for (i in 1:length(x)){ ival<-round(length(y)*i/length(x)) if(ival<=0)ival<-1 if(ival>length(y))ival<-length(y) del[i]<-ysort[ival]-xsort[i] } if(iloop==1){ allx<-c(xsort,xsort,xsort) ally<-c(del,m[,2],m[,3]) } if(iloop==2){ allx<-c(allx,xsort,xsort,xsort) ally<-c(ally,del,m[,2],m[,3]) plot(allx,ally,type="n",ylab=ylab,xlab=xlab) } ik<-rep(F,length(xsort)) if(sm){ if(op==1){ ik<-duplicated(xsort) del<-lowess(xsort,del)$y } if(op!=1)del<-runmean(xsort,del,pyhat=T) } if(iloop==1){ xsort1=xsort[!ik] del1=del[!ik] } if(iloop==2){ lines(xsort1,del1,lty=iloop) lines(xsort[!ik],del[!ik],lty=iloop) }} done="Done" done } scor<-function(x,y=NA,corfun=pcor,gval=NA,plotit=T,op=T,cop=2,xlab="VAR 1", ylab="VAR 2"){ # # Compute a skipped correlation coefficient. # # Eliminate outliers using a projection method # That is, compute Donoho-Gasko median, for each point # consider the line between it and the median, # project all points onto this line, and # check for outliers using a boxplot rule. # Repeat this for all points. A point is declared # an outlier if for any projection it is an outlier # using a modification of the usual boxplot rule. # # For information about the argument cop, see the function # outpro. # # Eliminate any outliers and compute correlation using # remaining data. # # corfun=pcor means Pearson's correlation is used. # corfun=spear means Spearman's correlation is used. if(is.na(y[1]))m<-x if(!is.na(y[1]))m<-cbind(x,y) m<-elimna(m) temp<-outpro(m,gval=gval,plotit=plotit,op=op,cop=cop, xlab=xlab,ylab=ylab)$keep tcor<-corfun(m[temp,])$cor if(ncol(m)==2)tcor<-tcor[1,2] test<-abs(tcor*sqrt((nrow(m)-2)/(1-tcor**2))) if(ncol(m)!=2)diag(test)<-NA crit<-6.947/nrow(m)+2.3197 print(list(cor.values=tcor,test.stat=test,crit.05=crit)) } cov.mba<-function(x,COR=F){ val<-covmba2(x)$cov if(COR){ val=val/outer(sqrt(diag(val)),sqrt(diag(val))) } val } qregci<-function(x,y,nboot=100,alpha=.05, qval=.5,SEED=T,pr=T,xout=F,outfun=out,...){ # # Test the hypothesis that the quantile regression slopes are zero. # Can use the .5 quantile regression line only. # Suggest only using quantiles between # .2 and .8. If using both .2 and .8 quantiles, or # the .2, .5 and .8 quantile regression lines. # FWE is controlled for alpha=.1, .05, .025 and .01. # xx<-elimna(cbind(x,y)) np<-ncol(xx) p<-np-1 y<-xx[,np] x<-xx[,1:p] x<-as.matrix(x) if(xout){ x<-as.matrix(x) flag<-outfun(x,...)$keep x<-x[flag,] y<-y[flag] } x<-as.matrix(x) n<-length(y) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. if(pr)print("Taking bootstrap samples. Please wait.") data<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) # determine critical value. 
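# The critical values below are small-sample adjustments of standard normal
# quantiles (for example, 1.96-1.37/sqrt(n) when alpha=.05); crit.fwe is the
# larger value used to control FWE when more than one slope is tested
# (multiple quantiles or multiple predictors).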
crit<-NA if(alpha==.1)crit<-1.645-1.19/sqrt(n) if(alpha==.05)crit<-1.96-1.37/sqrt(n) if(alpha==.025)crit<-2.24-1.18/sqrt(n) if(alpha==.01)crit<-2.58-1.69/sqrt(n) crit.fwe<-crit if(length(qval)==2 || p==2){ if(alpha==.1)crit.fwe<-1.98-1.13/sqrt(n) if(alpha==.05)crit.fwe<-2.37-1.56/sqrt(n) if(alpha==.025)crit.fwe<-2.60-1.04/sqrt(n) if(alpha==.01)crit.fwe<-3.02-1.35/sqrt(n) } if(length(qval)==3 || p==3){ if(alpha==.1)crit.fwe<-2.145-1.31/sqrt(n) if(alpha==.05)crit.fwe<-2.49-1.49/sqrt(n) if(alpha==.025)crit.fwe<-2.86-1.52/sqrt(n) if(alpha==.01)crit.fwe<-3.42-1.85/sqrt(n) } if(is.na(crit.fwe)){ print("Could not determine a critical value") print("Only alpha=.1, .05, .025 and .01 are allowed") } if(p==1){ bvec<-apply(data,1,qindbt.sub,x,y,qval=qval) estsub<-NA for(i in 1:length(qval)){ estsub[i]<-qreg(x,y,qval[i])$coef[2] } if(is.matrix(bvec))se.val<-sqrt(apply(bvec,1,FUN=var)) if(!is.matrix(bvec))se.val<-sqrt(var(bvec)) test<-abs(estsub)/se.val ci.mat<-matrix(nrow=length(qval),ncol=3) dimnames(ci.mat)<-list(NULL,c("Quantile","ci.lower","ci.upper")) ci.mat[,1]<-qval ci.mat[,2]<-estsub-crit*se.val ci.mat[,3]<-estsub+crit*se.val } if(p>1){ if(length(qval)>1){ print("With p>1 predictors,only the first qval value is used") } bvec<-apply(data,1,regboot,x,y,regfun=qreg,qval=qval[1]) se.val<-sqrt(apply(bvec,1,FUN=var)) estsub<-qreg(x,y,qval=qval[1])$coef test<-abs(estsub)/se.val ci.mat<-matrix(nrow=np,ncol=3) dimnames(ci.mat)<-list(NULL,c("Predictor","ci.lower","ci.upper")) ci.mat[,1]<-c(0:p) ci.mat[,2]<-estsub-crit*se.val ci.mat[,3]<-estsub+crit*se.val } list(test=test,se.val=se.val,crit.val=crit,crit.fwe=crit.fwe, slope.est=estsub,ci=ci.mat) } rqtest<-function(x,y,qval=.5,nboot=200,alpha=.05,SEED=T,xout=F,outfun=out,...){ # # Omnibus test when using a quantile regression estimator # x<-as.matrix(x) if(xout){ x<-as.matrix(x) flag<-outfun(x,...)$keep x<-x[flag,] y<-y[flag] } x<-as.matrix(x) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. print("Taking bootstrap samples. Please wait.") data<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) bvec<-apply(data,1,rqtest.sub,x,y,qval=qval) # bvec is a p+1 by nboot matrix. The first row # contains the bootstrap intercepts, the second row # contains the bootstrap values for first predictor, etc. p<-ncol(x) if(p==1)stop("Use qregci when p=1") n<-length(y) np<-p+1 bvec<-t(bvec) semat<-var(bvec[,2:np]) temp<-rqfit(x,y,qval=qval)$coef[2:np] temp<-as.matrix(temp) test<-t(temp)%*%solve(semat)%*%temp test<-test*(n-p)/((n-1)*p) p.value<-1-pf(test,p,n-p) # Determine adjusted critical level, if possible. 
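# The statistic computed above is a quadratic form in the estimated slopes
# using the covariance matrix of the bootstrap slope estimates,
# (n-p)*b'S^{-1}b/((n-1)*p), referred to an F distribution with p and n-p
# degrees of freedom. For n<=60 the nominal level is adjusted below via a
# linear function of n, with coefficients determined by p and alpha.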
adjusted.alpha=alpha if(n<=60){ if(alpha==.1){ if(p==2){ b1<-0-0.001965 b0<-.2179 } if(p==3){ b1<-0-.003 b0<-.2814 } if(p==4){ b1<-0-.0058 b0<-.4478 } if(p==5){ b1<-0-.00896 b0<-.6373 } if(p>=6){ b1<-0-.0112 b0<-.7699 }} if(alpha==.05){ if(p==2){ b1<-0-0.001173 b0<-.1203 } if(p==3){ b1<-0-.00223 b0<-.184 } if(p==4){ b1<-0-.00476 b0<-.3356 } if(p==5){ b1<-0-.0063 b0<-.425 } if(p==6){ b1<-0-.00858 b0<-.5648 }} if(alpha==.025){ if(p==2){ b1<-0-0.00056 b0<-.05875 } if(p==3){ b1<-0-.00149 b0<-.1143 } if(p==4){ b1<-0-.00396 b0<-.2624 } if(p==5){ b1<-0-.00474 b0<-.3097 } if(p==6){ b1<-0-.0064 b0<-.4111 }} if(alpha==.01){ if(p==2){ b1<-0-0.00055 b0<-.043 } if(p==3){ b1<-0-.00044 b0<-.0364 } if(p==4){ b1<-0-.0024 b0<-.1546 } if(p==5){ b1<-0-.00248 b0<-.159 } if(p==6){ b1<-0-.00439 b0<-.2734 }} adjusted.alpha<-b1*n+b0 adjusted.alpha<-max(alpha,adjusted.alpha) } list(test.stat=test,p.value=p.value,adjusted.alpha=adjusted.alpha) } covmba2<-function(x, csteps = 5) { # Perform the median ball algorithm. # # It returns a measure of location and scatter for the # multivariate data in x, which is assumed to have # p>-2 column and n rows. # # This code is based on a very slight modificatiion of code originally # written by David Olive # x<-as.matrix(x) if(!is.matrix(x))stop("x should be a matrix") p <- dim(x)[2] #if(p==1)stop("x should be a matrix with two or more columns of variables") ##get the DGK estimator covs <- var(x) mns <- apply(x, 2, mean) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) # mns <- apply(x[md2 <= medd2, ], 2, mns <- apply(as.matrix(x[md2 <= medd2, ]), 2, mean) covs <- var(x[md2 <= medd2, ]) } covb <- covs mnb <- mns ##get the square root of det(covb) critb <- prod(diag(chol(covb))) ##get the resistant estimator covv <- diag(p) med <- apply(x, 2, median) md2 <- mahalanobis(x, center = med, covv) medd2 <- median(md2) ## get the start # mns <- apply(x[md2 <= medd2, ], 2, mean) mns <- apply(as.matrix(x[md2 <= medd2, ]), 2, mean) covs <- var(x[md2 <= medd2, ]) ## concentrate for(i in 1:csteps) { md2 <- mahalanobis(x, mns, covs) medd2 <- median(md2) # mns <- apply(x[md2 <= medd2, ], 2,mean) mns <- apply(as.matrix(x[md2 <= medd2, ]), 2, mean) covs <- var(x[md2 <= medd2, ]) } crit <- prod(diag(chol(covs))) if(crit < critb) { critb <- crit covb <- covs mnb <- mns } ##scale for better performance at MVN rd2 <- mahalanobis(x, mnb, covb) const <- median(rd2)/(qchisq(0.5, p)) covb <- const * covb list(center = mnb, cov = covb) } rmmcp<-function(x, con = 0, tr = 0.2, alpha = 0.05,dif=T){ # # MCP on trimmed means with FWE controlled with Rom's method # flagcon=F if(!is.matrix(x))x<-matl(x) if(!is.matrix(x))stop("Data must be stored in a matrix or in list mode.") con<-as.matrix(con) J<-ncol(x) xbar<-vector("numeric",J) x<-elimna(x) # Remove missing values nval<-nrow(x) h1<-nrow(x)-2*floor(tr*nrow(x)) df<-h1-1 for(j in 1: J)xbar[j]<-mean(x[,j],tr) if(sum(con^2!=0))CC<-ncol(con) if(sum(con^2)==0)CC<-(J^2-J)/2 ncon<-CC if(alpha==.05){ dvec<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01)dvec<-alpha/c(1:ncon) if(sum(con^2)==0){ flagcon<-T psihat<-matrix(0,CC,5) dimnames(psihat)<-list(NULL,c("Group","Group","psihat","ci.lower","ci.upper")) test<-matrix(NA,CC,6) 
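# The p-values are compared with Rom's sequentially rejective critical
# levels (the vector dvec above): the largest p-value is compared with
# dvec[1], the next largest with dvec[2], and so on.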
dimnames(test)<-list(NULL,c("Group","Group","test","p.value","p.crit","se")) temp1<-0 jcom<-0 for (j in 1:J){ for (k in 1:J){ if (j < k){ jcom<-jcom+1 q1<-(nrow(x)-1)*winvar(x[,j],tr) q2<-(nrow(x)-1)*winvar(x[,k],tr) q3<-(nrow(x)-1)*wincor(x[,j],x[,k],tr)$cov sejk<-sqrt((q1+q2-2*q3)/(h1*(h1-1))) if(!dif){ test[jcom,6]<-sejk test[jcom,3]<-(xbar[j]-xbar[k])/sejk temp1[jcom]<-2 * (1 - pt(abs(test[jcom,3]), df)) test[jcom,4]<-temp1[jcom] psihat[jcom,1]<-j psihat[jcom,2]<-k test[jcom,1]<-j test[jcom,2]<-k psihat[jcom,3]<-(xbar[j]-xbar[k]) } if(dif){ dv<-x[,j]-x[,k] test[jcom,6]<-trimse(dv,tr) temp<-trimci(dv,alpha=alpha/CC,pr=F) test[jcom,3]<-temp$test.stat temp1[jcom]<-temp$p.value test[jcom,4]<-temp1[jcom] psihat[jcom,1]<-j psihat[jcom,2]<-k test[jcom,1]<-j test[jcom,2]<-k psihat[jcom,3]<-mean(dv,tr=tr) psihat[jcom,4]<-temp$ci[1] psihat[jcom,5]<-temp$ci[2] } }}} temp2<-order(0-temp1) zvec<-dvec[1:ncon] sigvec<-(test[temp2]>=zvec) if(sum(sigvec)0){ if(nrow(con)!=ncol(x))warning("The number of groups does not match the number of contrast coefficients.") ncon<-ncol(con) psihat<-matrix(0,ncol(con),4) dimnames(psihat)<-list(NULL,c("con.num","psihat","ci.lower","ci.upper")) test<-matrix(0,ncol(con),5) dimnames(test)<-list(NULL,c("con.num","test","p.value","p.crit","se")) temp1<-NA for (d in 1:ncol(con)){ psihat[d,1]<-d if(!dif){ psihat[d,2]<-sum(con[,d]*xbar) sejk<-0 for(j in 1:J){ for(k in 1:J){ djk<-(nval-1)*wincor(x[,j],x[,k], tr)$cov/(h1*(h1-1)) sejk<-sejk+con[j,d]*con[k,d]*djk }} sejk<-sqrt(sejk) test[d,1]<-d test[d,2]<-sum(con[,d]*xbar)/sejk test[d,5]<-sejk temp1[d]<-2 * (1 - pt(abs(test[d,2]), df)) } if(dif){ for(j in 1:J){ if(j==1)dval<-con[j,d]*x[,j] if(j>1)dval<-dval+con[j,d]*x[,j] } temp1[d]<-trimci(dval,tr=tr,pr=F)$p.value test[d,1]<-d test[d,2]<-trimci(dval,tr=tr,pr=F)$test.stat test[d,5]<-trimse(dval,tr=tr) psihat[d,2]<-mean(dval,tr=tr) }} test[,3]<-temp1 temp2<-order(0-temp1) zvec<-dvec[1:ncon] sigvec<-(test[temp2,3]>=zvec) if(sum(sigvec)0] v2<-vec2[vec2>0] slope<-v1/v2 allvar<-NA for(i in 1:length(slope))allvar[i]<-sc(y-slope[i]*x,...) temp<-order(allvar) coef<-0 coef[2]<-slope[temp[1]] coef[1]<-median(y)-coef[2]*median(x) res<-y-coef[2]*x-coef[1] chk<-abs(res-median(res))/mad(res) xx<-x[chk<=2] yy<-y[chk<=2] temp<-tsreg(xx,yy) list(coef=temp$coef,residuals=temp$res) } gyreg<-function(x,y,rinit=lmsreg,K=2.5){ library(MASS) res<-rinit(x,y)$res res.scale<-abs(res)/mad(res) flag<-(res.scale >=K) i0<-sum(flag) il<-length(y)-i0+1 res.sort<-sort(res.scale) if(i0>0){ dval<-pnorm(res.sort[il:length(y)])-c(il:length(y))/length(y) } if(i0<=0)dval<-0 dval<-max(dval) ndval<-floor(length(y)*dval) if(ndval<0)ndval<-0 iup<-length(y)-ndval rord<-order(res.scale) flag<-rord[1:iup] x=as.matrix(x) temp<-lsfit(x[flag,],y[flag]) list(coef=temp$coef,res=temp$residual) } ancmppb<-function(x1,y1,x2,y2,fr1=1,fr2=1,alpha=.05,pts=NA,est=tmean,nboot=NA, bhop=F,SEED=T,...){ # # Compare two independent groups using the ancova method # with multiple covariates. # No parametric assumption is made about the form of # the regression lines--a running interval smoother is used. 
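# Example (illustrative only): with two covariates,
#   pts=rbind(c(60,100),c(70,120))
# compares the two groups at those two design points, using the observations
# in x1 and x2 that are close to each design point as measured by near3d
# with spans fr1 and fr2.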
# Design points are chosen based on depth of points in x1 if pts=NA # Assume data are in x1 y1 x2 and y2 # if(is.na(pts[1])){ x1<-as.matrix(x1) pts<-ancdes(x1) } nvec=NULL pts<-as.matrix(pts) if(nrow(pts)>=29){ print("WARNING: More than 28 design points") print("Only first 28 are used.") pts<-pts[1:28,] } n1<-1 n2<-1 vecn<-1 mval1<-cov.mve(x1) mval2<-cov.mve(x2) for(i in 1:nrow(pts)){ n1[i]<-length(y1[near3d(x1,pts[i,],fr1,mval1)]) n2[i]<-length(y2[near3d(x2,pts[i,],fr2,mval2)]) } flag<-rep(T,nrow(pts)) for(i in 1:nrow(pts))if(n1[i]<10 || n2[i]<10)flag[i]<-F pts<-pts[flag,] if(sum(flag)==1)pts<-t(as.matrix(pts)) if(sum(flag)==0)stop("No comparable design points found, might increase span.") mat<-matrix(NA,nrow(pts),7) dimnames(mat)<-list(NULL,c("n1","n2","DIF","TEST","se","ci.low","ci.hi")) g1<-list() ip<-nrow(pts) ncom<-0 nc2<-ip con<-matrix(0,nrow=2*ip,ncol=nrow(pts)) for (i in 1:nrow(pts)){ ip<-ip+1 ncom<-ncom+1 nc2<-nc2+1 con[ncom,i]<-1 con[nc2,i]<-0-1 temp<-y1[near3d(x1,pts[i,],fr1,mval1)] g1[[i]]<-temp[!is.na(temp)] nvec[i]=length(g1[[i]]) temp<-y2[near3d(x2,pts[i,],fr2,mval2)] g1[[ip]]<-temp[!is.na(temp)] nvec[ip]=length(g1[[ip]]) if(i==nrow(pts))nvec=matrix(nvec,ncol=2,byrow=F) } mat<-pbmcp(g1,alpha=alpha,nboot=nboot,est=est,con=con,bhop=bhop,SEED=SEED,...) list(points=pts,sample.sizes=nvec,output=mat) } bwrmcp<-function(J,K,x,grp=NA,alpha=.05,bhop=F){ # # Do all pairwise comparisons of # main effects for Factor A and B and all interactions # using a rank-based method that tests for equal distributions. # # A between by within subjects design is assumed. # Levels of Factor A are assumed to be independent and # levels of Factor B are dependent. # # The data are assumed to be stored in x in list mode or in a matrix. # If grp is unspecified, it is assumed x[[1]] contains the data # for the first level of both factors: level 1,1. # x[[2]] is assumed to contain the data for level 1 of the # first factor and level 2 of the second factor: level 1,2 # x[[j+1]] is the data for level 2,1, etc. # If the data are in wrong order, grp can be used to rearrange the # groups. For example, for a two by two design, grp<-c(2,4,3,1) # indicates that the second group corresponds to level 1,1; # group 4 corresponds to level 1,2; group 3 is level 2,1; # and group 1 is level 2,2. # # Missing values are automatically removed. # if(is.list(x))xrem=matl(x) JK <- J * K if(is.matrix(x)){ xrem=x x <- listm(x) } if(!is.na(grp[1])) { yy <- x x<-list() for(j in 1:length(grp)) x[[j]] <- yy[[grp[j]]] } if(!is.list(x)) stop("Data must be stored in list mode or a matrix.") # for(j in 1:JK) { # xx <- x[[j]] # x[[j]] <- xx[!is.na(xx)] # Remove missing values # } # if(JK != length(x))warning("The number of groups does not match the number of contrast coefficients.") for(j in 1:JK){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. 
x[[j]]<-temp } # CC<-(J^2-J)/2 # Determine critical values ncon<-CC*(K^2-K)/2 if(!bhop){ if(alpha==.05){ dvec<-c(.05,.025,.0169,.0127,.0102,.00851,.0073,.00639,.00568,.00511) if(ncon > 10){ avec<-.05/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha==.01){ dvec<-c(.01,.005,.00334,.00251,.00201,.00167,.00143,.00126,.00112,.00101) if(ncon > 10){ avec<-.01/c(11:ncon) dvec<-c(dvec,avec) }} if(alpha != .05 && alpha != .01)dvec<-alpha/c(1:ncon) } if(bhop)dvec<-(ncon-c(1:ncon)+1)*alpha/ncon Fac.A<-matrix(0,CC,5) dimnames(Fac.A)<-list(NULL,c("Level","Level","test.stat","p-value","sig.crit")) mat<-matrix(c(1:JK),ncol=K,byrow=T) ic<-0 for(j in 1:J){ for(jj in 1:J){ if(j < jj){ ic<-ic+1 Fac.A[ic,1]<-j Fac.A[ic,2]<-jj datsub=xrem[,c(mat[j,],mat[jj,])] datsub=elimna(datsub) #temp<-bwrank(2,K,elimna(x[,c(mat[j,],mat[jj,])])) temp<-bwrank(2,K,datsub) Fac.A[ic,3]<-temp$test.A Fac.A[ic,4]<-temp$p.value.A }}} temp2<-order(0-Fac.A[,4]) Fac.A[temp2,5]<-dvec[1:length(temp2)] CCB<-(K^2-K)/2 ic<-0 Fac.B<-matrix(0,CCB,5) dimnames(Fac.B)<-list(NULL,c("Level","Level","test.stat","p-value","sig.crit")) for(k in 1:K){ for(kk in 1:K){ if(k=12]) isub[5]<-max(sub[vecn>=12]) isub[3]<-floor((isub[1]+isub[5])/2) isub[2]<-floor((isub[1]+isub[3])/2) isub[4]<-floor((isub[3]+isub[5])/2) mat<-matrix(NA,5,7) dimnames(mat)<-list(NULL,c("X","n1","n2","DIF","ci.low","ci.hi","p.value")) gv1<-vector("list") for (i in 1:5){ j<-i+5 temp1<-y1[near(x1,x1[isub[i]],fr1)] temp2<-y2[near(x2,x1[isub[i]],fr2)] temp1<-temp1[!is.na(temp1)] temp2<-temp2[!is.na(temp2)] mat[i,1]<-x1[isub[i]] mat[i,2]<-length(temp1) mat[i,3]<-length(temp2) mat[,4]<-runmbo(x1,y1,pts=x1[isub],pyhat=T,plotit=F,SEED=F,est=tmean)- runmbo(x2,y2,pts=x1[isub],pyhat=T,plotit=F,SEED=F,est=median) gv1[[i]]<-temp1 gv1[[j]]<-temp2 } I1<-diag(5) I2<-0-I1 con<-rbind(I1,I2) estmat1<-matrix(nrow=nboot,ncol=length(isub)) estmat2<-matrix(nrow=nboot,ncol=length(isub)) data1<-matrix(sample(length(y1),size=length(y1)*nboot,replace=T),nrow=nboot) data2<-matrix(sample(length(y2),size=length(y2)*nboot,replace=T),nrow=nboot) # for(ib in 1:nboot){ estmat1[ib,]=runmbo(x1[data1[ib,]],y1[data1[ib,]],pts=x1[isub], pyhat=T,plotit=F,SEED=F,est=median) estmat2[ib,]=runmbo(x2[data2[ib,]],y2[data2[ib,]],pts=x1[isub], pyhat=T,plotit=F,SEED=F,est=median) } dif<-(estmat1=.5)stop("Amount of trimming must be less than .5") if(is.list(m))m<-matl(m) if(!is.matrix(m))stop("Data must be stored in a matrix or in list mode.") if(ncol(m)==1){ if(tr<.5)val<-mean(m,tr) } if(ncol(m)>1){ temp<-NA if(ncol(m)!=2){ # Use approximate depth if(fast)temp<-fdepth.for(m,pr=F,cop=cop) if(!fast){ if(dop==1)temp<-fdepth(m,plotit=F,cop=cop) if(dop==2)temp<-fdepthv2(m) }} # Use exact depth if ncol=2 if(ncol(m)==2){ if(fast)temp<-depth2.for(m,pr=F,plotit=F) if(!fast){ for(i in 1:nrow(m)) temp[i]<-depth(m[i,1],m[i,2],m) }} mdep<-max(temp) flag<-(temp==mdep) flag2<-(temp>=tr) if(sum(flag2)==0)stop("Trimmed all of the data") if(sum(flag2)==1){ if(pr)print("Warning: Trimmed all but one point") val<-0 } if(sum(flag2)>1)val<-var(m[flag2,]) } if(pr && fast)print(val) val } medr<-function(x,est=median,alpha=.05,nboot=500,grp=NA,op=1,MM=F,cop=3,pr=T, SEED=T,...){ # # Test the hypothesis that the distribution for each pairwise # difference has a measure of location = 0 # By default, the median estimator is used # # Independent groups are assumed. # # The data are assumed to be stored in x in list mode or in a matrix. # If stored in list mode, # x[[1]] contains the data for the first group, x[[2]] the data # for the second group, etc. 
Length(x)=the number of groups = J, say. # If stored in a matrix, columns correspond to groups. # # By default, all pairwise differences are used, but contrasts # can be specified with the argument con. # The columns of con indicate the contrast coefficients. # Con should have J rows, J=number of groups. # For example, con[,1]=c(1,1,-1,-1,0,0) and con[,2]=c(,1,-1,0,0,1,-1) # will test two contrasts: (1) the sum of the first # two measures of location is # equal to the sum of the second two, and (2) the difference between # the first two is equal to the difference between the # measures of location for groups 5 and 6. # # The default number of bootstrap samples is nboot=500 # # op controls how depth is measured # op=1, Mahalanobis # op=2, Mahalanobis based on MCD covariance matrix # op=3, Projection distance # op=4, Projection distance using FORTRAN version # # for arguments MM and cop, see pdis. # if(is.matrix(x)){ xx<-list() for(i in 1:ncol(x)){ xx[[i]]<-x[,i] } x<-xx } if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") if(!is.na(grp)){ # Only analyze specified groups. xx<-list() for(i in 1:length(grp))xx[[i]]<-x[[grp[1]]] x<-xx } J<-length(x) mvec<-NA for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. x[[j]]<-temp mvec[j]<-est(temp,...) } Jm<-J-1 d<-(J^2-J)/2 data<-list() bvec<-matrix(NA,ncol=d,nrow=nboot) if(SEED)set.seed(2) # set seed of random number generator so that # results can be duplicated. if(pr)print("Taking bootstrap samples. Please wait.") for(it in 1:nboot){ for(j in 1:J)data[[j]]<-sample(x[[j]],size=length(x[[j]]),replace=T) dval<-0 for(j in 1:J){ for(k in 1:J){ if(j=dv[1:nboot])/nboot if(op==4)print(sig.level) list(sig.level=sig.level,output=output) } medind<-function(x,y,qval=.5,nboot=1000,com.pval=F,SEED=T,alpha=.05,pr=T, xout=F,outfun=out,chk.table=F,make.table=F,...){ # # Test the hypothesis that the regression surface is a flat # horizontal plane. # The method is based on a modification of a method derived by # He and Zhu 2003, JASA, 98, 1013-1022. # Here, resampling is avoided using approximate critical values if # com.pval=F # # critical values are available for 10<=n<=400, p=1,...,8 and # quantiles # qval=.25,.5, .75. # # To get a p-value, via simulations, set com.pval=T # nboot is number of simulations used to determine the p-value. # # Note: the arguments chk.table and make.table should not be used # as yet. Some bugs need be work out. # if(pr){ if(!com.pval)print("To get a p-value, set com.pval=T") print("Reject if the test statistic exceeds the critical value") } store.it=F x<-as.matrix(x) p<-ncol(x) pp1<-p+1 p.val<-NULL crit.val<-NULL yx<-elimna(cbind(y,x)) #Eliminate missing values. 
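# The test statistic is the largest eigenvalue of a quadratic form based on
# the quantile score psi (equal to qval when a residual is positive and
# qval-1 otherwise) accumulated over the design points; large values
# indicate that the qval quantile regression surface is not flat. The
# resmat matrices below contain critical values (obtained by simulation),
# one matrix per number of predictors (p=1,...,8), with rows corresponding
# to n=10,20,30,50,100,200,400 and columns to alpha=.1,.05,.025,.01; for
# other values of n the critical value is interpolated as a linear function
# of 1/n^1.5.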
y<-yx[,1] x<-yx[,2:pp1] x<-as.matrix(x) if(xout){ flag<-outfun(x,...)$keep x<-x[flag,] y<-y[flag] } n<-length(y) if(qval==.5){ resmat1=matrix(c( 0.0339384580, 0.044080032, 0.050923441, 0.064172557, 0.0153224731, 0.021007108, 0.027687963, 0.032785044, 0.0106482053, 0.014777728, 0.018249546, 0.023638611, 0.0066190573, 0.009078091, 0.011690825, 0.014543009, 0.0031558563, 0.004374515, 0.005519069, 0.007212951, 0.0015448987, 0.002231473, 0.002748314, 0.003725916, 0.0007724197, 0.001021767, 0.001370776, 0.001818037),ncol=4,nrow=7,byrow=T) if(make.table) write(c(10,1,.5,resmat1[1,],20,1,.5,resmat1[2,]),"medind.crit",ncolumns=7) resmat2=matrix(c( 0.052847794, 0.061918744, 0.071346969, 0.079163419, 0.021103277, 0.027198076, 0.031926052, 0.035083610, 0.013720585, 0.018454145, 0.022177381, 0.026051716, 0.008389969, 0.010590374, 0.012169233, 0.015346065, 0.004261627, 0.005514060, 0.007132021, 0.008416836, 0.001894753, 0.002416311, 0.003085230, 0.003924706, 0.001045346, 0.001347837, 0.001579373, 0.001864344),ncol=4,nrow=7,byrow=T) resmat3=matrix(c( 0.071555715, 0.082937665, 0.089554679, 0.097538044, 0.031060795, 0.035798539, 0.043862556, 0.053712151, 0.019503635, 0.023776479, 0.027180121, 0.030991367, 0.011030001, 0.013419347, 0.015557409, 0.017979524, 0.005634478, 0.006804788, 0.007878358, 0.008807657, 0.002552182, 0.003603778, 0.004275965, 0.005021989, 0.001251044, 0.001531919, 0.001800608, 0.002037870),ncol=4,nrow=7,byrow=T) resmat4=matrix(c( 0.093267532, 0.101584002, 0.108733965, 0.118340448, 0.038677863, 0.045519806, 0.051402903, 0.060097046, 0.024205231, 0.029360145, 0.034267265, 0.039381482, 0.013739157, 0.015856343, 0.018065898, 0.019956084, 0.006467562, 0.007781030, 0.009037972, 0.010127143, 0.003197162, 0.003933525, 0.004656625, 0.005929469, 0.001652690, 0.001926060, 0.002363874, 0.002657071),ncol=4,nrow=7,byrow=T) resmat5=matrix(c( 0.117216934, 0.124714114, 0.129458602, 0.136456163, 0.048838630, 0.055608712, 0.060580045, 0.067943676, 0.030594644, 0.035003872, 0.040433885, 0.047648696, 0.016940240, 0.019527491, 0.022047442, 0.025313443, 0.008053039, 0.009778574, 0.011490394, 0.013383628, 0.003760567, 0.004376294, 0.005097890, 0.005866240, 0.001894616, 0.002253522, 0.002612405, 0.002938808),ncol=4,nrow=7,byrow=T) resmat6=matrix(c( 0.136961531, 0.144120225, 0.149003907, 0.152667432, 0.055909481, 0.062627211, 0.069978086, 0.081189957, 0.034634825, 0.040740587, 0.044161376, 0.047722045, 0.020165417, 0.023074738, 0.025881208, 0.028479913, 0.009436297, 0.011246968, 0.013220963, 0.015100546, 0.004644596, 0.005334418, 0.006040595, 0.007237195, 0.002277590, 0.002635712, 0.002997398, 0.003669488),ncol=4,nrow=7,byrow=T) resmat7=matrix(c( 0.156184672, 0.163226643, 0.171754686, 0.177142753, 0.070117003, 0.077052773, 0.082728047, 0.090410797, 0.041774517, 0.047379662, 0.053101833, 0.057674454, 0.023384451, 0.026014421, 0.029609042, 0.032619018, 0.010856382, 0.012567043, 0.013747870, 0.016257014, 0.005164004, 0.006131755, 0.006868101, 0.008351046, 0.002537642, 0.003044154, 0.003623654, 0.003974469),ncol=4,nrow=7,byrow=T) resmat8=matrix(c( 0.178399742, 0.180006714, 0.193799396, 0.199585892, 0.078032767, 0.085624186, 0.091511226, 0.102491785, 0.045997886, 0.052181615, 0.057362163, 0.062630424, 0.025895739, 0.029733034, 0.033764463, 0.037873655, 0.012195876, 0.013663248, 0.015487587, 0.017717864, 0.005892418, 0.006876488, 0.007893475, 0.008520783, 0.002839731, 0.003243909, 0.003738571, 0.004124057),ncol=4,nrow=7,byrow=T) crit5=array(cbind(resmat1,resmat2,resmat3,resmat4,resmat5,resmat6,resmat7, 
resmat8),c(7,4,8)) flag=T crit.val=NULL if(p > 8)flag=F if(n<10 || n>=400)flag=F aval<-c(.1,.05,.025,.01) aokay<-duplicated(c(alpha,aval)) if(sum(aokay)==0)flag=F if(flag){ nalpha=c(0:4) asel=c(0,aval) ialpha=nalpha[aokay] critit=crit5[,ialpha,p] nvec<-c(10,20,30,50,100,200,400) nval<-duplicated(c(n,nvec)) nval<-nval[2:8] if(sum(nval)>0)crit.val<-critit[nval] loc<-rank(c(n,nvec)) xx<-c(1/nvec[loc[1]-1]^1.5,1/nvec[loc[1]]^1.5) yy<-c(critit[loc[1]-1],critit[loc[1]]) icoef<-tsp1reg(xx,yy)$coef crit.val<-icoef[1]+icoef[2]/n^1.5 }} mqval<-min(c(qval,1-qval)) if(mqval==.25){ resmat1=matrix(c( 0.029933486, 0.0395983678, 0.054087714, 0.062961453, 0.011122294, 0.0149893431, 0.018154062, 0.022685244, 0.009207200, 0.0113020766, 0.014872309, 0.019930730, 0.004824185, 0.0070402246, 0.010356886, 0.013176896, 0.002370379, 0.0033146605, 0.004428004, 0.005122988, 0.001106460, 0.0016110185, 0.001984450, 0.002650256, 0.000516646, 0.0006796144, 0.000868751, 0.001202042),ncol=4,nrow=7,byrow=T) resmat2=matrix(c( 0.0448417783, 0.0602598211, 0.066001091, 0.087040667, 0.0173410522, 0.0224713157, 0.027370822, 0.033435727, 0.0121205549, 0.0150409465, 0.018938516, 0.022643559, 0.0064894201, 0.0084611518, 0.010700320, 0.013232000, 0.0029734778, 0.0040641310, 0.004911086, 0.005769038, 0.0015149104, 0.0020584993, 0.002582982, 0.003114029, 0.0007984207, 0.0009929547, 0.001182739, 0.001398774),ncol=4,nrow=7,byrow=T) resmat3=matrix(c( 0.0636530860, 0.072974943, 0.083840562, 0.097222407, 0.0216586978, 0.027436566, 0.031875356, 0.036830302, 0.0152898678, 0.018964066, 0.021728817, 0.028959751, 0.0083568493, 0.010071525, 0.012712862, 0.015254576, 0.0039033578, 0.004764140, 0.005577071, 0.006660322, 0.0019139215, 0.002343152, 0.002833612, 0.003465269, 0.0009598105, 0.001146689, 0.001355930, 0.001547572),ncol=4,nrow=7,byrow=T) resmat4=matrix(c( 0.085071252, 0.095947936, 0.104197413, 0.118449765, 0.029503024, 0.034198704, 0.039543410, 0.045043759, 0.019203266, 0.022768842, 0.026886843, 0.033481535, 0.011440493, 0.013555017, 0.016138970, 0.018297815, 0.004863139, 0.005756305, 0.007385239, 0.009114958, 0.002635144, 0.003111160, 0.003769051, 0.004215897, 0.001188837, 0.001435179, 0.001727871, 0.001956372),ncol=4,nrow=7,byrow=T) resmat5=matrix(c( 0.102893512, 0.114258558, 0.122545016, 0.130222265, 0.036733497, 0.042504996, 0.048663576, 0.055456582, 0.024192946, 0.028805967, 0.032924489, 0.038209545, 0.012663224, 0.014635216, 0.017275594, 0.019736410, 0.006105572, 0.007310803, 0.008960242, 0.009745320, 0.003067163, 0.003614637, 0.003997615, 0.004812373, 0.001441008, 0.001732819, 0.002078651, 0.002307551),ncol=4,nrow=7,byrow=T) resmat6=matrix(c( 0.117642769, 0.126566104, 0.133106804, 0.142280074, 0.044309420, 0.049731991, 0.053912739, 0.060512997, 0.028607224, 0.033826020, 0.038616476, 0.043546500, 0.015445120, 0.017557181, 0.020040720, 0.022747707, 0.007334749, 0.008406468, 0.009392098, 0.010919651, 0.003352200, 0.003814582, 0.004380562, 0.005252154, 0.001703698, 0.002001713, 0.002338651, 0.002772864),ncol=4,nrow=7,byrow=T) resmat7=matrix(c( 0.106573121, 0.113058950, 0.117388191, 0.121286795, 0.052170054, 0.058363322, 0.064733684, 0.069749344, 0.030696897, 0.035506926, 0.039265698, 0.044437674, 0.016737307, 0.019605734, 0.021253610, 0.022922988, 0.007767232, 0.009231789, 0.010340874, 0.011471110, 0.003998261, 0.004590177, 0.005506926, 0.006217415, 0.001903372, 0.002174748, 0.002519055, 0.002858655),ncol=4,nrow=7,byrow=T) resmat8=matrix(c( 0.119571179, 0.126977461, 0.130120853, 0.133258294, 0.059499563, 0.067185338, 0.071283297, 
0.079430577, 0.034310968, 0.039827130, 0.044451690, 0.048512464, 0.018599530, 0.021093909, 0.023273085, 0.027471116, 0.009135712, 0.010901687, 0.012288682, 0.013729545, 0.004382249, 0.005191810, 0.005598429, 0.006484433, 0.002196973, 0.002525918, 0.002818550, 0.003242426),ncol=4,nrow=7,byrow=T) crit5=array(cbind(resmat1,resmat2,resmat3,resmat4,resmat5,resmat6,resmat7, resmat8),c(7,4,8)) flag=T crit.val=NULL if(p > 8)flag=F if(n<10 || n>=400)flag=F aval<-c(.1,.05,.025,.01) aokay<-duplicated(c(alpha,aval)) if(sum(aokay)==0)flag=F if(flag){ nalpha=c(0:4) asel=c(0,aval) ialpha=nalpha[aokay] critit=crit5[,ialpha,p] nvec<-c(10,20,30,50,100,200,400) nval<-duplicated(c(n,nvec)) nval<-nval[2:8] if(sum(nval)>0)crit.val<-critit[nval,p] loc<-rank(c(n,nvec)) xx<-c(1/nvec[loc[1]-1]^1.5,1/nvec[loc[1]]^1.5) yy<-c(critit[loc[1]-1],critit[loc[1]]) icoef<-tsp1reg(xx,yy)$coef crit.val<-icoef[1]+icoef[2]/n^1.5 }} if(is.null(crit.val))com.pval=T # no critical value found, so a p-value will be computed # the code for checking the file medind.crit, which appears # next, is not working yet. if(is.null(crit.val)){ # no critical value found yet, check file medind.crit if(chk.table){ z<-read.table("medind.crit") nz1<-nrow(z)+1 flag1<-as.matrix(duplicated(c(n,z[,1]))) flag2<-as.matrix(duplicated(c(p,z[,2]))) flag3<-as.matrix(duplicated(c(qval,z[,3]))) zz<-cbind(flag1,flag2,flag3) zz<-zz[2:nz1,] find.row<-apply(zz,1,sum) if(max(find.row)==3){ ir<-order(find.row) nir<-length(ir) ir<-ir[nir] critvals<-z[ir,4:7] if(pr){print("The .1, .05, .025 and .01 critical values are:") print(critvals) } crit.val<-critvals[2] }} if(max(find.row)!=3){ store.it=T if(!com.pval){ print("Critical values not available, will set com.pval=T") print("and compute them") com.pval<-T }}} gdot<-cbind(rep(1,n),x) gdot<-ortho(gdot) x<-gdot[,2:pp1] x<-as.matrix(x) coef<-NULL if(qval==.5)coef<-median(y) if(qval==.25)coef<-idealf(y)$ql if(qval==.75)coef<-idealf(y)$qu if(is.null(coef))coef<-qest(y,q=qval) res<-y-coef psi<-NA psi<-ifelse(res>0,qval,qval-1) rnmat<-matrix(0,nrow=n,ncol=pp1) ran.mat<-apply(x,2,rank) flagvec<-apply(ran.mat,1,max) for(j in 1:n){ flag<-ifelse(flagvec<=flagvec[j],T,F) flag<-as.numeric(flag) rnmat[j,]<-apply(flag*psi*gdot,2,sum) } rnmat<-rnmat/sqrt(n) temp<-matrix(0,pp1,pp1) for(i in 1:n)temp<-temp+rnmat[i,]%*%t(rnmat[i,]) temp<-temp/n test<-max(eigen(temp)$values) if(com.pval){ if(SEED)set.seed(2) p.val<-0 rem<-0 for(i in 1:nboot){ yboot<-rnorm(n) if(p==1)xboot<-rnorm(n) if(p>1)xboot<-rmul(n,p=p) temp3<-medindsub(x,yboot,qval=qval) if(test>=temp3)p.val<-p.val+1 rem[i]<-temp3 } ic10<-round(.9*nboot) ic05<-round(.95*nboot) ic025<-round(.975*nboot) ic001<-round(.99*nboot) rem<-sort(rem) p.val<-1-p.val/nboot # now remember the critical values by storing them in "medind.crit" if(store.it) write(c(n,p,qval,rem[ic10],rem[ic05],rem[ic025],rem[ic001]),"medind.crit", append=T,ncolumns=7) print("The .1, .05, .025 and .001 critical values are:") print(c(rem[ic10],rem[ic05],rem[ic025],rem[ic001])) crit.val<-rem[ic05] } names(crit.val)="" list(test.stat=test,crit.value=crit.val,p.value=p.val) } medindsub<-function(x,y,qval=.5){ # x<-as.matrix(x) n<-length(y) p<-ncol(x) pp1<-p+1 tvec<-c(qval,0-qval,1-qval,qval-1) pval<-c((1-qval)/2,(1-qval)/2,qval/2,qval/2) gdot<-cbind(rep(1,n),x) gdot<-ortho(gdot) x<-gdot[,2:pp1] x<-as.matrix(x) if(qval==.5)coef<-median(y) if(qval!=.5)coef<-qest(y) res<-y-coef psi<-NA psi<-ifelse(res>0,qval,qval-1) rnmat<-matrix(0,nrow=n,ncol=pp1) ran.mat<-apply(x,2,rank) flagvec<-apply(ran.mat,1,max) for(j in 1:n){ 
#flag<-ifelse(flagvec<=flagvec[j],T,F) flag<-ifelse(flagvec>=flagvec[j],T,F) rnmat[j,]<-apply(flag*psi*gdot,2,sum) } rnmat<-rnmat/sqrt(n) temp<-matrix(0,pp1,pp1) for(i in 1:n)temp<-temp+rnmat[i,]%*%t(rnmat[i,]) temp<-temp/n test<-max(eigen(temp)$values) test } linplot<-function(x,con=0,plotfun=akerd,nboot=800,plotit=T,pyhat=F,...){ # # plot distribtion of the linear contrast # c_1X_2+c_2X_2+... # # con contains contrast coefficients. If not specified, # con<-c(1,1,...,1) # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") J<-length(x) tempn<-0 mvec<-NA for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. tempn[j]<-length(temp) x[[j]]<-temp } Jm<-J-1 # # Determine contrast matrix # If not specified, assume distribution of the sum is to be plotted # if(sum(con^2)==0)con<-matrix(1,J,1) bvec<-matrix(NA,nrow=J,ncol=nboot) for(j in 1:J){ data<-matrix(sample(x[[j]],size=nboot,replace=T),nrow=nboot) bvec[j,]<-data } bcon<-t(con)%*%bvec #ncon by nboot matrix bcon<-as.vector(bcon) dval<-plotfun(bcon,pyhat=pyhat,...) dval } lin2plot<-function(x,con,op=4,nboot=800,plotit=T,pyhat=F){ # # plot two distribtions. # The first is the distribtion of the linear contrast # c_1X_2+c_2X_2+... c_i>0 # and the second is the distribution of c_1X_2+c_2X_2+... c_i<0 # # con contains contrast coefficients. If not specified, # function terminates. # # if(is.matrix(x))x<-listm(x) if(!is.list(x))stop("Data must be stored in list mode or in matrix mode.") J<-length(x) if(J != length(con)){ stop("Number of contrast coefficients must equal the number of groups") } for(j in 1:J){ temp<-x[[j]] temp<-temp[!is.na(temp)] # Remove missing values. x[[j]]<-temp } # # Determine contrast matrix for positive contrast coefficients # flag<-(con<0) con1<-con con1[flag]<-0 # Determine contrast matrix for negative contrast coefficients flag<-(con>0) con2<-con con2[flag]<-0 bvec<-matrix(NA,nrow=J,ncol=nboot) for(j in 1:J){ data<-matrix(sample(x[[j]],size=nboot,replace=T),nrow=nboot) bvec[j,]<-data } bcon1<-t(con1)%*%bvec bcon2<-t(con2)%*%bvec bcon1<-as.vector(bcon1) bcon2<-as.vector(bcon2) fval<-g2plot(bcon1,bcon2,op=op,rval=15,fr=0.8,aval=0.5,xlab="X",ylab="") fval } adrunl<-function(x,y,est=tmean,iter=10,pyhat=F,plotit=T,fr=.8, theta=50,phi=25,expand=.5,scale=F,zscale=T,xout=F,outfun=out,...){ # # additive model based on running interval smoother # and backfitting algorithm # m<-elimna(cbind(x,y)) x<-as.matrix(x) p<-ncol(x) if(p==1)val<-lplot(x[,1],y,pyhat=T,plotit=plotit,span=fr)$yhat.values if(p>1){ library(MASS) library(akima) np<-p+1 x<-m[,1:p] y<-m[,np] fhat<-matrix(NA,ncol=p,nrow=length(y)) fhat.old<-matrix(NA,ncol=p,nrow=length(y)) res<-matrix(NA,ncol=np,nrow=length(y)) dif<-1 for(i in 1:p) fhat.old[,i]<-lplot(x[,i],y,pyhat=T,plotit=F,span=fr)$yhat.values eval<-NA for(it in 1:iter){ for(ip in 1:p){ res[,ip]<-y for(ip2 in 1:p){ if(ip2 != ip)res[,ip]<-res[,ip]-fhat.old[,ip2] } fhat[,ip]<-lplot(x[,ip],res[,ip],pyhat=T,plotit=F,span=fr)$yhat.values } eval[it]<-sum(abs(fhat/sqrt(sum(fhat^2))-fhat.old/sqrt(sum(fhat.old^2)))) if(it > 1){ itm<-it-1 dif<-abs(eval[it]-eval[itm]) } fhat.old<-fhat if(dif<.01)break } val<-apply(fhat,1,sum) aval<-est(y-val,...) 
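# add the location estimate (est) of the residuals back to the sum of the
# component fits so that the fitted values are centered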
val<-val+aval if(plotit && p==2){ fitr<-val iout<-c(1:length(fitr)) nm1<-length(fitr)-1 for(i in 1:nm1){ ip1<-i+1 for(k in ip1:length(fitr))if(sum(x[i,]==x[k,])==2)iout[k]<-0 } fitr<-fitr[iout>=1] # Eliminate duplicate points in the x-y plane # This is necessary when doing three dimensional plots # with the S-PLUS function interp mkeep<-x[iout>=1,] fitr<-interp(mkeep[,1],mkeep[,2],fitr) persp(fitr,theta=theta,phi=phi,xlab="x1",ylab="x2",zlab="",expand=expand, scale=scale) }} if(!pyhat)val<-"Done" val } Rpca<-function(x,p=ncol(x)-1,locfun=llocv2,loc.val=NULL,iter=100,SCORES=F, gvar.fun=cov.mba,SEED=T,...){ # # Robust PCA using random orthogonal matrices and # robust generalized variance method # # locfun, by default, use the marginal medians # alternatives are mcd, tauloc, spat,... # if(SEED)set.seed(2) x<-elimna(x) n<-nrow(x) m<-ncol(x) if(is.null(loc.val))info<-locfun(x,...)$center if(!is.null(loc.val))info<-loc.val for(i in 1:n)x[i,]<-x[i,]-info vals<-NA z<-matrix(nrow=n,ncol=p) bval<-array(NA,c(p,m,iter)) for(it in 1:iter){ B<-matrix(runif(p*m),nrow=p,ncol=m) B <- t(ortho(t(B))) # so rows are orthogonal bval[,,it]<-B for(i in 1:n)z[i,]<-B%*%as.matrix(x[i,]) #vals[it]<-gvar(z) vals[it]<-gvarg(z,var.fun=gvar.fun) } iord<-order(vals) Bop<-0-bval[,,iord[iter]] zval<-NULL if(SCORES){ for(i in 1:n)z[i,]<-Bop%*%as.matrix(x[i,]) zval<-z } list(B=Bop,gen.var=vals[iord[iter]],scores=zval) } Rsq<-function(x,y){ res=lsfit(x,y)$residuals yhat=y-res rsq=var(yhat)/var(y) rsq } ols<-function(x,y,xout=F,outfun=out,plotit=T){ # # Performs OLS regression calling built-in R or S+ funtions. # # xout=T will eliminate any leverage points (outliers among x values) # if one predictor, # plotit=T will plot the points and the regression line # m<-elimna(cbind(x,y)) x<-as.matrix(x) p<-ncol(x) pp<-p+1 x<-m[,1:p] y<-m[,pp] if(xout){ m<-cbind(x,y) flag<-outfun(x,plotit=F)$keep m<-m[flag,] x<-m[,1:p] y<-m[,pp] } temp<-summary(lm(y~x)) coef<-temp[4]$coefficients if(p==1){ if(plotit){ plot(x,y) abline(coef[,1]) }} Ftest<-temp[10]$fstatistic Ftest.p.value<-1-pf(Ftest[1],Ftest[2],Ftest[3]) Rval=Rsq(x,y) list(coef=coef,Ftest.p.value=Ftest.p.value,R.squared=Rval) } olstest<-function(x,y,nboot=500,SEED=T,RAD=T,xout=F,outfun=out){ # # Test the hypothesis that all OLS slopes are zero. # Heteroscedasticity is allowed. # # RAD=T: use Rademacher function to generate wild bootstrap values. # RAD=F, use standardized uniform distribution. # if(SEED)set.seed(2) m<-elimna(cbind(x,y)) x<-as.matrix(x) p<-ncol(x) pp<-p+1 x<-m[,1:p] y<-m[,pp] if(xout){ m<-cbind(x,y) flag<-outfun(x,plotit=F)$keep m<-m[flag,] x<-m[,1:p] y<-m[,pp] } x<-as.matrix(x) temp<-lsfit(x,y) yhat<-mean(y) res<-y-yhat test<-sum(temp$coef[2:pp]^2) print("Taking bootstrap sample, please wait.") if(RAD)data<-matrix(ifelse(rbinom(length(y)*nboot,1,0.5)==1,-1,1),nrow=nboot) if(!RAD){ data<-matrix(runif(length(y)*nboot),nrow=nboot)# data<-(data-.5)*sqrt(12) # standardize the random numbers. } rvalb<-apply(data,1,olstests1,yhat,res,x) p.val<-sum(rvalb>=test)/nboot list(p.value=p.val) } qrchk<-function(x,y,qval=.5,nboot=1000,com.pval=F,SEED=T,alpha=.05,pr=T, xout=F,outfun=out,chk.table=F,...){ # # Test of a linear fit based on quantile regression # The method stems from He and Zhu 2003, JASA, 98, 1013-1022. # Here, resampling is avoided using approximate critical values if # com.pval=F # # To get a p-value, via simulations, set com.pval=T # nboot is number of simulations used to determine p-value. 
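# The test statistic computed below is the largest eigenvalue of the averaged
# outer products of cusum-type processes formed from the signs of the quantile
# regression residuals; large values indicate an inadequate linear fit.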
# Execution time can be quite high # # This function quickly determines .1, .05, .025 and .01 # critical values for n<=400 and p<=6 (p= number of predictors) # and when dealing with the .5 quantile. # Otherwise, critical values are determined via simulations, which # can have high execution time. # # But, once critical values are determined for a given n, p and # quantile qval, the function will remember these values and use them # in the future. They are stored in a file called qrchk.crit # Currently, however, when you source the Rallfun files, these values # will be lost. You might save the file qrchk.crit in another file, # source Rallfun, then copy the save file back to qrchk.crit # if(pr){ if(!com.pval)print("To get a p-value, set com.pval=T") print("Reject if test statistic is >= critical value") } x<-as.matrix(x) p<-ncol(x) pp1<-p+1 yx<-elimna(cbind(y,x)) #Eliminate missing values. y<-yx[,1] x<-yx[,2:pp1] store.it=F x<-as.matrix(x) p.val<-NULL crit.val<-NULL x<-as.matrix(x) if(xout){ flag<-outfun(x,...)$keep x<-x[flag,] y<-y[flag] } # shift the marginal x values so that the test statistic is # invariant under changes in location n<-length(y) #x=x-matrix(rep(loc,n),ncol=length(loc),byrow=T) x=standm(x) if(p<=6){ if(qval==.5){ aval<-c(.1,.05,.025,.01) aokay<-duplicated(c(alpha,aval)) aokay<-sum(aokay) if(aokay>0){ crit10<-matrix(c(.0254773,.008372,.00463254,.0023586,.000959315,.00042248, .00020069, .039728,.012163,.0069332,.0036521,.001571,.0006882, .0003621, .055215,.0173357,.009427,.004581,.0021378,.00093787,.00045287, .075832,.0228556,.0118571,.005924,.00252957,.0011593,.00056706, .103135,.0298896,.0151193,.0073057,.00305456,.0014430,.000690435, .12977,.03891,.018989,.009053,.0036326,.001617,.000781457),ncol=6,nrow=7) crit05<-matrix(c(.031494,.010257,.00626,.00303523,.0012993,.000562247, .00025972, .046296,.015066,.00885556,.0045485,.0110904,.00086946,.000452978, .063368,.0207096546,.010699,.005341,.0025426,.0011305,.000539873, .085461,.027256,.014067,.0071169,.002954,.0013671,.000660338, .11055,.03523,.017511,.0084263,.0036533,.0016338,.00081289, .13692,.043843,.0222425,.0102265,.004283,.0019,.000907241),ncol=6,nrow=7) crit025<-matrix(c(.0361936,.012518,.007296,.0036084,.00172436,.000725365, .000327776, .05315,.017593,.0102389,.0055043,.00227459,.0010062,.000523526, .07214,.023944,.013689,.0060686,.0028378,.00136379,.000635645, .093578,.0293223,.0156754,.0086059,.0035195,.001694,.00074467, .118414,.03885,.0201468,.0094298,.0040263,.00182437,.000916557, .14271,.047745,.0253974,.011385,.004725,.00207588,.0010191),ncol=6,nrow=7) crit01<-matrix(c(.0414762,.0146553,.0098428,.0045274,.00219345,.00096244, .000443827, .058666,.020007,.01129658,.0063092,.002796,.0011364,.000628054, .079446,.0267958,.015428,.0071267,.0034163,.0015876,.000734865, .102736,.0357572,.017786,.0093682,.0042367,.0019717,.000868506, .125356,.041411,.0234916,.0106895,.0047028,.0020759,.00101052, .14837,.053246,.027759,.012723,.00528,.002437,.00116065),ncol=6,nrow=7) if(alpha==.1)critit<-crit10 if(alpha==.05)critit<-crit05 if(alpha==.025)critit<-crit025 if(alpha==.01)critit<-crit01 nvec<-c(10,20,30,50,100,200,400) nval<-duplicated(c(n,nvec)) nval<-nval[2:7] if(sum(nval)>0)crit.val<-critit[nval,p] if(is.null(crit.val)){ if(n<=400){ loc<-rank(c(n,nvec)) xx<-c(1/nvec[loc[1]-1]^1.5,1/nvec[loc[1]]^1.5) yy<-c(critit[loc[1]-1,p],critit[loc[1],p]) } icoef<-lsfit(xx,yy)$coef crit.val<-icoef[1]+icoef[2]/n^1.5 }}}} if(is.null(crit.val) && !chk.table){ print("To check file qrchk.crit for a critical value") print("set 
chk.table=T") } if(is.null(crit.val) && chk.table){ # no critical value found yet, check file qrchk.crit (if chk.table=T) z<-read.table("qrchk.crit") nz1<-nrow(z)+1 flag1<-as.matrix(duplicated(c(n,z[,1]))) flag2<-as.matrix(duplicated(c(p,z[,2]))) flag3<-as.matrix(duplicated(c(qval,z[,3]))) zz<-cbind(flag1,flag2,flag3) zz<-zz[2:nz1,] find.row<-apply(zz,1,sum) if(max(find.row)==3){ ir<-order(find.row) nir<-length(ir) ir<-ir[nir] critvals<-z[ir,4:7] if(pr){print("The .1, .05, .025 and .01 critical values are:") print(critvals) } crit.val<-critvals[2] } if(max(find.row)!=3){ store.it=T if(!com.pval){ print("Critical values not available, will set com.pval=T") print("and compute them") com.pval<-T }}} gdot<-cbind(rep(1,n),x) gdot<-ortho(gdot) x<-gdot[,2:pp1] x<-as.matrix(x) temp<-rqfit(x,y,qval=qval,res=T) coef<-temp$coef psi<-NA psi<-ifelse(temp$residuals>0,qval,qval-1) rnmat<-matrix(0,nrow=n,ncol=pp1) ran.mat<-apply(x,2,rank) flagvec<-apply(ran.mat,1,max) for(j in 1:n){ flag<-ifelse(flagvec<=flagvec[j],T,F) #flag<-ifelse(flagvec>=flagvec[j],T,F) flag<-as.numeric(flag) rnmat[j,]<-apply(flag*psi*gdot,2,sum) } rnmat<-rnmat/sqrt(n) temp<-matrix(0,pp1,pp1) for(i in 1:n)temp<-temp+rnmat[i,]%*%t(rnmat[i,]) temp<-temp/n test<-max(eigen(temp)$values) if(com.pval){ if(SEED)set.seed(2) p.val<-0 rem<-0 for(i in 1:nboot){ yboot<-rnorm(n) if(p==1)xboot<-rnorm(n) if(p>1)xboot<-rmul(n,p=p) #temp3<-qrchksub(xboot,yboot,qval=qval) temp3<-qrchkv2(xboot,yboot,qval=qval) if(test>=temp3)p.val<-p.val+1 rem[i]<-temp3 } ic10<-round(.9*nboot) ic05<-round(.95*nboot) ic025<-round(.975*nboot) ic001<-round(.99*nboot) rem<-sort(rem) p.val<-1-p.val/nboot # now remember the critical values by storing them in "qrchk.crit" if(store.it) write(c(n,p,qval,rem[ic10],rem[ic05],rem[ic025],rem[ic001]),"qrchk.crit", append=T,ncolumns=7) print("The .1, .05, .025 and .001 critical values are:") print(c(rem[ic10],rem[ic05],rem[ic025],rem[ic001])) crit.val<-rem[ic05] } list(test.stat=test,crit.value=crit.val,p.value=p.val) } qrchkv2<-function(x,y,qval=.5,...){ # # Test of a linear fit based on quantile regression # The method stems from He and Zhu 2003, JASA, 98, 1013-1022. # Here, resampling is avoided using approximate critical values if # com.pval=F # # To get a p-value, via simulations, set com.pval=T # nboot is number of simulations used to determine p-value. # Execution time can be quite high # # This function quickly determines .1, .05, .025 and .01 # critical values for n<=400 and p<=6 (p= number of predictors) # and when dealing with the .5 quantile. # Otherwise, critical values are determined via simulations, which # can have high execution time. # # But, once critical values are determined for a given n, p and # quantile qval, the function will remember these values and use them # in the future. They are stored in a file called qrchk.crit # Currently, however, when you source the Rallfun files, these values # will be lost. You might save the file qrchk.crit in another file, # source Rallfun, then copy the save file back to qrchk.crit # x=as.matrix(x) p<-ncol(x) pp1<-p+1 yx<-elimna(cbind(y,x)) #Eliminate missing values. 
y<-yx[,1] x<-yx[,2:pp1] store.it=F x<-as.matrix(x) p.val<-NULL crit.val<-NULL x<-as.matrix(x) # shift the marginal x values so that the test statistic is # invariant under changes in location n<-length(y) x=standm(x) gdot<-cbind(rep(1,n),x) gdot<-ortho(gdot) x<-gdot[,2:pp1] x<-as.matrix(x) temp<-rqfit(x,y,qval=qval,res=T) coef<-temp$coef psi<-NA psi<-ifelse(temp$residuals>0,qval,qval-1) rnmat<-matrix(0,nrow=n,ncol=pp1) ran.mat<-apply(x,2,rank) flagvec<-apply(ran.mat,1,max) for(j in 1:n){ flag<-ifelse(flagvec<=flagvec[j],T,F) flag<-as.numeric(flag) rnmat[j,]<-apply(flag*psi*gdot,2,sum) } rnmat<-rnmat/sqrt(n) temp<-matrix(0,pp1,pp1) for(i in 1:n)temp<-temp+rnmat[i,]%*%t(rnmat[i,]) temp<-temp/n test<-max(eigen(temp)$values) test } sm2str<-function(xx,y,iv=c(1,2),nboot=100,SEED=T,xout=F,outfun=outpro,...){ # # Compare robust measures of association of two predictors # based on a smooth # if(!is.matrix(xx))stop("x should be a matrix with 2 or more columns") if(ncol(xx)<2)stop("x should be a matrix with 2 or more columns") val1=NA val2=NA x=xx[,iv] xy=elimna(cbind(x,y)) x=xy[,1:2] y=xy[,3] if(xout){ x<-as.matrix(x) flag<-outfun(x,...)$keep x<-x[flag,] y<-y[flag] x<-as.matrix(x) } if(SEED)set.seed(2) data1<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) data2<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) bvec1=apply(data1,1,sm2str.sub,x[,1],y) # 2 by nboot matrix bvec2=apply(data2,1,sm2str.sub,x[,2],y) # 2 by nboot matrix bvecd=bvec1-bvec2 pv=akerdcdf(bvecd,pts=0) vcor=cor(x,method="kendall") pv=2*min(c(pv,1-pv)) p.crit=.25*abs(vcor[1,2])+.05+(100-length(y))/10000 p.crit=max(c(.05,p.crit)) list(p.value=pv,p.crit=p.crit) } sm2str.sub<-function(isub,x,y){ xmat<-x[isub] val1<-lplot(xmat,y[isub],plotit=F)$Explanatory.power val1 } akerdcdf<-function(xx,hval=NA,aval=.5,op=1,fr=.8,pyhat=T,pts=0,plotit=F, xlab="",ylab=""){ # # Compute cumulative adaptive kernel density estimate # for univariate data # (See Silverman, 1986) # By default (univiate case) determine P(X<=pts), # pts=0 by default. # # op=1 Use expected frequency as initial estimate of the density # op=2 Univariate case only # Use normal kernel to get initial estimate of the density # fval<-"Done" if(is.matrix(xx)){ if(ncol(xx)>1)fval<-akerdmul(xx,pts=pts,hval=hval,aval=aval,fr=fr,pr=pyhat,plotit=plotit) plotit<-F } if(is.matrix(xx) && ncol(xx)==1)xx<-xx[,1] if(!is.matrix(xx)){ x<-sort(xx) if(op==1){ m<-mad(x) if(m==0){ temp<-idealf(x) m<-(temp$qu-temp$ql)/(qnorm(.75)-qnorm(.25)) } if(m==0)m<-sqrt(winvar(x)/.4129) if(m==0)stop("All measures of dispersion are equal to 0") fhat <- rdplot(x,pyhat=T,plotit=F,fr=fr) if(m>0)fhat<-fhat/(2*fr*m) } if(op==2){ init<-density(xx) fhat <- init$y x<-init$x } n<-length(x) if(is.na(hval)){ sig<-sqrt(var(x)) temp<-idealf(x) iqr<-(temp$qu-temp$ql)/1.34 A<-min(c(sig,iqr)) if(A==0)A<-sqrt(winvar(x))/.64 hval<-1.06*A/length(x)^(.2) # See Silverman, 1986, pp. 
47-48 } gm<-exp(mean(log(fhat[fhat>0]))) alam<-(fhat/gm)^(0-aval) dhat<-NA if(is.na(pts[1]))pts<-x pts<-sort(pts) for(j in 1:length(pts)){ temp<-(pts[j]-x)/(hval*alam) sq5=0-sqrt(5) epan=.75*(temp-.2*temp^3/3)/sqrt(5)-.75*(sq5-.2*sq5^3/3)/sqrt(5) flag=(temp>=sqrt(5)) epan[flag]=1 flag=(temp= 0 negres <- res <= 0 lplus <- cumsum(posres) rplus <- lplus[n] - lplus lmin <- cumsum(negres) rmin <- lmin[n] - lmin depth <- pmin(lplus + rmin, rplus + lmin) min(depth) } depthcom<-function(x1,y1,x2,y2,est=tmean,fr=1){ temp1=depthcomsub(x1,y1,x2,y2,est=est,fr=fr) temp2=depthcomsub(x2,y2,x1,y1,est=est,fr=fr) dep=max(c(abs(temp1$dep1-temp1$dep2),abs(temp2$dep1-temp2$dep2))) dep } depthcomsub<-function(x1,y1,x2,y2,est=tmean,fr=1){ x1=(x1-median(x1))/mad(x1) x2=(x2-median(x2))/mad(x2) yh1=runhat(x1,y1,est=tmean,fr=fr) yh2=runhat(x2,y2,pts=x1,est=tmean,fr=fr) flag=is.na(yh2) res1=y1-yh1 res2=y1[!flag]-yh2[!flag] dep1=resdepth(x1,res1) dep2=resdepth(x1[!flag],res2) list(dep1=dep1,dep2=dep2) } ancsm<-function(x1,y1,x2,y2,crit.mat=NULL,nboot=200,SEED=T,REP.CRIT=F, est=tmean,fr=NULL,plotit=T,sm=F){ # # Compare two nonparametric # regression lines corresponding to two independent groups # using the depths of smooths. # One covariate only is allowed. # # # sm=T will create smooths using bootstrap bagging. # xy=elimna(cbind(x1,y1)) x1=xy[,1] xord=order(x1) x1=x1[xord] y1=xy[xord,2] xy=elimna(cbind(x2,y2)) x2=xy[,1] xord=order(x2) x2=x2[xord] y2=xy[xord,2] n1=length(y1) n2=length(y2) if(is.null(fr)){ fr=1 if(min(n1,n2)>150)fr=.2 if(max(n1,n2)<35)fr=.5 } if(SEED)set.seed(2) if(is.null(crit.mat[1])){ crit.val=NA yall=c(y1,y2) xall=c(x1,x2) nn=n1+n2 il=n1+1 for(i in 1:nboot){ data=sample(nn,nn,T) yy1=yall[data[1:n1]] yy2=yall[data[il:nn]] xx1=xall[data[1:n1]] xx2=xall[data[il:nn]] crit.mat[i]=depthcom(xx1,yy1,xx2,yy2,est=est,fr=fr) }} if(plotit)runmean2g(x1,y1,x2,y2,fr=fr,est=mean,tr=tr,sm=sm) dep=depthcom(x1,y1,x2,y2,est=est,fr=fr) n=min(n1,n2) pv=1-mean(crit.mat=crit)p.value<-c("Less than .1") crit<-15.49/n+2.68 if(test>=crit)p.value<-c("Less than .05") crit<-14.22/n+3.26 if(test>=crit)p.value<-c("Less than .025") crit<-24.83/n+3.74 if(test>=crit)p.value<-c("Less than .01") p.values[ic,3]=p.value }}} list(cor=val,test.results=info,p.values=p.values) } resdepth.sub<-function(x,res) { ########################################################################## # This function computes the regression depth of a regression line based # on its residuals. The fit could be, for example, a nonparmatric # regression or smooth. # # The algorithm is based on a simple modification of # # Rousseeuw, P.J. and Hubert, M. (1996), # Regression Depth, Technical report, University of Antwerp # ########################################################################## if(!is.vector(x)) stop("x should be vectors") n <- length(x) if(n < 2) stop("you need at least two observations") flag=is.na(res) x=x[!flag] res[!flag] xord=order(x) x=x[xord] res=res[xord] posres <- res >= 0 negres <- res <= 0 lplus <- cumsum(posres) rplus <- lplus[n] - lplus lmin <- cumsum(negres) rmin <- lmin[n] - lmin depth <- pmin(lplus + rmin, rplus + lmin) min(depth) } tbs <- function(x,eps=1e-3,maxiter=20,r=.45,alpha=.05,init.est=cov.mcd){ # Rocke's contrained s-estimator # # r=.45 is the breakdown point # alpha=.05 is the asymptotic rejection probability. 
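#
# Illustrative example (assumes the companion functions called below, e.g.
# cgen.bt, erho.bt, ksolve.bt, wt.bt and v.bt, are sourced from this file):
#   z <- matrix(rnorm(200), ncol = 2)
#   tbs(z)     # robust estimate of multivariate location and scatter
#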
# library(MASS) #if(!is.matrix(x))stop("x should be a matrix with two or more columns") x<-elimna(x) x=as.matrix(x) n <- nrow(x) p <- ncol(x) LIST=F if(p==1){ LIST=T p=2 x=cbind(x,rnorm(nrow(x))) # Yes, this code is odd, but for moment easiest way of handling p=1 } temp<-init.est(x) # very poor outside rate per obs under normality. t1<-temp$center s<-temp$cov #if(p==1)stop("x should be a matrix with two or more columns") c1M<-cgen.bt(n,p,r,alpha,asymp=FALSE) c1<-c1M$c1 if(c1==0)c1<-.001 #Otherwise get division by zero M<-c1M$M b0 <- erho.bt(p,c1,M) crit <- 100 iter <- 1 w1d <- rep(1,n) w2d <- w1d while ((crit > eps)&(iter <= maxiter)) { t.old <- t1 s.old <- s wt.old <- w1d v.old <- w2d d2 <- mahalanobis(x,center=t1,cov=s) d <- sqrt(d2) k <- ksolve.bt(d,p,c1,M,b0) d <- d/k w1d <- wt.bt(d,c1,M) w2d <- v.bt(d,c1,M) t1 <- (w1d %*% x)/sum(w1d) s <- s*0 for (i in 1:n) { xc <- as.vector(x[i,]-t1) s <- s + as.numeric(w1d[i])*(xc %o% xc) } s <- p*s/sum(w2d) mnorm <- sqrt(as.vector(t.old) %*% as.vector(t.old)) snorm <- eigen(s.old)$values[1] crit1 <- max(abs(t1 - t.old)) # crit <- max(crit1,crit2) crit <- max(abs(w1d-wt.old))/max(w1d) iter <- iter+1 } if(LIST){ v1=t1[1] v2=s[1,1] return(list(center=v1,var=v2)) } if(!LIST)return(list(center=t1,cov=s)) } pcorhc4sub<-function(x,y,CN=F){ # # Compute a .95 confidence interval for Pearson's correlation coefficient. # using the HC4 method # # CN=T uses Student's t with n-p degrees of freedom for critical value # CN=F uses normal distribution, which is better for some purposes. # xy<-elimna(cbind(x,y)) x<-xy[,1] y<-xy[,2] z1=(x-mean(x))/sqrt(var(x)) z2=(y-mean(y))/sqrt(var(y)) ans=olshc4sub(z1,z2,CN=CN) ci=ans$ci[2,3:4] ci } TWOpNOV<-function(x,y){ # # Compute a .95 confidence interval # for the difference between two dependent Pearson correlations, # non-overlapping case. # # Both x and y are assumed to be matrices with two columns. # The function compares the correlation between x[,1] and y[,1] # to the correlation between x[,2] and y[,2]. # # For simulation results, see Wilcox (2008). # COMPARING PEARSON CORRELATIONS: DEALING WITH # HETEROSCEDASTICITY AND NON-NORMALITY, unpublished tech report. 
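#
# Illustrative example (assumes pcorbv4, defined elsewhere in this file,
# is available):
#   x <- matrix(rnorm(100), ncol = 2)
#   y <- matrix(rnorm(100), ncol = 2)
#   TWOpNOV(x, y)   # CI for the difference between the two dependent correlations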
# # if(!is.matrix(x))stop("x should be a matrix") if(!is.matrix(y))stop("y should be a matrix") if(ncol(x)!=2)stop("x should be a matrix with 2 columns") if(ncol(y)!=2)stop("y should be a matrix with 2 columns") xy=cbind(x,y) x1=xy[,1] x2=xy[,2] y1=xy[,3] y2=xy[,4] r12=cor(x1,x2) r13=cor(x1,y1) r14=cor(x1,y2) r23=cor(x2,y1) r24=cor(x2,y2) r34=cor(y1,y2) term1=.5*r12*r34*(r13^2+r14^2+r23^2+r24^2) term2=r12*r13*r14+r12*r23*r24+r13*r23*r34+r14*r24*r34 corhat=(term1+r13*r24+r14*r23-term2)/((1-r12^2)*(1-r34^2)) temp=pcorbv4(x1,x2,SEED=F) ci12=temp$ci[1] ci12[2]=temp$ci[2] temp=pcorbv4(y1,y2,SEED=F) ci34=temp$ci[1] ci34[2]=temp$ci[2] terml=2*corhat*(r12-ci12[1])*(ci34[2]-r34) termu=2*corhat*(ci12[2]-r12)*(r34-ci34[1]) L=r12-r34-sqrt((r12-ci12[1])^2+(ci34[2]-r34)^2-terml) U=r12-r34+sqrt((r12-ci12[2])^2+(ci34[1]-r34)^2-termu) list(ci.lower=L,ci.upper=U) } TWOpov<-function(x,y,alpha=.05,CN=F){ # # Comparing two dependent correlations: Overlapping case # # x is assumed to be a matrix with 2 columns # # Compare correlation of x[,1] with y to x[,2] with y # if(!is.matrix(x))stop("x should be a matrix") if(ncol(x)!=2)stop("x should be a matrix with two columns") xy=elimna(cbind(x,y)) x1=xy[,1] x2=xy[,2] y=xy[,3] r12=cor(x1,y) r13=cor(x2,y) r23=cor(x1,x2) ci12=pcorhc4(x1,y,alpha=alpha,CN=CN)$ci ci13=pcorhc4(x2,y,alpha=alpha,CN=CN)$ci corhat=((r23-.5*r12*r13)*(1-r12^2-r13^2-r23^2)+r23^3)/((1-r12^2)*(1-r13^2)) term1=2*corhat*(r12-ci12[1])*(ci13[2]-r13) term2=2*corhat*(r12-ci12[2])*(ci13[1]-r13) L=r12-r13-sqrt((r12-ci12[1])^2+(ci13[2]-r13)^2-term1) U=r12-r13+sqrt((r12-ci12[2])^2+(ci13[1]-r13)^2-term2) c(L,U) } sm2str.sub<-function(isub,x,y){ xmat<-x[isub] val1<-lplot(xmat,y[isub],plotit=F)$Explanatory.power val1 } sm2strv7<-function(xx,y,iv=c(1,2),nboot=100,SEED=T,xout=F,outfun=outpro,...){ # # Compare robust measures of association of two predictors # based on a smooth # # x is a matrix with two columns # robust explanatory of x[,1] with y is compared to x[,2] with y. # xout=T eliminates any leverage points found with outfun, which # defaults to outpro, a projecion method for detecting outliers. # if(!is.matrix(xx))stop("x should be a matrix with 2 or more columns") if(ncol(xx)<2)stop("x should be a matrix with 2 or more columns") val1=NA val2=NA x=xx[,iv] xy=elimna(cbind(x,y)) x=xy[,1:2] y=xy[,3] if(xout){ x<-as.matrix(x) flag<-outfun(x,...)$keep x<-x[flag,] y<-y[flag] x<-as.matrix(x) } if(SEED)set.seed(2) data1<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) data2<-matrix(sample(length(y),size=length(y)*nboot,replace=T),nrow=nboot) bvec1=apply(data1,1,sm2str.sub,x[,1],y) # 2 by nboot matrix bvec2=apply(data2,1,sm2str.sub,x[,2],y) # 2 by nboot matrix bvecd=bvec1-bvec2 pv=akerdcdf(bvecd,pts=0) vcor=cor(x,method="kendall") pv=2*min(c(pv,1-pv)) p.crit=.25*abs(vcor[1,2])+.05+(100-length(y))/10000 p.crit=max(c(.05,p.crit)) list(p.value=pv,p.crit=p.crit) } pcorhc4<-function(x,y,alpha=.05,CN=F){ # # Compute a .95 confidence interval for Pearson's correlation coefficient. # using the HC4 method # # CN=T uses Student's t with n-p degrees of freedom for critical value # CN=F uses normal distribution, which is better for some purposes. 
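#
# Illustrative example (assumes olshc4, defined elsewhere in this file,
# is available):
#   x <- rnorm(50)
#   y <- x + rnorm(50)
#   pcorhc4(x, y)   # returns r, the HC4-based confidence interval and a p-value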
# xy<-elimna(cbind(x,y)) x<-xy[,1] y<-xy[,2] z1=(x-mean(x))/sqrt(var(x)) z2=(y-mean(y))/sqrt(var(y)) ans=olshc4(z1,z2,alpha=alpha,CN=CN) list(r=ans$r,ci=ans$ci[2,3:4],p.value=ans$ci[2,5]) } regpreS<-function(x,y,regfun=lsfit,error=absfun,nboot=100, mval=round(5*log(length(y))),locfun=mean,pr=T, xout=F,outfun=out, plotit=T,xlab="Model Number",ylab="Prediction Error",SEED=T,...){ # # Stepwise selection of predictors based on # estimates of prediction error using the regression method # regfun, # which defaults to least squares. Prediction error # is estimated with .632 method. # (See Efron and Tibshirani, 1993, pp. 252--254) # # The predictor values are assumed to be in the n by p matrix x. # The default number of bootstrap samples is nboot=100 # # Prediction error is the expected value of the function error. # The argument error defaults to absolute error. To use # squared error, set error=sqfun. # # regfun can be any s-plus function that returns the coefficients in # the vector regfun$coef, the first element of which contains the # estimated intercept, the second element contains the estimate of # the first predictor, etc. # # The default value for mval, the number of observations to resample # for each of the B bootstrap samples is based on results by # Shao (JASA, 1996, 655-665). (Resampling n vectors of observations, # model selection may not lead to the correct model as n->infinity. # if(SEED)set.seed(2) q=ncol(x) qm1=q-1 x<-as.matrix(x) d<-ncol(x) p1<-d+1 temp<-elimna(cbind(x,y)) x<-temp[,1:d] y<-temp[,d+1] x<-as.matrix(x) if(xout){ x<-as.matrix(x) flag<-outfun(x,SEED=F,...)$keep x<-x[flag,] y<-y[flag] x<-as.matrix(x) } adit=NULL pval=c(1:ncol(x)) #pval=c(1:q) allp=pval for(ip in 1:qm1){ model=list() for(j in 1:length(pval))model[[j]]=c(adit,pval[j]) temp=regpre(x,y,model=model,pr=F,plotit=F,adz=F,regfun=regfun, SEED=SEED)$estimates pbest=order(temp[,5]) adit=model[[pbest[1]]] pval=allp[-adit] } output=model[[pbest[1]]] output=c(output,allp[-output]) output } ebarplot<-function(x,y=NULL,nse=1, liw = uiw, aui=NULL, ali=aui, err="y", ylim=NULL, sfrac = 0.01, gap=0, add=FALSE, col=par("col"), lwd=par("lwd"), slty=par("lty"), xlab="Group", ylab=NULL, ...) { # plots error bars using the data in # x, which is assumed to be a matrix with J columns (J groups) or # x has list mode. # nse indicates how many standard errors to use when plotting. # # Missing values are automatically removed. 
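#
# Illustrative example (assumes a plotCI function, e.g. from the R package
# plotrix or gplots, is available):
#   dat <- matrix(rnorm(60), ncol = 3)   # three groups
#   ebarplot(dat, nse = 2)               # plot means with 2 standard error bars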
# if(!is.null(y)){ if(is.matrix(x))stop("When y is given, x should not be a matrix") if(is.list(x))stop("When y is given, x should not be in list mode") rem=x x=list() x[[1]]=rem x[[2]]=y } if(is.matrix(x))x<-listm(x) mval<-NA if(!is.list(x) && is.null(y))stop("This function assumes there are two or more groups") for(j in 1:length(x))mval[j]<-mean(x[[j]],na.rm=T) se<-NA for(j in 1:length(x))se[j]<-sqrt(var(x[[j]],na.rm=T)/length(x[[j]])) uiw<-nse*se plotCI(mval,y=NULL, uiw=uiw, liw = uiw, aui=NULL, ali=aui, err="y", ylim=NULL, sfrac = 0.01, gap=0, add=FALSE, col=par("col"), lwd=par("lwd"), slty=par("lty"), xlab=xlab, ylab=ylab) } akp.effect<-function(x,y,EQVAR=T,tr=.2){ # # Computes the robust effect size suggested by #Algina, Keselman, Penfield Pcyh Methods, 2005, 317-328 library(MASS) x<-elimna(x) y<-elimna(y) n1<-length(x) n2<-length(y) s1sq=winvar(x,tr=tr) s2sq=winvar(y,tr=tr) spsq<-(n1-1)*s1sq+(n2-1)*s2sq sp<-sqrt(spsq/(n1+n2-2)) cterm=1 if(tr>0)cterm=area(dnormvar,qnorm(tr),qnorm(1-tr))+2*(qnorm(tr)^2)*tr cterm=sqrt(cterm) if(EQVAR)dval<-cterm*(tmean(x)-tmean(y))/sp if(!EQVAR){ dval<-cterm*(tmean(x)-tmean(y))/sqrt(s1sq) dval[2]=cterm*(tmean(x)-tmean(y))/sqrt(s2sq) } dval } wwwtrim<-function(J,K,L,data,tr=.2,grp=c(1:p),alpha=.05,p=J*K*L){ # Perform a within by within by within (three-way) anova on trimmed means where # # That is, there are three factors with a total of JKL dependent groups. # # The variable data is assumed to contain the raw # data stored in list mode. data[[1]] contains the data # for the first level of all three factors: level 1,1,1. # data][2]] is assumed to contain the data for level 1 of the # first two factors and level 2 of the third factor: level 1,1,2 # data[[L]] is the data for level 1,1,L # data[[L+1]] is the data for level 1,2,1. data[[2L]] is level 1,2,L. # data[[KL+1]] is level 2,1,1, etc. # # The default amount of trimming is tr=.2 # # It is assumed that data has length JKL, the total number of # groups being tested. 
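# In other words, the data for level (j,k,l) of the three factors are assumed
# to be stored in data[[(j-1)*K*L + (k-1)*L + l]].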
# if(is.list(data))data=listm(elimna(matl(data))) if(is.matrix(data))data=listm(elimna(data)) if(!is.list(data))stop("Data are not stored in list mode or a matrix") if(p!=length(data)){ print("The total number of groups, based on the specified levels, is") print(p) print("The number of groups in data is") print(length(data)) print("Warning: These two values are not equal") } tmeans<-0 h<-0 v<-0 for (i in 1:p){ tmeans[i]<-mean(data[[grp[i]]],tr) h[i]<-length(data[[grp[i]]])-2*floor(tr*length(data[[grp[i]]])) # h is the effective sample size } v=covmtrim(data,tr=tr) ij<-matrix(c(rep(1,J)),1,J) ik<-matrix(c(rep(1,K)),1,K) il<-matrix(c(rep(1,L)),1,L) jm1<-J-1 cj<-diag(1,jm1,J) cj<-diag(1,jm1,J) for (i in 1:jm1)cj[i,i+1]<-0-1 km1<-K-1 ck<-diag(1,km1,K) for (i in 1:km1)ck[i,i+1]<-0-1 lm1<-L-1 cl<-diag(1,lm1,L) for (i in 1:lm1)cl[i,i+1]<-0-1 # Do test for factor A cmat<-kron(cj,kron(ik,il)) # Contrast matrix for factor A Qa=bwwtrim.sub(cmat, tmeans, v, h,p) Qa.siglevel <- 1 - pf(Qa, J - 1, 999) # Do test for factor B cmat<-kron(ij,kron(ck,il)) # Contrast matrix for factor B Qb=bwwtrim.sub(cmat, tmeans, v, h,p) Qb.siglevel <- 1 - pf(Qb, K - 1, 999) # Do test for factor C cmat<-kron(ij,kron(ik,cl)) # Contrast matrix for factor C Qc<-bwwtrim.sub(cmat, tmeans, v, h,p) Qc.siglevel <- 1 - pf(Qc, L - 1, 999) # Do test for factor A by B interaction cmat<-kron(cj,kron(ck,il)) # Contrast matrix for factor A by B Qab<-bwwtrim.sub(cmat, tmeans, v, h,p) Qab.siglevel <- 1 - pf(Qab, (J - 1) * (K - 1), 999) # Do test for factor A by C interaction cmat<-kron(cj,kron(ik,cl)) # Contrast matrix for factor A by C Qac<-bwwtrim.sub(cmat, tmeans, v, h,p) Qac.siglevel <- 1 - pf(Qac, (J - 1) * (L - 1), 999) # Do test for factor B by C interaction cmat<-kron(ij,kron(ck,cl)) # Contrast matrix for factor B by C Qbc<-bwwtrim.sub(cmat, tmeans, v, h,p) Qbc.siglevel <- 1 - pf(Qbc, (K - 1) * (L - 1), 999) # Do test for factor A by B by C interaction cmat<-kron(cj,kron(ck,cl)) # Contrast matrix for factor A by B by C Qabc<-bwwtrim.sub(cmat, tmeans, v, h,p) Qabc.siglevel <-1-pf(Qabc,(J-1)*(K-1)*(L-1), 999) list(Qa=Qa,Qa.p.value=Qa.siglevel,Qb=Qb,Qb.crit=Qb.siglevel, Qc=Qc,Qc.p.value=Qc.siglevel,Qab=Qab,Qab.p.value=Qab.siglevel, Qac=Qac,Qac.p.value=Qac.siglevel,Qbc=Qbc,Qbc.p.value=Qbc.siglevel, Qabc=Qabc,Qabc.p.value=Qabc.siglevel) } ltsR<-function(x,y,RES=F,varfun=pbvar,corfun=pbcor){ # library(MASS) xy=elimna(cbind(x,y)) p1=ncol(xy) p=p1-1 x=xy[,1:p] y=xy[,p1] temp=ltsreg(x,y)$coef x=as.matrix(x) p=ncol(x)+1 res<-y-x%*%temp[2:p]-temp[1] yhat<-y-res if(!RES)res=NULL e.pow<-varfun(yhat)/varfun(y) if(is.na(e.pow))e.pow<-1 if(e.pow>=1)e.pow<-corfun(yhat,y)$cor^2 list(coef=temp,residuals=res,Explanatory.Power=e.pow, Strength.Assoc=sqrt(e.pow)) } standmar<-function(x,locfun=lloc,est=mean,scat=var,...){ # standardize a matrix x # x=as.matrix(x) m1=lloc(x,est=est,na.rm=T) v1=apply(x,2,scat,na.rm=T) p=ncol(x) for(j in 1:p)x[,j]=(x[,j]-m1[j])/sqrt(v1[j]) x } qsmcobs<-function(x,y,qval=.5,xlab="X",ylab="Y",FIT=T,pc=".",plotit=T){ # # Plots smooths of quantile regression lines using R package cobs # # qval is the quantile # qsmcobs(x,y,qval=c(.2,.5,.8)) will plot three smooths corresponding to # the .2, .5 and .8 quantile regression lines. 
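#
# Illustrative example (requires the R package cobs):
#   x <- runif(100)
#   y <- x + rnorm(100)
#   qsmcobs(x, y, qval = c(.25, .5, .75))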
# # FIT=T, uses the values returned by predict # FIT=F, determines predicted Y for each X and plots the results library(cobs) yhat=NULL res=NULL if(plotit)plot(x,y,xlab=xlab,ylab=ylab,pch=pc) if(FIT){ for(j in 1:length(qval)){ if(plotit)lines(predict(cobs(x,y,tau=qval[j],print.mesg=F,print.warn=F))) }} if(!FIT){ for(j in 1:length(qval)){ temp=cobs(x,y,tau=qval[j],print.mesg=F,print.warn=F) xord=order(x) if(plotit)lines(x[xord],temp$fitted[xord]) } if(length(qval)==1){ yhat=temp$fitted res=y-yhat }} list(yhat=yhat,residuals=res) } Qdepthcom<-function(x1,y1,x2,y2,qval){ temp1=Qdepthcomsub(x1,y1,x2,y2,qval) temp2=Qdepthcomsub(x2,y2,x1,y1,qval) dep=max(c(abs(temp1$dep1-temp1$dep2),abs(temp2$dep1-temp2$dep2))) dep } Qdepthcomsub<-function(x1,y1,x2,y2,qval){ x1=(x1-median(x1))/mad(x1) x2=(x2-median(x2))/mad(x2) yh1=qsmcobs(x1,y1,FIT=F,qval=qval,plotit=F)$yhat temp2=cobs(x2,y2,print.mesg=F,print.warn=F,tau=qval) yh2=predict(temp2,z=x1) yh2=yh2[,2] flag=is.na(yh2) res1=y1-yh1 res2=y1[!flag]-yh2[!flag] dep1=resdepth(x1,res1) dep2=resdepth(x1[!flag],res2) list(dep1=dep1,dep2=dep2) } Qancsm<-function(x1,y1,x2,y2,crit.mat=NULL,nboot=200,SEED=T,REP.CRIT=F, qval=.5,xlab="X",ylab="Y",plotit=T){ # # Compare two nonparametric # regression lines corresponding to two independent groups # using the depths of smooths. # # NULL hypothesis: regression lines are identical in terms of the median # of Y, given$X, for all X # The method is based on comparing the depth of the fitted regression lines # and is essentially a slight variation of the method in Wilcox # (in press) Journal of Data Science. # # One covariate only is allowed. # if(SEED)set.seed(2) xy=elimna(cbind(x1,y1)) x1=xy[,1] xord=order(x1) x1=x1[xord] y1=xy[xord,2] xy=elimna(cbind(x2,y2)) x2=xy[,1] xord=order(x2) x2=x2[xord] y2=xy[xord,2] n1=length(y1) n2=length(y2) if(is.null(crit.mat[1])){ crit.val=NA yall=c(y1,y2) xall=c(x1,x2) nn=n1+n2 il=n1+1 for(i in 1:nboot){ data=sample(nn,nn,T) yy1=yall[data[1:n1]] yy2=yall[data[il:nn]] xx1=xall[data[1:n1]] xx2=xall[data[il:nn]] crit.mat[i]=Qdepthcom(xx1,yy1,xx2,yy2,qval=qval) }} dep=Qdepthcom(x1,y1,x2,y2,qval=qval) pv=1-mean(crit.mat0] v2<-vec2[vec2>0] slope<-median(v1/v2,na.rm=T) coef<-median(y,na.rm=T)-slope*median(x,na.rm=T) names(coef)<-"Intercept" coef<-c(coef,slope) if(plotit){ plot(x,y,xlab="X",ylab="Y") abline(coef) } res<-y-slope*x-coef[1] list(coef=coef,residuals=res) } tsreg<-function(x,y,xout=F,outfun=out,iter=10,varfun=pbvar, corfun=pbcor,...){ # # Compute Theil-Sen regression estimator # # Use Gauss-Seidel algorithm # when there is more than one predictor # # MAR=T assumes missing values occur at random # x<-as.matrix(x) xx<-cbind(x,y) xx<-elimna(xx) x<-xx[,1:ncol(x)] x<-as.matrix(x) y<-xx[,ncol(x)+1] temp<-NA x<-as.matrix(x) if(xout){ x<-as.matrix(x) flag<-outfun(x,...)$keep x<-x[flag,] y<-y[flag] x<-as.matrix(x) } if(ncol(x)==1){ temp1<-tsp1reg(x,y) coef<-temp1$coef res<-temp1$res } if(ncol(x)>1){ for(p in 1:ncol(x)){ temp[p]<-tsp1reg(x[,p],y)$coef[2] } res<-y-x%*%temp alpha<-median(res) r<-matrix(NA,ncol=ncol(x),nrow=nrow(x)) tempold<-temp for(it in 1:iter){ for(p in 1:ncol(x)){ r[,p]<-y-x%*%temp-alpha+temp[p]*x[,p] temp[p]<-tsp1reg(x[,p],r[,p],plotit=F)$coef[2] } alpha<-median(y-x%*%temp) tempold<-temp } coef<-c(alpha,temp) res<-y-x%*%temp-alpha } yhat<-y-res e.pow<-varfun(yhat)/varfun(y) if(e.pow>=1)e.pow<-corfun(yhat,y)$cor^2 stre=sqrt(e.pow) list(coef=coef,residuals=res,Strength.Assoc=stre,Explanatory.Power=e.pow) } gplot<-function(x,xlab="Group",ylab="",xnum=F){ 
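#
# Plot the data for two or more groups against group number.
# x is assumed to be a matrix (columns correspond to groups) or to have
# list mode. xnum=F suppresses the x-axis labeling.
#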
if(is.matrix(x))x<-listm(x)
if(!xnum)par(xaxt="n")
mval<-NA
vals<-x[[1]]
gval<-rep(1,length(x[[1]]))
for(j in 2:length(x)){
vals<-c(vals,x[[j]])
gval<-c(gval,rep(j,length(x[[j]])))
}
plot(gval,vals,xlab=xlab,ylab=ylab)
}
trimpb<-function(x,tr=.2,alpha=.05,nboot=2000,WIN=F,win=.1,
plotit=F,pop=1,null.value=0,pr=T,xlab="X",fr=NA){
#
#  Compute a 1-alpha confidence interval for
#  a trimmed mean.
#
#  The default number of bootstrap samples is nboot=2000
#
#  win is the amount of Winsorizing before bootstrapping
#  when WIN=T.
#
#  Missing values are automatically removed.
#
#  null.value is the hypothesized value: the hypothesis that the trimmed mean
#  equals null.value is tested.
#
#  plotit=T gives a plot of the bootstrap values
#  pop=1 results in the expected frequency curve.
#  pop=2 kernel density estimate
#  pop=3 boxplot
#  pop=4 stem-and-leaf
#  pop=5 histogram
#  pop=6 adaptive kernel density estimate.
#
#  fr controls the amount of smoothing when plotting the bootstrap values
#  via the function rdplot. fr=NA means the function will use fr=.8
#  (When plotting bivariate data, rdplot uses fr=.6 by default.)
#
if(pr){
print("The p-value returned by this function is based on the")
print("null value specified by the argument null.value, which defaults to 0")
}
x<-x[!is.na(x)]
if(WIN){
if(win > tr)stop("The amount of Winsorizing must be <= the amount of trimming")
x<-winval(x,win)
}
crit<-alpha/2
icl<-round(crit*nboot)+1
icu<-nboot-icl
bvec<-NA
set.seed(2) # set seed of random number generator so that
#             results can be duplicated.
print("Taking bootstrap samples. Please wait.")
data<-matrix(sample(x,size=length(x)*nboot,replace=T),nrow=nboot)
bvec<-apply(data,1,mean,tr) # Bootstrapped trimmed means
bvec<-sort(bvec)
#p.value<-sum(bvec