library(KernSmooth)
library(ks)
library(sm)
library(sampling)
library(mda)

#Codes created by Aurore Delaigle for the JRSSB paper: Delaigle, A. and Hall, P. (2012). Achieving near-perfect classification for functional data. JRSS B, 74, 267--286.
#How to cite the code? Cite the paper and say you used the associated codes.
#These codes are NOT those used in the paper. They were written on January 15, 2014 so as to try and make them readable. I might have introduced errors in doing this.
#Please inform A. Delaigle if you find errors in the codes.

#You'll need the FDA codes from Frederic Ferraty:
#https://www.math.univ-toulouse.fr/~ferraty/SOFTWARES/NPFDA/index.html

#In this example, we apply the near-perfect classification method to a real data example.
#We split the data in two: a training sample, chosen randomly from the full data set, and a test sample (the rest of the data).

#---------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------
# Prepare the data and store them in matrices and vectors
#---------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------

#Discrete grid of t points where the functions are observed: replace by your own grid
tt=1:200
Deltat=tt[2]-tt[1]
longt=length(tt)

#Size of the test data set (number of curves to classify). Example here where ntest=50
ntest=50

#Data matrix: put the curves to classify in this matrix. Example here of a matrix of that size
XXtest=matrix(0,nrow=ntest,ncol=longt)

#Size of the training data set (number of training curves available). Example here where ntrain=100
ntrain=100

#Data matrix: put your training curves in this matrix. Needs to be a matrix of size ntrain x longt. Example here of a matrix of that size
XXdata=matrix(0,nrow=ntrain,ncol=longt)

#Class labels (0 or 1): put your training class labels here. Needs to be a vector of length ntrain.
#Example here where the first 50 curves are from class 0 and the next 50 are from class 1
Y=rep(0,ntrain)
Y[51:100]=1

#Compute the indices of the training curves with class label 0 or 1, and put the curves from each class in two separate matrices
indClass0tmp=which(Y==0)
indClass1tmp=which(Y==1)
n0=length(indClass0tmp)
n1=length(indClass1tmp)
XXtrain0=XXdata[indClass0tmp,]
XXtrain1=XXdata[indClass1tmp,]

#Create a matrix that contains first all the training data from group 0 and then all the data from group 1
XXtrain=rbind(XXtrain0,XXtrain1)

#Indices of class labels 0 and 1 in that matrix
indClass0=1:n0
indClass1=(1:n1)+n0
Y=matrix(0,nrow=ntrain,ncol=1)
Y[indClass1]=rep(1,n1)
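#Illustration only: the random training/test split described at the top of this file could be
#obtained as sketched below, assuming the complete set of curves is stored in a matrix XXfull
#with one 0/1 label per curve in a vector Yfull (both hypothetical names, not defined in this
#file). It would replace the placeholder definitions of XXdata, Y and XXtest above and be run
#before the class-splitting step; the block is wrapped in if(FALSE) so the script runs as is.
if(FALSE)
{
    indtrain=sample(1:nrow(XXfull),ntrain)
    XXdata=XXfull[indtrain,]
    Y=Yfull[indtrain]
    XXtest=XXfull[-indtrain,]
    ntest=nrow(XXtest)
}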
#Rescale the data in some way to avoid too many numerical problems. For example, do it this way:
varXX=var(as.vector(XXtrain))
muX=mean(as.vector(XXtrain))
XXtrain=(XXtrain-muX)/sqrt(varXX)
XXtest=(XXtest-muX)/sqrt(varXX)

#---------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------
# Start procedure
#---------------------------------------------------------------------------------------------------------
#---------------------------------------------------------------------------------------------------------

#-----------------------------------
#Computations useful for later
#-----------------------------------

# Calculate the mean curve in each class
barXClass0=XXtrain0[1,]
for(i in 2:n0) barXClass0=barXClass0+XXtrain0[i,]
barXClass1=XXtrain1[1,]
for(i in 2:n1) barXClass1=barXClass1+XXtrain1[i,]
barXClass0=barXClass0/n0
barXClass1=barXClass1/n1

#Construct the PC basis estimators based on the data from the two groups, each group centered at its own mean
ZZ=XXtrain0-outer(rep(1,n0),barXClass0)
ZZ=rbind(ZZ,XXtrain1-outer(rep(1,n1),barXClass1))
compo=prcomp(ZZ)
phi=compo$rotation
phi=phi/sqrt(Deltat)
lambda=compo$sdev
lambda=lambda^2*Deltat

# npsir is the maximum number of terms considered in our sums
cumul=cumsum(lambda)/sum(lambda)
npsir=min(which(cumul>1-1/ntrain^2))
npsir=min(npsir,ntrain/2)

#Difference between the means + projections of that difference on the basis functions. Plays the role of the mu_j's in the paper
DiffbarX=barXClass1-barXClass0
hatmuj=DiffbarX%*%phi*Deltat

#------------------------------------------------------------
#------------------------------------------------------------
# METHOD 1: Estimate psir computed through the PC components
#------------------------------------------------------------
#------------------------------------------------------------

#Compute the number of terms in the sum using CV.
#Take the global minimum among the first two local minima
CV=rep(0,npsir)
#nbCV is the number of local minima found so far
nbCV=0
#npsirA will be the number of components we keep
npsirA=1

#B = number of data partitions created to compute the cross-validation criterion
B=200
K=5
KKCV=round(ntrain/K)

#ind5foldCV: for each b=1,...,B, the indices of the KKCV curves left out (the samples Xstarb1,...,XstarbKKCV)
ind5foldCV=matrix(0,nrow=B,ncol=KKCV)
for(b in 1:B)
{
    s=runif(KKCV,0,ntrain)
    s=ceiling(s)
    ind5foldCV[b,]=s
}

#To save time later, compute now the eigenfunctions and eigenvalues for each of the CV samples and save them in matrices and vectors
for(b in 1:B)
{
    i=ind5foldCV[b,]
    XXCV=ZZ[-i,]
    compoCV=prcomp(XXCV)
    phiCV=compoCV$rotation
    eval(parse(text=paste("phiCV",b,"=phiCV/sqrt(Deltat)",sep="")))
    lambdaCV=compoCV$sdev
    eval(parse(text=paste("lambdaCV",b,"=lambdaCV^2*Deltat",sep="")))
}
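#The function CVpsi called below is not defined in this file: it is assumed to be provided
#separately or written by the user. The sketch below is only one possible illustrative
#implementation, consistent with the quantities precomputed above (it is an assumption, not
#necessarily the author's original function): for each of the B random partitions it rebuilds
#psir with j components from the retained curves, applies the centroid classifier to the
#left-out curves, and returns the average misclassification rate. An analogous criterion
#would be needed for CVmplsr, used further below.
CVpsi=function(XX,j)
{
    errCV=0
    for(b in 1:B)
    {
        i=ind5foldCV[b,]
        #retrieve the eigenfunctions and eigenvalues precomputed for this partition
        phiCV=get(paste("phiCV",b,sep=""))
        lambdaCV=get(paste("lambdaCV",b,sep=""))
        #class means computed from the retained training curves only
        ind0=setdiff(indClass0,i)
        ind1=setdiff(indClass1,i)
        bar0=colMeans(XX[ind0,,drop=FALSE])
        bar1=colMeans(XX[ind1,,drop=FALSE])
        #estimate of psir with j components, constructed as in the main code
        mujCV=(bar1-bar0)%*%phiCV*Deltat
        psirCV=0*phiCV[,1]
        for(k in 1:j) psirCV=psirCV+mujCV[k]/lambdaCV[k]*phiCV[,k]
        #centroid classifier applied to the left-out curves
        proj=XX%*%psirCV
        TofX=(proj[i]-mean(proj[ind1]))^2-(proj[i]-mean(proj[ind0]))^2
        predCV=as.numeric(TofX<0)
        errCV=errCV+mean(predCV!=Y[i])
    }
    errCV/B
}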
#Compute the CV criterion for each candidate number of components, and keep the global minimum among the first two local minima
for(j in 1:npsir)
{
    if(nbCV<2)
    {
        CV[j]=CVpsi(XXtrain,j)
        if(j>2)
        {
            #check your CV function to see if CV is flat in some parts.
            #If it is flat you can only find the mins if you use a non strict inequality
            if((CV[j-1]<=CV[j-2])&&(CV[j-1]<=CV[j]))
            {
                #CV[j-1] is a local minimum
                nbCV=nbCV+1
                #keep the best (smallest) of the local minima found so far
                if((nbCV==1)||(CV[j-1]<CV[npsirA]))
                    npsirA=j-1
            }
        }
    }#end if nbCV<2
}#end for j

#Compute the function hatpsir
hatpsir=0*phi[,1]
for(j in 1:npsirA) hatpsir=hatpsir+hatmuj[j]/lambda[j]*phi[,j]

#--------------------------------------------------------------------------
#--------------------------------------------------------------------------
# METHOD 2: Estimate psir computed through MPLSR (partial least squares)
#--------------------------------------------------------------------------
#--------------------------------------------------------------------------

#Compute the number of components to keep by CV
CV=rep(0,npsir)
nbCV=0
#m will be the number of components we keep
m=1
for(j in 1:npsir)
{
    if(nbCV<2)
    {
        CV[j]=CVmplsr(XXtrain,Y,j)
        if(j>2)
        {
            #check your CV function to see if CV is flat in some parts.
            #If it is flat you can only find the mins if you use a non strict inequality
            if((CV[j-1]<=CV[j-2])&&(CV[j-1]<=CV[j]))
            {
                nbCV=nbCV+1
                if((nbCV==1)||(CV[j-1]<CV[m]))
                    m=j-1
            }
        }
    }#end if nbCV<2
}#end for j

#Compute the resulting beta_r
hatpsipls=mplsr(XXtrain,Y,m)$COEF

#--------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------
# Apply the centroid classifier to the test data using each of the functions psir and compute the number of errors made by the classifier
#--------------------------------------------------------------------------------------------------------------------------------------------------
#--------------------------------------------------------------------------------------------------------------------------------------------------

#Will contain the labels of the classes to which each of the test curves is assigned by the classifier
ClassTestPC=rep(0,ntest)
ClassTestPLS=rep(0,ntest)

for(b in 1:ntest)
{
    #Test curve
    X=XXtest[b,]

    #Project the data on the estimated psir function (PC method)
    Xproj=X%*%hatpsir
    XXtrainproj=XXtrain%*%hatpsir
    #Compute the statistic used to classify the data
    TofX=(Xproj-mean(XXtrainproj[indClass1]))^2-(Xproj-mean(XXtrainproj[indClass0]))^2
    #Compute the class label
    if(TofX<0) ClassTestPC[b]=1

    #Project the data on the estimated function computed by PLS
    Xproj=X%*%hatpsipls*Deltat
    XXtrainproj=XXtrain%*%hatpsipls*Deltat
    #Compute the statistic used to classify the data
    TofX=(Xproj-mean(XXtrainproj[indClass1]))^2-(Xproj-mean(XXtrainproj[indClass0]))^2
    #Compute the class label
    if(TofX<0) ClassTestPLS[b]=1
}
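#The section header above mentions the number of errors made by each classifier. If the true
#class labels of the test curves are known, say in a 0/1 vector Ytest of length ntest (a
#hypothetical name, not defined in this file), the error counts can be obtained as follows.
if(exists("Ytest"))
{
    nbErrPC=sum(ClassTestPC!=Ytest)
    nbErrPLS=sum(ClassTestPLS!=Ytest)
    cat("Number of test errors, PC method:",nbErrPC,"- PLS method:",nbErrPLS,"\n")
}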