Kernel Parcimonious Gaussian Process

The paper entitled Parsimonious Gaussian Process Models for the Classification of Hyperspectral Remote Sensing Images has been accepted in the IEEE Geoscience and Remote Sensing Letters. A version of the paper can be found here.

The Python functions that implement the models can be found https://github.com/mfauvel/PGPDA. In this post, I am going to describe how the results presented in the paper have been obtained. Scikit-learn should be installed. Also, for best performances, a good linear algebra library is required. Openblas is a good option.

Experimental Setup

I use the following script to perform the experiment. Just change the name of the data to perform the corresponding data set. Send me an email to get the data in Scipy format.

import scipy as sp
import pgpda
import kernels
import accuracy_index as ai
import time
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.grid_search import GridSearchCV
import pickle

# Functions definitions
def compute_PGPDA(x,y,xt,yt,sig_r,dc_r,threshold_r,mod):
    model=pgpda.PGPDA(model=mod)
    sig,threshold,err=model.cross_validation(x, y,sig_r=sig_r,dc_r=dc_r,threshold_r=threshold_r)
    model.train(x,y)
    yp=model.predict(xt,x,y)
    conf = ai.CONFUSION_MATRIX()
    conf.compute_confusion_matrix(yp,yt)
    return conf.OA,conf.Kappa,conf.confusion_matrix

def compute_NPGPDA(x,y,xt,yt,sig_r,dc_r,threshold_r,mod):
    model=pgpda.NPGPDA(model=mod)
    sig,threshold,err=model.cross_validation(x, y,sig_r=sig_r,dc_r=dc_r,threshold_r=threshold_r)
    model.train(x,y)
    yp=model.predict(xt,x,y)
    conf = ai.CONFUSION_MATRIX()
    conf.compute_confusion_matrix(yp,yt)
    return conf.OA,conf.Kappa,conf.confusion_matrix

def compute_SVM(x,y,xt,yt,param_grid_svm):
    y.shape=(y.size,)    
    cv = KFold(y.size, n_folds=5)
    grid = GridSearchCV(SVC(), param_grid=param_grid_svm, cv=cv,n_jobs=2)
    grid.fit(x, y)
    clf = grid.best_estimator_
    clf.fit(x,y)
    yp = clf.predict(xt).reshape(yt.shape)
    conf = ai.CONFUSION_MATRIX()
    conf.compute_confusion_matrix(yp,yt)
    return conf.OA,conf.Kappa,conf.confusion_matrix

def compute_RF(x,y,xt,yt,param_grid_rf):
    y.shape=(y.size,)    
    cv = KFold(y.size, n_folds=5)
    grid = GridSearchCV(RandomForestClassifier(), param_grid=param_grid_rf, cv=cv,n_jobs=2)
    grid.fit(x, y)
    clf = grid.best_estimator_
    clf.fit(x,y)
    yp = clf.predict(xt).reshape(yt.shape)
    conf = ai.CONFUSION_MATRIX()
    conf.compute_confusion_matrix(yp,yt)
    return conf.OA,conf.Kappa,conf.confusion_matrix

def compute_KDA(x,y,xt,yt,mu_r,sig_r):
    model=pgpda.KDA()
    sig,mu,err=model.cross_validation(x, y,sig_r=sig_r,mu_r=mu_r)
    model.train(x,y)
    yp=model.predict(xt,x,y)
    conf = ai.CONFUSION_MATRIX()
    conf.compute_confusion_matrix(yp,yt)
    return conf.OA,conf.Kappa,conf.confusion_matrix


if __name__ == '__main__':
    # Load spectra
    data_name = 'aisa' # 'ksc' 'uni' 'aisa'
    data  = sp.load(data_name+'.npz')
    X = data['X']
    Y = data['Y']
    X = pgpda.scale(X)[0]
    del data

    # Set Parameters
    MODELS_NPG = ['NM0','NM1','NM2','NM3','NM4']
    MODELS_PG = ['M0','M1','M2','M3','M4','M5','M6']
    REP = 20
    Nt = 50
    NT = 1000
    n,d=X.shape
    C = Y.max()

    sig_r = 2.0**sp.arange(-8,4) # PGPDA
    threshold_r = sp.linspace(0.70,0.9999,15) # PGPDA
    dc_r =sp.arange(2,Nt,3) # PGPDA
    penalty = 10.0**sp.arange(-2,6) # SVM
    param_grid_svm = dict(gamma=sig_r, C=penalty) # SVM
    n_estimators=sp.arange(10,Nt,(Nt-10)/10) # RF
    param_grid_rf = dict(n_estimators=n_estimators) # RF
    mu_r =10.0**sp.arange(-8,2) # KDA

    RES = {'M0':[[],[],[],[]],'M1':[[],[],[],[]],'M2':[[],[],[],[]],'M3':[[],[],[],[]],'M4':[[],[],[],[]],'M5':[[],[],[],[]],'M6':[[],[],[],[]],'NM0':[[],[],[],[]],'NM1':[[],[],[],[]],'NM2':[[],[],[],[]],'NM3':[[],[],[],[]],'NM4':[[],[],[],[]],'SVM':[[],[],[],[]],'RF':[[],[],[],[]],'KDA':[[],[],[],[]]}

    # Start the repetition
    r=0
    while r<REP:
        print "Repetition number ",str(r)
        # Random selection of the sample
        x = sp.array([]).reshape(0,d)
        y = sp.array([]).reshape(0,1)
        xt = sp.array([]).reshape(0,d)
        yt = sp.array([]).reshape(0,1)    

        sp.random.seed(r)
        for i in range(C):            
            t = sp.where((i+1)==Y)[0]
            nc = t.size
            rp =  sp.random.permutation(nc)
            x = sp.concatenate((X[t[rp[0:Nt]],:],x))
            xt = sp.concatenate((X[t[rp[-NT:]],:],xt))
            y = sp.concatenate((Y[t[rp[0:Nt]]],y))
            yt = sp.concatenate((Y[t[rp[-NT:]]],yt))

        # PGPDA
        for mod in MODELS_PG:
            ts=time.time()
            res = compute_PGPDA(x,y,xt,yt,sig_r,dc_r,threshold_r,mod)
            [RES[mod][i].append(res[i]) for i in range(3)]
            RES[mod][3].append(time.time()-ts)

        # NPGPDA
        for mod in MODELS_NPG:
            ts=time.time()
            res = compute_NPGPDA(x,y,xt,yt,sig_r,dc_r,threshold_r,mod)
            [RES[mod][i].append(res[i]) for i in range(3)]
            RES[mod][3].append(time.time()-ts)

        # SVM
        ts=time.time()
        res = compute_SVM(x,y,xt,yt,param_grid_svm)
        [RES['SVM'][i].append(res[i]) for i in range(3)]
        RES['SVM'][3].append(time.time()-ts)

        # RF
        ts=time.time()
        res = compute_RF(x,y,xt,yt,param_grid_rf)
        [RES['RF'][i].append(res[i]) for i in range(3)]
        RES['RF'][3].append(time.time()-ts)

        # KDA
        ts=time.time()
        res = compute_KDA(x,y,xt,yt,mu_r,sig_r)
        [RES['KDA'][i].append(res[i]) for i in range(3)]
        RES['KDA'][3].append(time.time()-ts)
        r += 1

    # Save Results
    output = open('res_full'+data_name, 'wb')
    pickle.dump(RES, output)
    output.close()

Compute accuracy

I generate the org table with the following python code (again, change the name of the data set accordingly).

import scipy as sp
import scipy.stats as stats
import pickle

# Load data
RES = pickle.load(open("res_fullaisa.npz","rb"))
NC = 16
NS = 20

# Name of methods
NAME = ['M0','M1','M2','M3','M4','M5','M6','NM0','NM1','NM2','NM3','NM3','KDA','RF','SVM']

# File setup
orgfile = open("res_aisa.org","w")
orgfile.write('|Method')
for name in NAME:
    orgfile.write('|'+name)
orgfile.write('|\n')
orgfile.write('|--+ --+--|\n')

# Write OA
orgfile.write('|OA')
for name in NAME:
    val =int(1000*sp.mean(RES[name][0]))/10.0
    orgfile.write('|'+str(val))
orgfile.write('|\n')

# Write Kappa
orgfile.write('|Kappa')
for name in NAME:
    val =int(1000*sp.mean(RES[name][1]))/1000.0
    orgfile.write('|'+str(val))
orgfile.write('|\n')

# Write CPU
orgfile.write('|CPT')
for name in NAME:
    val =int(sp.mean(RES[name][3]))
    orgfile.write('|'+str(val))
orgfile.write('|\n')

# Compute mean confusion matrix
UA = []
PA = []
for name in NAME:
    confu = sp.zeros(RES['M0'][2][0].shape)
    for i in range(NS):
        confu+=RES[name][2][i]
    confu/=20
    PA.append(sp.diag(confu)/confu.sum(axis=0))
    UA.append(sp.diag(confu)/confu.sum(axis=1))


orgfile.write('|--+ --+--|\n')
for j in range(NC):
    orgfile.write('|PA '+str(j))
    for i in range(len(NAME)):
        orgfile.write('|'+str(int(1000.0*PA[i][j])/10.0))
    orgfile.write('|\n')

orgfile.write('|--+ --+--|\n')
for j in range(NC):
    orgfile.write('|UA '+str(j))
    for i in range(len(NAME)):
        orgfile.write('|'+str(int(1000.0*UA[i][j])/10.0))
    orgfile.write('|\n')  

orgfile.write('\n')
orgfile.write('\n')  

# Write RANK TEST
orgfile.write('| ')
for name in NAME:
    orgfile.write('|'+name)
orgfile.write('|\n')
orgfile.write('|--+ --+--|\n')

# Rank test
for name in NAME:
    orgfile.write('|'+name)
    for name1 in NAME:
        z,p=stats.ranksums(RES[name][1],RES[name1][1])
        orgfile.write('|'+str(z))
    orgfile.write('|\n')              

orgfile.close()

Finally I (and you should!) get the tabs reported in the article.

Experimental Setup

Compute accuracy

Comments