import os
import sys

# Extend the module search path with the PyML locations used on the
# original author's machine.  If PyML is installed in your site-packages
# directory you won't need an equivalent of these entries.
sys.path.extend([
    "/home/asa/PyML",
    "/home/asa/PyML/ext",
    "/home/asa/",
])

# Directory layout of the project data.  These definitions reflect the file
# structure on the original author's machine; change them according to your
# setup.
# The HOME environment variable is used when it is set.  When it is missing,
# fall back to os.path.expanduser('~') instead of failing with a KeyError.
userHome = os.environ.get('HOME', os.path.expanduser('~'))
homeDir = os.path.join(userHome, 'sppi')
dataDir = os.path.join(homeDir, 'data')
datasetDir = os.path.join(dataDir, 'interactionDatasets')

import datafunc

# where the sequence kernel lives on disk
kernelName = 'motifNRplusPfamPlusSpectrum3.kernel'
kernelFile = os.path.join(dataDir, 'kernels', kernelName)

# load the sequence kernel from kernelFile
kdata = datafunc.KernelData(kernelFile)

# build a pair dataset on top of the sequence kernel
import pairData
pairFile = os.path.join(datasetDir, 'yeastBind.data')
p = pairData.PairDataSet(pairFile, data=kdata)

# load the feature data (homology, GO and MCC data); labels are taken
# from column 1 (labelsColumn)
featureFile = os.path.join(datasetDir, 'yeastBindHomologyGOmcc.data')
fdata = datafunc.SparseCDataSet(featureFile, labelsColumn=1)

# rescale the features: center each variable and divide by its
# standard deviation
import preproc
rescaler = preproc.Rescale()
rescaler.train(fdata)

# the feature data gets a gaussian kernel
fdata.attachKernel('gaussian', gamma=0.1)

# combine the pairwise kernel data and the feature data into one dataset
components = [p, fdata]
data = datafunc.Aggregate(components)

# The mcc feature has to be computed on the fly from the training data, so
# there is one function for training and one for testing.  PyML lets a
# dataset name a "train" and a "test" function that is called prior to
# training or testing a classifier; that mechanism is used here.
from sppi.src import mcc
data.setTrainingFunc(mcc.addDataTrain)
data.setTestingFunc(mcc.addDataTest)

# create an SVM that uses the PyML native optimizer
import svm
s = svm.SVM(optimizer='mysmo')

# run stratified CV with a fixed random number generator seed
cvSeed = 1
r = s.stratifiedCV(data, seed=cvSeed)

# save the results under resultsFile
resultsFile = 'seqKernel_plus_HomologyGOmcc.pyd'
r.save(resultsFile)
