import os,sys

# make the PyML sources (and the directory holding the local packages)
# importable; none of this is needed if PyML is already installed in your
# site-packages directory
sys.path.extend([
    "/home/asa/PyML",
    "/home/asa/PyML/ext",
    "/home/asa/",
])

# locations of the project and its data; they mirror the file layout on the
# original author's machine, so adjust them to your own setup.
# expanduser('~') is used rather than os.environ['HOME'] so that the script
# does not die with a KeyError where HOME is not set (e.g. on Windows)
homeDir = os.path.join(os.path.expanduser('~'), 'sppi')
dataDir = os.path.join(homeDir, 'data')
datasetDir = os.path.join(dataDir, 'interactionDatasets')

# load the dataset from the interaction datasets directory; labelsColumn = 1
# is handed to SparseCDataSet to locate the labels:
import datafunc
dataFile = os.path.join(datasetDir, 'yeastBindHomologyGOmcc.data')
data = datafunc.SparseCDataSet(dataFile, labelsColumn=1)

# standardize the features, i.e. center each variable and divide it by its
# standard deviation:
import preproc
rescaler = preproc.Rescale()
rescaler.train(data)

# the mcc feature cannot be precomputed: it has to be derived on the fly from
# the training data, with one function for training and another for testing.
# PyML lets a dataset register a "train" and a "test" function that are
# called before a classifier is trained or tested on it, so both are
# registered here
from sppi.src import mcc
data.setTrainingFunc(mcc.addDataTrain)
data.setTestingFunc(mcc.addDataTest)

# Gaussian kernel; its gamma (width) parameter is the one tuned below
import ker
k = ker.Gaussian()

import svm
import modelSelection

# candidate kernel widths; the model selector chooses among them using CV,
# comparing candidates by their roc50 score:
gammaGrid = [0.001, 0.01, 0.1, 1]
param = modelSelection.Param(svm.SVM(k), 'kernel.gamma', gammaGrid)
m = modelSelection.ModelSelector(param, measure='roc50')

# run stratified CV with a fixed random number generator seed:
r = m.stratifiedCV(data, seed=1)

# save the results:
r.save('goMCChomology.gaussian.select.pyd')
