# Stratified cross-validation of a PyML SVM on a dataset that combines a
# pairwise sequence kernel with a feature dataset (homology, GO and MCC
# data).  The results are saved to 'seqKernel_plus_HomologyGOmcc.pyd' in the
# current working directory.
#
# NOTE: this is a flat script; statement order matters (the sys.path setup
# must run before the PyML modules are imported).

import os
import sys

# if PyML is installed in your site-packages directory you won't
# need an equivalent of the following lines
sys.path.append("/home/asa/PyML")
sys.path.append("/home/asa/PyML/ext")
sys.path.append("/home/asa/")

# definitions that reflect the file structure on my machine
# you will need to change those according to your setup
homeDir = os.path.join(os.environ['HOME'], 'sppi')
dataDir = os.path.join(homeDir, 'data')
datasetDir = os.path.join(dataDir, 'interactionDatasets')

import datafunc

# the location of the sequence kernel:
kernelFile = os.path.join(dataDir, 'kernels',
                          'motifNRplusPfamPlusSpectrum3.kernel')

# read the sequence kernel
kdata = datafunc.KernelData(kernelFile)

# construct a dataset from the sequence kernel:
import pairData
p = pairData.PairDataSet(os.path.join(datasetDir, 'yeastBind.data'),
                         data=kdata)

# read the feature data (homology, GO and MCC data)
fdata = datafunc.SparseCDataSet(
    os.path.join(datasetDir, 'yeastBindHomologyGOmcc.data'),
    labelsColumn=1)

# center each variable and divide by its standard deviation:
import preproc
rescaler = preproc.Rescale()
rescaler.train(fdata)

# use a gaussian kernel for the feature data:
fdata.attachKernel('gaussian', gamma=0.1)

# construct a dataset out of the feature data and the pairwise kernel data
data = datafunc.Aggregate([p, fdata])

# the mcc feature needs to be computed on the fly based on the training data;
# one function is for training, and one for testing.
# we use the PyML mechanism of registering a "train" and a "test" function
# on the dataset; each is called prior to training or testing a classifier
from sppi.src import mcc
data.setTrainingFunc(mcc.addDataTrain)
data.setTestingFunc(mcc.addDataTest)

# instantiate an SVM that uses the PyML native optimizer
import svm
s = svm.SVM(optimizer='mysmo')

# perform CV with a given random number generator seed:
r = s.stratifiedCV(data, seed=1)

# save the results
r.save('seqKernel_plus_HomologyGOmcc.pyd')