#------------------------------------------------------------------------------ #------------------------------------------------------------------------------ """ This program takes the list of Viterbi output files (from the 2nd stage of the Philius decoder, together with the posterior probabilities computed by gmtkJT in the first stage, and writes out a report file containing the predicted protein types and segments, together with confidence scores for each of the input proteins. Copyright 2008 University of Washington """ __author__ = "Sheila M. Reynolds (sheila@ee.washington.edu)" #------------------------------------------------------------------------------ numTypes = 4 pString = [0] * numTypes pString[0] = "GLOBULAR" pString[1] = "GLOBULAR with SIGNAL PEPTIDE" pString[2] = "TM PROTEIN" pString[3] = "TM PROTEIN with SIGNAL PEPTIDE" topoString = [0] * 6 topoString[0] = "BAD TOPOLOGY INDEX !!!" topoString[1] = "inside " topoString[2] = "TMhelix " topoString[3] = "outside " topoString[4] = "BAD TOPOLOGY INDEX ???" topoString[5] = "signal-peptide " topoString[1] = "cytoplasmic " topoString[2] = "TM helix " topoString[3] = "non-cytoplasmic" #------------------------------------------------------------------------------ # here we just want to return a string that indicates the predicted protein # type : GLOBULAR type0 # GLOBULAR with SIGNAL PEPTIDE type1 # TM PROTEIN type2 # TM PROTEIN with SIGNAL PEPTIDE type3 def getPredictedType ( labelVec ): # first check to see if there is a signal-peptide ... if ( 5 in labelVec ): spBit = 1 else: spBit = 0 # then check to see if there is a TM segment ... if ( 2 in labelVec ): tmBit = 1 else: tmBit = 0 return ( spBit + 2*tmBit ) #------------------------------------------------------------------------------ def getSPlength ( labelVec ): if ( 5 not in labelVec ): return ( 0 ) nn = len(labelVec) numAAs = 0 for ii in range(nn): if ( labelVec[ii] == 5 ): numAAs += 1 return ( numAAs ) #------------------------------------------------------------------------------ def getNumberTMHelices ( labelVec ): # print labelVec if ( 2 not in labelVec ): return ( 0, 0 ) nn = len(labelVec) tmpVec = labelVec + [99] numTMHs = 0 numAAs = 0 ii = 0 while ( ii < nn ): while ( tmpVec[ii] != 2 and ii < nn ): ii += 1 if ( tmpVec[ii] == 2 ): numTMHs += 1 while ( tmpVec[ii] == 2 ): ii += 1 numAAs += 1 return ( numTMHs, numAAs ) #------------------------------------------------------------------------------ def topoClass2label ( iTopoClass ): if ( iTopoClass == 1 ): return 1 if ( iTopoClass == 2 ): return 2 if ( iTopoClass == 3 ): return 3 if ( iTopoClass == 4 ): return 5 print ' ERROR in topoClass2label ??? ', iTopoClass sys.exit(-1) #------------------------------------------------------------------------------ def label2topoClass ( iLabel ): if ( iLabel == 1 ): return 1 if ( iLabel == 2 ): return 2 if ( iLabel == 3 ): return 3 if ( iLabel == 5 ): return 4 print ' ERROR in label2topoClass ??? ', iLabel sys.exit(-1) #------------------------------------------------------------------------------ def getViterbiSeqs ( vitFilename ): # print ' ' # print ' ' # print ' in getViterbiSeqs ... ', vitFilename # number of variables dumped out by the Viterbi pass ... numDump = 4 try: fhVit = file ( vitFilename ) except: print ' ERROR in getViterbiSeqs : error opening file ', vitFilename sys.exit(-1) wholeFile = fhVit.read() fhVit.close() numSamplesPerRec = numDump numBytesPerInt = 4 numBytesPerRec = numSamplesPerRec * numBytesPerInt numBytes = len(wholeFile) numSamples = numBytes / numBytesPerRec vitStates = [0] * numSamples vitLabels = [0] * numSamples iByte = 0 for ii in range(numSamples): # the hidden variables currently being written out by gmtkViterbi # are : iState, iToppoClass, spBit, and tmBit ( iState, iTopoClass, spBit, tmBit ) = \ unpack ( '0 ): i1 -= 1 i1 += 1 i2 = i1 + 1 while ( aFilename[i2] != '.' and i2 0.99 ): logScore = 0.99 if ( linScore > 0.99 ): linScore = 0.99 # print ' computing new segment score : ', sumLog, sumN, logScore, linScore segScores += [ ( logScore, linScore ) ] if ( iLabel < numLabels ): lastLabel = tmpLabels[iLabel] sumLog = log ( tmpProbs[iLabel] ) sumLin = tmpProbs[iLabel] sumN = 1 else: sumLog += log ( tmpProbs[iLabel] ) sumLin += tmpProbs[iLabel] sumN += 1 # print ' segScores : ', segScores # print ' range : ', min(segScores), max(segScores) # print fhJT.tell() return ( segScores ) #------------------------------------------------------------------------------ def mapDownFromTen ( iTenClass ): if ( iTenClass == 0 ): print ' ERROR ??? how can this be ??? ' sys.exit(-1) if ( iTenClass < 5 ): return ( iTenClass ) if ( iTenClass == 5 ): print ' ERROR ??? how can this be ??? ' sys.exit(-1) return ( iTenClass-5 ) #------------------------------------------------------------------------------ def getJTposteriors ( fhJT ): # print ' in getJTposteriors ... ', fhJT.tell() # print fhJT expNumTMaa = 0. expNumSPaa = 0. numTopoClasses = 5 topoProbs0 = [0.] * numTopoClasses topoProbsN = [0.] * numTopoClasses # we should be at the "Segment" line ... aLine = fhJT.readline() tokenList = aLine.split() # print len(tokenList), tokenList totalProb = float ( tokenList[6][:-1] ) perFrameP = float ( tokenList[13] ) # print totalProb, perFrameP # skip the '----' line ... aLine = fhJT.readline() # next read the first Partition ... aLine = fhJT.readline() # print aLine tokenList = aLine.split() # print len(tokenList), tokenList numEntries0 = int ( tokenList[10] ) cliqueH0 = float ( tokenList[12][2:] ) # print ' reading %d entries ... ', numEntries0 for ii in range(numEntries0): aLine = fhJT.readline() tokenList = aLine.split() prob = float ( tokenList[1] ) iTenClass = int ( tokenList[2][-1] ) iclass = mapDownFromTen ( iTenClass ) # print tokenList, prob, iTenClass, iclass topoProbs0[iclass] = prob if ( iclass == 2 ): expNumTMaa += prob if ( iclass == 4 ): expNumSPaa += prob # print ' got the first partition ... ' # and then we need to skip forward until we get to the last partition ... # but in the meanwhile we want to look at each partition and find the one # with the highest probability of having topoClass=2 # 18-Jan-08 : also want to count up the total probability of being in # the membrane ... ( topoClass=2 ) maxTM_p = -1. maxTM_h = -1. done = 0 while not done: aLine = fhJT.readline() # print ' A : ', aLine if ( aLine.startswith("---") ): aLine = fhJT.readline() # print ' B : ', aLine if ( aLine.startswith("Partition ") ): if ( aLine.find(" (E), Clique") > 0 ): done = 1 else: tokenList = aLine.split() # print ' C : ', tokenList tmpNum = int ( tokenList[10] ) tmpH = float ( tokenList[12][2:] ) for ii in range(tmpNum): aLine = fhJT.readline() tokenList = aLine.split() prob = float ( tokenList[1] ) iclass = int ( tokenList[2][-1] ) if ( iclass == 2 ): if ( prob > maxTM_p ): maxTM_p = prob maxTM_h = tmpH expNumTMaa += prob if ( iclass == 4 ): expNumSPaa += prob # print ' maxTM : ', maxTM_p, maxTM_h # and then we read the last Partition ... # print aLine tokenList = aLine.split() # print len(tokenList), tokenList numEntriesN = int ( tokenList[10] ) cliqueHN = float ( tokenList[12][2:] ) for ii in range(numEntriesN): aLine = fhJT.readline() tokenList = aLine.split() # print len(tokenList), tokenList prob = float ( tokenList[1] ) iTenClass = int ( tokenList[2][-1] ) iclass = mapDownFromTen ( iTenClass ) topoProbsN[iclass] = prob if ( iclass == 2 ): expNumTMaa += prob if ( iclass == 4 ): expNumSPaa += prob # and we also need to take a look at the information in the new # extra clique in the last Partition ... aLine = fhJT.readline() # print aLine aLine = fhJT.readline() # print aLine tokenList = aLine.split() numEntriesN = int ( tokenList[10] ) typeHN = float ( tokenList[12][2:] ) typeProbs = [0] * 4 for ii in range(numEntriesN): aLine = fhJT.readline() # print aLine tokenList = aLine.split() prob = float ( tokenList[1] ) itype = int ( tokenList[2][-1] ) typeProbs[itype] = prob if ( iclass == 2 ): expNumTMaa += prob if ( iclass == 4 ): expNumSPaa += prob # print topoProbs0 # print topoProbsN # print ' typeHN : ', typeHN # print ' typeProbs : ', typeProbs # sys.exit(-1) return ( totalProb, perFrameP, \ cliqueH0, topoProbs0, \ cliqueHN, topoProbsN, \ maxTM_p, maxTM_h, \ typeHN, typeProbs, \ expNumSPaa, expNumTMaa ) #------------------------------------------------------------------------------ def writeOneReport ( fhOut, vitFilename, fhJT ): # print ' ' # print ' ' # print ' >>>>>>>>>>>>>>>>>>>>> ' # print ' in writeOneReport ... ' # print vitFilename pName = getProteinName ( vitFilename ) # print pName fhOut.write ( '\n#\n' ) fhOut.write ( '# Name %s\n' % pName ) ( vitStates, vitLabels ) = getViterbiSeqs ( vitFilename ) # print vitStates # print vitLabels if ( 1 ): jtPos0 = fhJT.tell() ( jtTotalProb, jtPerFrameP, \ cliqueH0, topoProbs0, \ cliqueHN, topoProbsN, \ maxTM_p, maxTM_h, \ typeHN, typeProbs, \ expNumSPaa, expNumTMaa ) = getJTposteriors ( fhJT ) jtPos1 = fhJT.tell() # print ' fhJT file positions : ', jtPos0, jtPos1 # print jtTotalProb, jtPerFrameP, cliqueH0, cliqueHN, maxTM_p, maxTM_h, typeHN, typeProbs # print topoProbs0 # print topoProbsN # sys.exit(-1) # we need to get something from fhJT regarding this protein, so we need # to rewind ... fhJT.seek(jtPos0) segScores = getSegmentScores ( vitLabels, fhJT ) # and then go back to where getJTposteriors had left us before ... fhJT.seek(jtPos1) pType = getPredictedType ( vitLabels ) fhOut.write ( '# Annotation %s\n' % pString[pType] ) fhOut.write ( '# Length %6d\n' % len(vitLabels) ) numAAs = getSPlength ( vitLabels ) fhOut.write ( '# Number of AAs in SP %3d %5.1f\n' % ( numAAs, expNumSPaa ) ) ( numTMHs, numAAs ) = getNumberTMHelices ( vitLabels ) fhOut.write ( '# Number of predicted TMHs %3d\n' % numTMHs ) if ( numTMHs > 0 ): fhOut.write ( '# Number of AAs in TMHs %3d %5.1f %4.1f\n' % \ ( numAAs, expNumTMaa, (float(numAAs)/float(numTMHs)) ) ) else: fhOut.write ( '# Number of AAs in TMHs %3d %5.1f\n' % \ ( numAAs, expNumTMaa ) ) # first we write out the overall protein-type confidence score if ( pType < 2 ): fhOut.write ( '# Score %4.2f\n' % min(0.99,typeProbs[pType]) ) else: fhOut.write ( '# Scores %4.2f' % min(0.99,typeProbs[pType]) ) # a second topology confidence score is written out ONLY for TM proteins # ( including SP+TM proteins ) if ( pType == 2 ): # for TM proteins, we take the MIN() over the first and last IN/OUT # segments and over the individual TM helices newScore = min ( segScores[0][1], segScores[-1][1] ) for iSeg in range(1,len(segScores),2): if ( newScore > segScores[iSeg][1] ): newScore = segScores[iSeg][1] elif ( pType == 3 ): # for SP+TM proteins, we do the same thing, but which segments are the # TM helices is shifted over by 1 ... newScore = min ( segScores[0][1], segScores[-1][1] ) for iSeg in range(2,len(segScores),2): if ( newScore > segScores[iSeg][1] ): newScore = segScores[iSeg][1] fhOut.write ( ' %4.2f\n' % newScore ) fhOut.write ( '#\n' ) if ( pType > 0 ): writeDetailedTopology ( fhOut, vitStates, vitLabels, segScores ) fhOut.write ( '\n\n\n' ) fhOut.write ( '----------------------------------------------------------------------\n' ) # sys.exit(-1) #------------------------------------------------------------------------------ def writeAllReports ( fhOut, vitFileList, jtFileName ): print ' in writeAllReports : ' print ' vitFileList : ', vitFileList print ' jtFileName : ', jtFileName fhVit = file ( vitFileList ) fhJT = file ( jtFileName ) for aLine in fhVit: vitFilename = aLine[:-1] writeOneReport ( fhOut, vitFilename, fhJT ) #------------------------------------------------------------------------------ #------------------------------------------------------------------------------ # test-driver is here import sys from struct import * from math import * if __name__ == "__main__": if ( len(sys.argv)!=3 ): print ' Usage : %s ' % sys.argv[0] sys.exit(-1) if ( len(sys.argv) == 3 ): vitFileList = sys.argv[1] jtFileName = sys.argv[2] i1 = vitFileList.find('vit') + 3 i2 = vitFileList.find('.list') + 1 if ( i2 <= i1 ): print ' ERROR ??? unexpected file name ??? ' print vitFileList print i1, i2 sys.exit(-1) reportName = 'Philius.report.out' print ' output report name : ', reportName fhOut = file ( reportName, 'w' ) writeAllReports ( fhOut, vitFileList, jtFileName ) #------------------------------------------------------------------------------ #------------------------------------------------------------------------------