import sys # Parse the command line. if (len(sys.argv) != 3): print "USAGE: compute-t-statistics.py " sys.exit(1) myFile1 = sys.argv[1] myFile2 = sys.argv[2] ############################################################################## # Read a tab-delimited text file into a dictionary, with gene IDs as # keys and arrays of expression values as value. def readExpressionFile(myFileName): # Initialize the dictionary. returnValue = {} # Open the file. myFile = open(myFileName, "r") # Skip the first line. myFile.readline() # Read the remaining lines. for line in myFile: line = line.rstrip() # Parse this line. words = line.split("\t") geneID = words[0] valueStrings = words[1:] # Convert to floats. values = [] for valueString in valueStrings: values.append(float(valueString)) # Store the list of values. returnValue[geneID] = values return(returnValue) ############################################################################## # Compute a t statistic for two arrays of numbers. def computeT(firstSet, secondSet): # Compute the mean of each array. firstMean = computeMean(firstSet) secondMean = computeMean(secondSet) # Compute the sum-of-squares for each array. firstSS = computeSS(firstMean, firstSet) secondSS = computeSS(secondMean, secondSet) # Compute the pooled variance. pooledVariance = (firstSS + secondSS) / (len(firstSet) + len(secondSet) - 2) # Compute the t-statistic and print. # N.B. I didn't include the absolute value, to allow a two-tailed statistic. tStatistic = ((firstMean - secondMean) / math.sqrt((pooledVariance / len(firstSet)) + (pooledVariance / len(secondSet)))) return(tStatistic) ############################################################################## # MAIN PROCEDURE ############################################################################## # Read a dictionary from each file. myDict1 = readExpressionFile(myFile1) myDict2 = readExpressionFile(myFile2) # Iterate through the genes, computing t-statistics. for gene in myDict1.keys(): # Check to be sure this gene is in the other file. if (myDict2.has_key(gene)): tStat = computeT(myDict1[gene], myDict2[gene]) print "%s\t%g" % (gene, tStat)