#!/usr/bin/env python # CREATE DATE: 31 Jan 2018 # AUTHOR: WSN import sys import math USAGE = """USAGE: compute-motif-pvalue.py [] Input is (1) 0 a tab-delimited integer PSSM where rows are letters (ACGT) and columns are positions, and (2) a score. The second input is optional. If only a motif is provided as input, then the output is the DP matrix, where the final row is the count of scores for all possible sequences of length W. If a score is also provided, then the DP matrix is not printed and instead the p-value associated with the score is printed. """ # Parse the command line. if ( (len(sys.argv) < 2) or (len(sys.argv) > 3) ): sys.stderr.write(USAGE) sys.exit(1) motif_file_name = sys.argv[1] if (len(sys.argv) == 3): observed_score = int(sys.argv[2]) else: observed_score = None # Read the PSSM. pssm = [] max_value = 0 in_file = open(motif_file_name, "r") for line in in_file: words = line.rstrip().split("\t") this_row = [] for word in words: this_value = int(word) this_row.append(this_value) if (this_value > max_value): max_value = this_value pssm.append(this_row) in_file.close() width = len(pssm[0]) sys.stderr.write("Read %d by %d motif with maximum value %d.\n" % (len(pssm), width, max_value)) dp = [] # Print the header. if (observed_score == None): line = "" for col_index in range(0, width * max_value): line += "{:3} ".format(col_index) print(line) # Fill in the first row of the DP matrix. first_row = [0] * (width * max_value) for nuc_index in range(0, 4): score = pssm[nuc_index][0] first_row[score] += 1 dp.append(first_row) # Print the line. if (observed_score == None): line = "" for value in first_row: line += "{:3} ".format(value) print(line) # Fill in the remaining rows. for row_index in range(1, width): new_row = [0] * (width * max_value) for prev_col_index in range(0, width * max_value): prev_count = dp[row_index-1][prev_col_index] if ( prev_count > 0): for nuc_index in range(0, 4): new_col = prev_col_index + pssm[nuc_index][row_index] new_row[new_col] += prev_count dp.append(new_row) # Print the line. if (observed_score == None): line = "" for value in new_row: line += "{:3} ".format(value) print(line) # Compute the p-value. if (observed_score != None): my_sum = 0 for this_score in range(observed_score, len(new_row)): my_sum += new_row[this_score] num_sequences = math.pow(len(pssm), width) print("p-value: {:d}/{:.0f}={:.3e}".format(my_sum, num_sequences, my_sum / num_sequences))