"""Report the tags that occur exactly once in a genome.

Usage: find-unique-tags.py <genome> <tags> <output>

The genome file holds one chromosome sequence per line and the tags file
holds one tag per line.  Every tag that occurs exactly once across all
chromosomes is written, one per line, to the output file.
"""

import sys


#############################################################################
def readSequenceList(genomeFilename):
    """Read a genome file and return its chromosomes as a list of strings.

    Each line of the file is treated as one chromosome; trailing whitespace
    (including the newline) is stripped.  A summary of the number of bases
    and chromosomes read is printed to standard output.
    """
    # The "with" block guarantees the file is closed even if reading fails.
    with open(genomeFilename, "r") as genomeFile:
        returnValue = [chromosome.rstrip() for chromosome in genomeFile]

    numBases = sum(len(chromosome) for chromosome in returnValue)

    # Tell the user what happened.
    print("Read %d bases in %d chromosomes from %s."
          % (numBases, len(returnValue), genomeFilename))
    return returnValue


#############################################################################
def countOccurrences(sequence, pattern, maxCount=2):
    """Count occurrences of pattern in sequence, including overlapping ones.

    Counting stops as soon as maxCount occurrences have been seen, so the
    return value is min(actual number of occurrences, maxCount).  The
    default of 2 is all that is needed to tell "unique" from "repeated".
    """
    count = 0
    position = sequence.find(pattern)
    while position != -1:
        count += 1
        if count >= maxCount:
            break
        # Advance by one base only: str.count() would skip past the whole
        # match and therefore miss overlapping hits (e.g. "AAA" in "AAAA").
        position = sequence.find(pattern, position + 1)
    return count


#############################################################################
def isUniqueTag(tag, genome):
    """Return True if tag occurs exactly once across all chromosomes."""
    numMatches = 0
    for chromosome in genome:
        numMatches += countOccurrences(chromosome, tag)
        # A second hit settles the question; skip the remaining chromosomes.
        if numMatches > 1:
            return False
    return numMatches == 1


#############################################################################
def findUniqueTags(genome, tagsFilename, outputFilename):
    """Write every tag from tagsFilename that is unique in genome.

    Blank lines in the tags file are ignored.  Returns a tuple
    (numTags, numUniques): the number of tags examined and the number
    written to outputFilename.
    """
    numTags = 0
    numUniques = 0

    # Open the output file first, then read the tags line by line.
    with open(outputFilename, "w") as outputFile:
        with open(tagsFilename, "r") as tagsFile:
            for tag in tagsFile:
                tag = tag.rstrip()

                # An empty tag matches at every position, so its match
                # count is meaningless; skip blank lines altogether.
                if not tag:
                    continue

                # Decide whether it's unique.
                if isUniqueTag(tag, genome):
                    outputFile.write("%s\n" % tag)
                    numUniques += 1
                numTags += 1

    return (numTags, numUniques)


#############################################################################
# MAIN PROCEDURE
#############################################################################
def main(argv=None):
    """Run the command-line tool.

    argv defaults to sys.argv and must hold the script name followed by the
    genome, tags and output filenames.  Prints a usage message and exits
    with status 1 if the argument count is wrong.
    """
    if argv is None:
        argv = sys.argv

    # Parse the command line.
    if len(argv) != 4:
        print("USAGE: find-unique-tags.py <genome> <tags> <output>")
        sys.exit(1)
    genomeFilename = argv[1]
    tagsFilename = argv[2]
    outputFilename = argv[3]

    # Read the genome into memory.
    genome = readSequenceList(genomeFilename)

    # Find the unique tags and write them out.
    (numTags, numUniques) = findUniqueTags(genome, tagsFilename,
                                           outputFilename)

    # Tell the user what happened.
    print("Read %d tags from %s." % (numTags, tagsFilename))
    print("Printed %d unique tags to %s." % (numUniques, outputFilename))


if __name__ == "__main__":
    main()