import sys # Parse the command line. if (len(sys.argv) != 4): print "USAGE: map-tags.py " sys.exit(1) genomeFilename = sys.argv[1] tagsFilename = sys.argv[2] outputFilename = sys.argv[3] ############################################################################# def readSequenceList(genomeFilename): returnValue = [] numBases = 0 # Open the file for reading. genomeFile = open(genomeFilename, "r") # Each line is one chromosome. for chromosome in genomeFile: chromosome = chromosome.rstrip() numBases += len(chromosome) returnValue.append(chromosome) # Tell the user what happened. print "Read %d bases in %d chromosomes from %s." % (numBases, len(returnValue), genomeFilename) return(returnValue) ############################################################################# # MAIN PROCEDURE ############################################################################# # Read the genome into memory. genome = readSequenceList(genomeFilename) # Open the ouput file. outputFile = open(outputFilename, "w") # Keep an index of how many tags were mapped. numTags = 0 numLocations = 0 # Read the tags line by line. tagsFile = open(tagsFilename, "r") for tag in tagsFile: tag = tag.rstrip() tagLength = len(tag) # Count the number of matches. for chromIndex in range(0, len(genome)): chrom = genome[chromIndex] for position in range(0, len(chrom) - tagLength): sequence = chrom[position:position+tagLength] if (sequence == tag): outputFile.write("%s\t%d\t%d\n" % (tag, chromIndex + 1, position + 1)) numLocations += 1 numTags += 1 tagsFile.close() # Tell the user what happened. print "Read %d tags from %s." % (numTags, tagsFilename) print "Mapped to %d locations." % numLocations