import sys # Parse the command line. if (len(sys.argv) != 4): print "USAGE: map-tags2.py " sys.exit(1) genomeFilename = sys.argv[1] tagsFilename = sys.argv[2] outputFilename = sys.argv[3] ############################################################################# def readSequenceList(genomeFilename): returnValue = [] numBases = 0 # Open the file for reading. genomeFile = open(genomeFilename, "r") # Each line is one chromosome. for chromosome in genomeFile: chromosome = chromosome.rstrip() numBases += len(chromosome) returnValue.append(chromosome) # Tell the user what happened. print "Read %d bases in %d sequences from %s." % (numBases, len(returnValue), genomeFilename) return(returnValue) ############################################################################# # MAIN PROCEDURE ############################################################################# # Read the tags into memory. tags = readSequenceList(tagsFilename) # Get the tag length (assume they are all the same). tagLength = len(tags[0]) # Open the ouput file. outputFile = open(outputFilename, "w") # Keep an index of how many tags were mapped. numLocations = 0 # Read the genome line by line. genomeFile = open(genomeFilename, "r") chromIndex = 0 for chrom in genomeFile: chrom = chrom.rstrip() chromLength = len(chrom) # Consider each possible location for the tag to map to. for position in range(0, len(chrom) - tagLength): sequence = chrom[position:position+tagLength] # Consider each tag, searching for a match. for tag in tags: if (sequence == tag): outputFile.write("%s\t%d\t%d\n" % (tag, chromIndex + 1, position + 1)) numLocations += 1 chromIndex += 1 genomeFile.close() # Tell the user what happened. print "Read %d chromosomes from %s." % (chromIndex, genomeFilename) print "Mapped to %d locations." % numLocations