Genomedata documentation

Michael M. Hoffman <mmh1 at washington dot edu>

wget http://noble.gs.washington.edu/proj/genomedata/install.py
python install.py
Genome
Chromosomes
Chromosome
Supercontigs
Supercontig
continuous
continuous
Supercontigs
genomedata-load [-t trackname=signalfile]... [-s sequencefile]... GENOMEDATADIR
from genomedata import Genome
[...]
genomedatadir = "/path/to/genomedata"
with Genome(genomedatadir) as genome:
    [...]

Chromosome.close()
>>> chromosome = genome["chr2"]
>>> seq = chromosome.seq[1423:1433]
>>> seq
array([116,  99,  99,  99,  99, 103, 103, 103, 103, 103], dtype=uint8)
>>> seq.tostring()
'tccccggggg'

>>> chromosome = genome["chr8"]
>>> chromosome[999:1001, 0:3]  # Note the half-open, zero-based indexing
array([[ NaN,  NaN,  NaN],
       [ 3. ,  5.5,  3.5]], dtype=float32)

>>> chromosome = genome["chr1"]
>>> data = chromosome[0:5, "sample_track"]
>>> data
array([ 47.,  NaN,  NaN,  NaN,  NaN], dtype=float32)

>>> from numpy import isfinite
>>> data[isfinite(data)]
array([ 47.], dtype=float32)

>>> col_index = chromosome.index_continuous("sample_track")
>>> data = chromosome[0:5, col_index:col_index+1]

.fa
.fa.gz
trackname=datafile
string
broad.h3k27me3
chrX.fa
chrY.fa.gz
signal.high.wig
signal.low.bed.gz
mygenomedata
genomedata-load -s chrX.fa -s chrY.fa.gz -t high=signal.high.wig -t low=signal.low.bed.gz mygenomedata
Usage: genomedata-load [OPTIONS] GENOMEDATADIR

--track and --sequence may be repeated to specify multiple trackname=trackfile
pairings and sequence files, respectively

Options:
  --version             show program's version number and exit
  -h, --help            show this help message and exit
  -s SEQFILE, --sequence=SEQFILE
                        Add the sequence data in the specified file
  -t TRACK, --track=TRACK
                        Add data for the given track. TRACK should be
                        specified in the form: NAME=FILE, such as: -t
                        signal=signal.dat
.fa
.fa.gz
Usage: genomedata-load-seq [OPTION]... GENOMEDATADIR SEQFILE...

Options:
  -g, --gap-length  XXX: Implement this.
  --version         show program's version number and exit
  -h, --help        show this help message and exit
Usage: genomedata-open-data [OPTION]... GENOMEDATADIR TRACKNAME...

Options:
  --version   show program's version number and exit
  -h, --help  show this help message and exit
Usage: genomedata-load-data [OPTION...] GENOMEDATADIR TRACKNAME
Loads data into genomedata format
Takes track data in on stdin

  -c, --chunk-size=NROWS     Chunk hdf5 data into blocks of NROWS. A higher
                             value increases compression but slows random
                             access. Must always be smaller than the max size
                             for a dataset. [default: 10000]
  -?, --help                 Give this help list
      --usage                Give a short usage message
  -V, --version              Print program version

Mandatory or optional arguments to long options are also mandatory or optional
for any corresponding short options.
Usage: genomedata-close-data [OPTION]... GENOMEDATADIR

Options:
  --version   show program's version number and exit
  -h, --help  show this help message and exit
genomedata.
Genome
with Genome("/path/to/genomedata") as genome:
  chromosome = genome["chr1"]
  [...]

>>> genome = Genome("/path/to/genomedata")
>>> chromosome = genome["chr1"]
[...]
>>> chromosome.close()

__init__
>>> genome = Genome("./genomedata.ctcf.pol2b/")
>>> genome
Genome("./genomedata.ctcf.pol2b/")

__iter__
for chromosome in genome:
  print chromosome.name
  for supercontig, continuous in chromosome.itercontinuous():
    [...]

__getitem__
Chromosome
>>> genome["chrX"]
Chromosome('/path/to/genomedata/chrX.genomedata')
>>> genome["chrZ"]
KeyError: 'Could not find chromosome: chrZ'

maxs
means
mins
num_datapoints
num_tracks_continuous
sums
sums_squares
tracknames_continuous
vars
genomedata.
Chromosome
>>> with Genome("/path/to/genomedata") as genome:
...     chromosome = genome["chrX"]
...     chromosome
...
Chromosome('/path/to/genomedata/chrX.genomedata')

__iter__
Chromosome.itercontinuous()
>>> for supercontig in chromosome:
...     supercontig  # calls repr()
...
<Supercontig('supercontig_0', 0:66115833)>
<Supercontig('supercontig_1', 66375833:90587544)>
<Supercontig('supercontig_2', 94987544:199501827)>

__getitem__
>>> chromosome = genome["chr4"]
>>> chromosome[0:5]  # Get all data for the first five bases of chr4
>>> chromosome[0, 0:2]  # Get data for first two tracks at chr4:0
>>> chromosome[100, "ctcf"]  # Get "ctcf" track value at chr4:100

attrs
close
Genome
end
index_continuous
>>> chromosome = genome["chr3"]
>>> col_index = chromosome.index_continuous("sample_track")
>>> data = chromosome[100:150, col_index]

>>> data = chromosome[100:150, "sample_track"]

itercontinuous
for supercontig, continuous in chromosome.itercontinuous():
    print supercontig, supercontig.start, supercontig.end
    [...]

maxs
Genome.maxs
mins
Genome.mins
name
num_datapoints
Genome.num_datapoints
num_tracks_continuous
seq
start
sums
Genome.sums
sums_squares
Genome.sums_squares
supercontigs
Supercontig
>>> chromosome.supercontigs[100]
[<Supercontig('supercontig_0', 0:66115833)>]
>>> chromosome.supercontigs[1:100000000]
[<Supercontig('supercontig_0', 0:66115833)>, <Supercontig('supercontig_1', 66375833:90587544)>, <Supercontig('supercontig_2', 94987544:199501827)>]
>>> chromosome.supercontigs[66115833:66375833]  # Between two supercontigs
[]

tracknames_continuous
genomedata.
Supercontig
attrs
continuous
end
name
project
seq
>>> chromosome = genome["chr1"]
>>> for supercontig in chromosome:
...     print repr(supercontig)
...
<Supercontig('supercontig_0', 0:121186957)>
<Supercontig('supercontig_1', 141476957:143422081)>
<Supercontig('supercontig_2', 143522081:247249719)>
>>> chromosome.seq[0:10].tostring()  # Inside supercontig
'taaccctaac'
>>> chromosome.seq[121186950:121186970].tostring() # Supercontig boundary
'agAATTCNNNNNNNNNNNNN'
>>> chromosome.seq[121186957:121186960].tostring() # Not in supercontig
/net/noble/vol2/home/stasis/arch/Linux/RHEL5/i686/lib/python2.5/genomedata-0.1.7.dev_r2548-py2.5.egg/genomedata/__init__.py:709: UserWarning: slice of chromosome sequence does not overlap any supercontig (filling with 'N')
warn("slice of chromosome sequence does not overlap any"
'NNN'

start

Author:	Michael M. Hoffman <mmh1 at washington dot edu>
Organization:	University of Washington
Address:	Department of Genome Sciences, PO Box 355065, Seattle, WA 98195-5065, United States of America
Copyright:	2009 Michael M. Hoffman

Parameter:	name (string) – name of the chromosome file (e.g. “chr1” if chr1.genomedata is a file in the genomedata directory)
Returns:	`Chromosome`

Parameter:	key (<base_key>[, <track_key>]) – key must index or slice bases, but can also index, slice, or directly specify (string or list of strings) the data tracks.
Returns:	numpy.array

Parameter:	trackname (string) – name of data track
Returns:	integer

Parameter:	pos (integer) – chromosome coordinate
Returns:	integer

Genomedata documentation¶

Installation¶

Overview¶

The workflow¶

Genomedata usage¶

Python interface¶

Basic usage¶

Command-line interface¶

genomedata-load¶

genomedata-load-seq¶

genomedata-open-data¶

genomedata-load-data¶

genomedata-close-data¶

Python API¶

Support¶

Table Of Contents

Previous topic

This Page

Navigation

Genomedata documentation¶

Installation¶

Overview¶

The workflow¶

Genomedata usage¶

Python interface¶

Basic usage¶

Command-line interface¶

genomedata-load¶

genomedata-load-seq¶

genomedata-open-data¶

genomedata-load-data¶

genomedata-close-data¶

Python API¶

Support¶

Table Of Contents

Previous topic

This Page

Quick search

Navigation