Scripts for common tasks

This is a cookbook of reminders, suggestions, and points of departure for customization. You can copy and paste these into your own files. Or you can use the p4 recipes function, func.recipes(), to write new files.

In these scripts I suggest variations, often by enclosing the variation in an if 1: block to turn it on, or an if 0: block to turn it off. Hopefully the meaning is clear and the (sometimes mutually exclusive) possibilities are obvious.

Make a consensus tree, with variations

# Make a consensus tree, with variations.
burnIn = 500
tp = TreePartitions("mcmc_trees_0.nex", skip=burnIn)

# Add the trees from a second run, with the same burn-in.
tp.read("mcmc_trees_1.nex", skip=burnIn)

t = tp.consensus()

# Re-root on the branch leading to the chosen outgroup taxon.
t.reRoot(t.node('myOutTaxon').parent)

# Collapse internal nodes whose support is below 70%.
weakNodes = [nd for nd in t.iterInternalsNoRoot() if nd.br.support < 0.7]
for nd in weakNodes:
    t.collapseNode(nd)

# Show support in the drawing by writing it into node.name, as a percent.
for nd in t.iterInternalsNoRoot():
    nd.name = "%.0f" % (100. * nd.br.support)

t.draw()
t.writeNexus('cons01_70.nex')

Make a consensus tree, uncomplicated

# Make a consensus tree, uncomplicated.
tp = TreePartitions("mcmc_trees_0.nex", skip=500)
t = tp.consensus()
# Label each internal node with its support, as a percent.
for nd in t.iterInternalsNoRoot():
    nd.name = "%.0f" % (100. * nd.br.support)
t.draw()
t.writeNexus('cons.nex')

Calculate a likelihood

# Calculate a likelihood.

# The data and the tree both come from files.
read("d.nex")
d = Data()

read("t.nex")
t = var.trees[0]
t.data = d

# The model: free empirical composition, rMatrix fixed to ones,
# gamma-distributed among-site rate variation, and pInvar.
t.newComp(free=1, spec='empirical')
t.newRMatrix(free=0, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=1, val=0.2)

# Optimize, then save the optimized tree and show the fitted model.
t.optLogLike()
t.writeNexus('optTree.nex')
t.model.dump()

Calculate likelihood with more than one data partition

# Calculate likelihood with more than one data partition.

# The alignment needs a charpartition named 'p1'; applying it splits
# the data into model partitions before the Data object is made.
read("d.nex")
align = var.alignments[0]
align.setCharPartition('p1')
d = Data()
# d.dump()    # uncomment to inspect the partitioned data

read("t.nex")
t = var.trees[0]
t.data = d

# First partition: fixed empirical comp, free rMatrix, gamma rates,
# pInvar, and a relative rate.
pN = 0
t.newComp(partNum=pN, free=0, spec='empirical')
t.newRMatrix(partNum=pN, free=1, spec='ones')
t.setNGammaCat(partNum=pN, nGammaCat=4)
t.newGdasrv(partNum=pN, free=1, val=0.5)
t.setPInvar(partNum=pN, free=1, val=0.1)
t.setRelRate(partNum=pN, val=5.0)

# Second partition: no gamma (nGammaCat=1, so no gdasrv).
pN = 1
t.newComp(partNum=pN, free=0, spec='empirical')
t.newRMatrix(partNum=pN, free=1, spec='ones')
t.setNGammaCat(partNum=pN, nGammaCat=1)
#t.newGdasrv(partNum=pN, free=0, val=0.5)
t.setPInvar(partNum=pN, free=1, val=0.1)
t.setRelRate(partNum=pN, val=2.0)

# Let the relative rates of the partitions be optimized too.
t.model.relRatesAreFree = True

t.optLogLike()
t.model.dump()
t.tPickle('opt')


Do an MCMC

# Do an MCMC.
read("d.nex")
d = Data()

# Start from a random tree over the same taxa as the data.
t = func.randomTree(taxNames=d.taxNames)
t.data = d

# The model, with free parameters for the MCMC to sample.
t.newComp(free=1, spec='empirical')
t.newRMatrix(free=1, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=1, val=0.2)

# Four chains, sampling every 1000 generations, with checkpoints
# every 250000 generations.
m = Mcmc(t, nChains=4, runNum=0, sampleInterval=1000, checkPointInterval=250000)
m.run(1000000)

# Summarize the sampled model parameters, after a burn-in of 500 samples.
func.summarizeMcmcPrams(skip=500)


Do an MCMC with more than one data partition

# Do an MCMC with more than one data partition.

# Apply the charpartition 'p1' before making the Data object.
read("d.nex")
align = var.alignments[0]
align.setCharPartition('p1')
d = Data()
t = func.randomTree(taxNames=d.taxNames)
t.data = d

# First partition: gamma rates and a faster relative rate.
pN = 0
t.newComp(partNum=pN, free=1, spec='empirical')
t.newRMatrix(partNum=pN, free=1, spec='ones')
t.setNGammaCat(partNum=pN, nGammaCat=4)
t.newGdasrv(partNum=pN, free=1, val=0.5)
t.setPInvar(partNum=pN, free=1, val=0.1)
t.setRelRate(partNum=pN, val=5.0)

# Second partition: no gamma (nGammaCat=1, so no gdasrv).
pN = 1
t.newComp(partNum=pN, free=1, spec='empirical')
t.newRMatrix(partNum=pN, free=1, spec='ones')
t.setNGammaCat(partNum=pN, nGammaCat=1)
#t.newGdasrv(partNum=pN, free=0, val=0.5)
t.setPInvar(partNum=pN, free=1, val=0.1)
t.setRelRate(partNum=pN, val=2.0)

# Sample the relative rates of the partitions as well.
t.model.relRatesAreFree = True

m = Mcmc(t, nChains=4, runNum=0, sampleInterval=10, checkPointInterval=2000)
m.run(4000)
func.summarizeMcmcPrams(skip=200)


Read checkPoints from an MCMC

# Read checkPoints from an MCMC.

# Read in all checkpoint files matching the glob ...
cpr = McmcCheckPointReader(theGlob='*')

# A table of acceptances
cpr.writeProposalAcceptances()

# Compare splits using average std deviation of split frequencies
#cpr.compareSplitsAll()
# or between only two checkpoints ...
#cpr.compareSplits(0, 1)

# How was swapping between heated chains?
#cpr.writeSwapMatrices()

#cpr.writeProposalProbs()

# Get a single Mcmc object ..
if 0:
    m = cpr.mm[0]
    # print() works in both Python 2 and 3; the bare Python-2
    # 'print m.tunings' statement is a syntax error in Python 3.
    print(m.tunings)


Restart an MCMC

# Restart an MCMC.

# func.unPickleMcmc() is handed the Data, so read that in first.
read("d.nex")
d = Data()
m = func.unPickleMcmc(0, d)

# Optionally adjust tunings before continuing the run.
if 0:
    m.tunings.chainTemp = 0.15
    m.tunings.relRate = 1.2
    #m.tunings.parts[0].rMatrix = 1000.0
    #m.tunings.parts[0].comp = 50.
    #m.prob.comp = 0

m.run(4000)


Simulate data

# Simulate data

# Either read the alignment from a file ...
if 0:
    read("d.nex")
    d = Data()
# ... or make a new empty alignment to be filled by the simulation.
if 1:
    nTax = 5
    # string.ascii_uppercase works in both Python 2 and 3; the old
    # string.uppercase was removed in Python 3.
    taxNames = list(string.ascii_uppercase[:nTax])
    a = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=200)
    d = Data([a])

# Get a tree: from a file, from a literal string, or at random.
if 0:
    read("t.nex")
    t = var.trees[0]
    #t.taxNames = taxNames
if 0:
    read('(B:0.5, ((D:0.4, A:0.3), C:0.5), E:0.5);')
    t = var.trees[0]
    t.taxNames = taxNames
if 1:
    t = func.randomTree(taxNames=taxNames)

t.data = d
# A fully specified (fixed) model for the simulation.
t.newComp(free=0, spec='specified', val=[0.1, 0.2, 0.3])
t.newRMatrix(free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=0, val=0.5)
t.setPInvar(free=0, val=0.2)

# Seed the C-level randomizer (by pid) so repeated runs differ.
func.reseedCRandomizer(os.getpid())
t.simulate()

# Write the simulated data, as nexus or phylip.
if 1:
    d.writeNexus('d.nex', writeDataBlock=True)
if 0:
    d.alignments[0].writePhylip("d.phy")

Simulate data with more than one data partition

# Simulate data with more than one data partition.
nTax = 5
# string.ascii_uppercase works in both Python 2 and 3; the old
# string.uppercase was removed in Python 3.
taxNames = list(string.ascii_uppercase[:nTax])

# Make an empty alignment and give it a two-part charpartition,
# defined here in an inline nexus sets block.
dnaAlign = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=300)
read(r"#nexus begin sets; charpartition p1 = first:1-100, second:101-.; end;")
dnaAlign.setCharPartition('p1')
d = Data([dnaAlign])
#d.dump()
read("t.nex")
t = var.trees[0]
#t.taxNames = taxNames

t.data = d

# First partition: fully specified model, gamma rates, relRate 1.5.
pNum = 0
t.newComp(partNum=pNum, free=0, spec='specified', val=[0.2, 0.1, 0.3])
t.newRMatrix(partNum=pNum, free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
t.setNGammaCat(partNum=pNum, nGammaCat=4)
t.newGdasrv(partNum=pNum, free=0, val=1.2)
t.setPInvar(partNum=pNum, free=0, val=0.2)
t.setRelRate(partNum=pNum, val=1.5)

# Second partition: no gamma (nGammaCat=1, so no gdasrv), relRate 0.3.
pNum = 1
t.newComp(partNum=pNum, free=0, spec='specified', val=[0.45, 0.05, 0.45])
t.newRMatrix(partNum=pNum, free=0, spec='specified', val=[1.2, 3.4, 0.4, 5.9, 0.6, 1.0])
t.setNGammaCat(partNum=pNum, nGammaCat=1)
#t.newGdasrv(partNum=pNum, free=0, val=0.5)
t.setPInvar(partNum=pNum, free=0, val=0.35)
t.setRelRate(partNum=pNum, val=0.3)

# Seed the C-level randomizer (by pid) so repeated runs differ.
func.reseedCRandomizer(os.getpid())
t.simulate()
d.writeNexus('d.nex')

Simulate very hetero data

# Simulate very hetero data

# Below you specify the dataType, length, and relRate of each data
# partition (and symbols, if it is standard dataType).  There is one
# part per alignment, in this example.  You also specify the number of
# taxa in the tree

import random
import math

def randomNumbersThatSumTo1(length, minimum):
    """Return *length* random floats, each at least *minimum*, summing to 1.0.

    Uniform variates are rescaled into the spread left over after
    reserving *minimum* for every slot, then shifted up by *minimum*,
    so the total comes out to 1.0.
    """
    # The largest any single value may be, given the reserved minimums.
    top = 1.0 - ((length - 1) * minimum)
    spread = top - minimum
    raws = [random.random() for _ in range(length)]
    total = sum(raws)
    scaled = [minimum + (r / (total / spread)) for r in raws]
    assert math.fabs(sum(scaled) - 1.0) < 1.0e-14
    assert min(scaled) >= minimum
    return scaled

class MySPart(object):
    """Describe one simulated data partition.

    Holds the dataType, sequence length, and relative rate for one
    part; 'standard' dataType additionally requires its symbols.
    """

    def __init__(self, dataType, sLen, relRate, symbols=None):
        self.dataType = dataType
        self.sLen = sLen
        self.relRate = relRate
        if dataType == 'standard':
            # Standard data has no built-in alphabet, so symbols are required.
            assert symbols
            self.symbols = symbols
        else:
            self.symbols = None

# Describe the partitions to simulate: dataType, length, relRate
# (and symbols for the 'standard' dataType).  One part per alignment.
myParts = []
myParts.append(MySPart('dna', 231, 4.6))
myParts.append(MySPart('protein', 411, 0.3))
myParts.append(MySPart('standard', 197, 2.7, symbols='123456'))

nTax = 17
taxNames = ['t%i' % i for i in range(nTax)]

# Make one empty alignment per partition, then wrap them in a Data.
aa = []
for mP in myParts:
    if mP.dataType != 'standard':
        a = func.newEmptyAlignment(dataType=mP.dataType, taxNames=taxNames, length=mP.sLen)
    else:
        a = func.newEmptyAlignment(dataType=mP.dataType, symbols=mP.symbols, taxNames=taxNames, length=mP.sLen)
    aa.append(a)
d = Data(aa)
t = func.randomTree(taxNames=taxNames)
t.data = d

for aNum in range(len(aa)):
    a = aa[aNum]
    mP = myParts[aNum]
    # A separate random composition on every node, placed with
    # clade=False so each node gets its own comp.
    for myNode in t.iterNodes():
        newVal = randomNumbersThatSumTo1(a.dim, var.PIVEC_MIN)
        mt = t.newComp(partNum=aNum, free=1, spec='specified', val=newVal)
        t.setModelThing(mt, myNode, clade=False)

    if a.dataType != 'protein':
        # A separate random rMatrix on every branch.  Use floor
        # division here: under Python 3, plain '/' would make nRates a
        # float, which breaks range() in randomNumbersThatSumTo1().
        for myNode in t.iterNodesNoRoot():
            nRates = ((a.dim * a.dim) - a.dim) // 2
            newVal = randomNumbersThatSumTo1(nRates, var.RATE_MIN)
            mt = t.newRMatrix(partNum=aNum, free=1, spec='specified', val=newVal)
            t.setModelThing(mt, myNode, clade=False)
    else:
        # Protein data gets a single fixed empirical rate matrix.
        t.newRMatrix(partNum=aNum, free=0, spec='wag')
    t.setNGammaCat(partNum=aNum, nGammaCat=4)
    t.newGdasrv(partNum=aNum, free=1, val=0.5)
    t.setPInvar(partNum=aNum, free=0, val=0.0)
    t.setRelRate(partNum=aNum, val=mP.relRate)

t.model.relRatesAreFree = True
# Seed the C-level randomizer (by pid) so repeated runs differ.
func.reseedCRandomizer(os.getpid())
t.simulate()
t.name = 'simTree'

d.writeNexus('d.nex', writeDataBlock=True)
t.tPickle()