Scripts for common tasks

This is a cookbook of reminders, suggestions, and points of departure for customization. You can copy and paste these into your own files, or you can use the p4 recipes function, func.recipes(), to write new files.

In these scripts I suggest variations, often by enclosing each variation in an if 1: block to turn it on, or an if 0: block to turn it off. The meaning should be clear, and the (sometimes mutually exclusive) possibilities obvious.

Make a consensus tree, with variations

# Make a consensus tree, with variations.
# Reads MCMC tree samples, builds a consensus, re-roots it, collapses
# weakly supported nodes, and labels internal nodes with percent support.
mySkip = 500  # samples to skip at the start of each file (burn-in, presumably)
tp = TreePartitions("mcmc_trees_0.nex", skip=mySkip)

# read another file (eg a second independent run), pooling its samples
tp.read("mcmc_trees_1.nex", skip=mySkip)

t = tp.consensus()

# Re-root
# Rooting on the parent of the chosen outgroup taxon.
t.reRoot(t.node('myOutTaxon').parent)

# Collapse poorly supported nodes
# Collect the nodes first, then collapse, so the tree is not modified
# while it is being iterated over.
toCollapse = [n for n in t.iterInternalsNoRoot() if n.br.support < 0.7]
for n in toCollapse:
    t.collapseNode(n)

# Transfer node.br.support to node.name as percent
for n in t.iterInternalsNoRoot():
    n.name = "%.0f" % (100. * n.br.support)

t.draw()
t.writeNexus('cons01_70.nex')

Make a consensus tree, uncomplicated

# Make a consensus tree, the short version.
# Build tree partitions from post-burn-in samples, take the consensus,
# label internal nodes with percent support, then draw and save it.
parts = TreePartitions("mcmc_trees_0.nex", skip=500)
consTree = parts.consensus()
for nd in consTree.iterInternalsNoRoot():
    nd.name = "%.0f" % (nd.br.support * 100.)
consTree.draw()
consTree.writeNexus('cons.nex')

Calculate a likelihood

# Calculate a likelihood.
# Read the data and the tree, attach a model, optimize, and report.
read("d.nex")
dat = Data()

read("t.nex")
tree = var.trees[0]

tree.data = dat
# One data partition: empirical composition (free), a fixed rMatrix,
# gamma-distributed rate variation with 4 categories, and a free
# proportion of invariant sites.
tree.newComp(free=1, spec='empirical')
tree.newRMatrix(free=0, spec='ones')
tree.setNGammaCat(nGammaCat=4)
tree.newGdasrv(free=1, val=0.5)
tree.setPInvar(free=1, val=0.2)

tree.optLogLike()
tree.writeNexus('optTree.nex')
tree.model.dump()

Calculate likelihood with more than one data partition

# Calculate likelihood with more than one data partition.
read("d.nex")
a = var.alignments[0]
# 'p1' is presumably a charpartition defined in d.nex --- confirm there.
a.setCharPartition('p1')
d = Data()
#d.dump()

read("t.nex")
t = var.trees[0]
t.data = d

# Model for the first data partition: fixed empirical comp, a free
# rMatrix, 4-category gamma, free pInvar, and a starting relRate.
pNum = 0
t.newComp(partNum=pNum, free=0, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=4)
t.newGdasrv(partNum=pNum, free=1, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=5.0)

# Model for the second partition; here with no gamma rate variation
# (nGammaCat=1, and no gdasrv).
pNum = 1
t.newComp(partNum=pNum, free=0, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=1)
#t.newGdasrv(partNum=pNum, free=0, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=2.0)

# Let the relative rates of the partitions be optimized as well.
t.model.relRatesAreFree = True

t.optLogLike()
t.model.dump()
t.tPickle('opt')


Do an MCMC

# Do an MCMC.

# The defaults for these values are much smaller.  These suggested
# changes may not be needed all the time, but seem to help for
# difficult data.
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15

# Assuming more than one run, we set the run number by calling this script as
# p4 sMcmc.py -- 0     # 0, 1, 2, ...
rNum = int(var.argvAfterDoubleDash[0])

# The data, a random starting tree, and the model.
read("d.nex")
d = Data()
t = func.randomTree(taxNames=d.taxNames)
t.data = d
t.newComp(free=1, spec='empirical')
t.newRMatrix(free=1, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=0, val=0.0)

# Take 2000 samples over the whole run, and write two checkpoints.
nGen = 1000000
nSamples = 2000
sInterv = int(nGen / nSamples)
nCheckPoints = 2
cpInterv = int(nGen / nCheckPoints)  # or None

m = Mcmc(t, nChains=4, runNum=rNum, sampleInterval=sInterv, checkPointInterval=cpInterv)
m.run(nGen)
# Afterwards, parameter samples can be summarized with eg
# func.summarizeMcmcPrams(skip=500)


Do an MCMC with more than one data partition

# Do an MCMC with more than one data partition.

# The defaults for these values are much smaller.  These suggested
# changes may not be needed all the time, but seem to help for
# difficult data.
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15

# Assuming more than one run, we set the run number by calling this script as
# p4 sMcmcMultiPart.py -- 0     # 0, 1, 2, ...
rNum = int(var.argvAfterDoubleDash[0])

# The data, partitioned by the 'p1' charpartition (presumably defined
# in d.nex --- confirm there), and a random starting tree.
read("d.nex")
a = var.alignments[0]
a.setCharPartition('p1')
d = Data()
t = func.randomTree(taxNames=d.taxNames)
t.data = d

# Model for the first partition, with 4-category gamma rate variation.
pNum = 0
t.newComp(partNum=pNum, free=1, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=4)
t.newGdasrv(partNum=pNum, free=1, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=5.0)

# Model for the second partition; here with no gamma rate variation
# (nGammaCat=1, and no gdasrv).
pNum = 1
t.newComp(partNum=pNum, free=1, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=1)
#t.newGdasrv(partNum=pNum, free=0, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=2.0)

# Let the relative rates of the partitions be sampled as well.
t.model.relRatesAreFree = True

# Take 2000 samples over the whole run, and write two checkpoints.
nGen = 1000000
nSamples = 2000
sInterv = int(nGen / nSamples)
nCheckPoints = 2
cpInterv = int(nGen / nCheckPoints)  # or None

m = Mcmc(t, nChains=4, runNum=rNum, sampleInterval=sInterv, checkPointInterval=cpInterv)
m.run(nGen)
# Afterwards, parameter samples can be summarized with eg
# func.summarizeMcmcPrams(skip=200)


Read checkPoints from an MCMC

# Read checkPoints from an MCMC.

# Slurp in all the checkpoint files matching the glob.
reader = McmcCheckPointReader(theGlob='*')

# Show a table of proposal acceptances.
reader.writeProposalAcceptances()

# Optionally compare splits, using the average std deviation of split
# frequencies, over all checkpoints ...
#reader.compareSplitsAll()
# ... or between just two of them.
#reader.compareSplits(0, 1)

# Optionally, see how swapping between heated chains went.
#reader.writeSwapVectors()

# Optionally, show the proposal probabilities.
#reader.writeProposalProbs()

# A single Mcmc object can be pulled out if needed.
if 0:
    myMcmc = reader.mm[0]


Restart an MCMC

# Restart an MCMC from a checkpoint.

# Settings made to the var module are not stored in the pickled
# checkpoint, so any non-default values need to be set again here.
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15

read("d.nex")
dat = Data()

# With more than one run, pass the run number on the command line, as
# p4 sRestartMcmc.py -- 0     # eg 0, 1, 2, ...
runNumber = int(var.argvAfterDoubleDash[0])

myMcmc = func.unPickleMcmc(runNumber, dat)

# The mcmc can be adjusted here before restarting, although you
# probably do not want to do that.
# eg
# myMcmc.sampleInterval = 200

# Restart the run for a fixed number of generations, eg
# myMcmc.run(100000)

# Or express the number of gens in terms of the checkPointInterval
print("The checkpoint interval is currently %i" % myMcmc.checkPointInterval)

# For example, to do two more checkpoints' worth of gens:
# myMcmc.run(myMcmc.checkPointInterval * 2)


Simulate data

# Simulate data
# Two ways to get data to simulate into: read an existing alignment,
# or make a new empty one.
if 0:
    read("d.nex")
    d = Data()
if 1:
    nTax = 5
    taxNames = list(string.ascii_uppercase[:nTax])  # ['A', 'B', 'C', 'D', 'E']
    a = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=200)
    d = Data([a])

# Three ways to get a tree: from a file, from a literal newick string
# (with taxNames set by hand), or a random tree.
if 0:
    read("t.nex")
    t = var.trees[0]
    #t.taxNames = taxNames
if 0:
    read('(B:0.5, ((D:0.4, A:0.3), C:0.5), E:0.5);')
    t = var.trees[0]
    t.taxNames = taxNames
if 1:
    t = func.randomTree(taxNames=taxNames)

# Fully specify the model (free=0 throughout).  Note only 3 comp
# values are given for dna; presumably the last is implied --- check
# the p4 docs for newComp.
t.data = d
t.newComp(free=0, spec='specified', val=[0.1, 0.2, 0.3])
t.newRMatrix(free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=0, val=0.5)
t.setPInvar(free=0, val=0.2)

t.simulate()

# Write the result, in nexus or phylip format.
if 1:
    d.writeNexus('d.nex', writeDataBlock=True)
if 0:
    d.alignments[0].writePhylip("d.phy")

Simulate data with more than one data partition

# Simulate data with more than one data partition.
nTax = 5
taxNames = list(string.ascii_uppercase[:nTax])

# Make an empty dna alignment, and partition it with a charpartition
# given here as a literal nexus sets block.
dnaAlign = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=300)
read(r"#nexus begin sets; charpartition p1 = first:1-100, second:101-.; end;")
dnaAlign.setCharPartition('p1')
d = Data([dnaAlign])
#d.dump()
read("t.nex")
t = var.trees[0]
#t.taxNames = taxNames

t.data = d

# Model for the first partition, fully specified (free=0 throughout),
# with 4-category gamma rate variation.
pNum = 0
t.newComp(partNum=pNum, free=0, spec='specified', val=[0.2, 0.1, 0.3])
t.newRMatrix(partNum=pNum, free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
t.setNGammaCat(partNum=pNum, nGammaCat=4)
t.newGdasrv(partNum=pNum, free=0, val=1.2)
t.setPInvar(partNum=pNum, free=0, val=0.2)
t.setRelRate(partNum=pNum, val=1.5)

# Model for the second partition; here with no gamma rate variation
# (nGammaCat=1, and no gdasrv).
pNum = 1
t.newComp(partNum=pNum, free=0, spec='specified', val=[0.45, 0.05, 0.45])
t.newRMatrix(partNum=pNum, free=0, spec='specified', val=[1.2, 3.4, 0.4, 5.9, 0.6, 1.0])
t.setNGammaCat(partNum=pNum, nGammaCat=1)
#t.newGdasrv(partNum=pNum, free=0, val=0.5)
t.setPInvar(partNum=pNum, free=0, val=0.35)
t.setRelRate(partNum=pNum, val=0.3)

t.simulate()
d.writeNexus('d.nex')

Simulate very hetero data

# Simulate very hetero data

# Below you specify the dataType, length, and relRate of each data
# partition (and symbols, if it is the standard dataType).  There is
# one part per alignment in this example.  You also specify the
# number of taxa in the tree.

import random
import math

def randomNumbersThatSumTo1(length, minimum):
    """Return a list of 'length' random floats that sum to 1.0.

    Each returned value is at least 'minimum'.  'length' may be an int
    or an integral float (callers sometimes compute it by division);
    it is coerced to int once, up front.  'minimum' must satisfy
    length * minimum <= 1.0, or no such list exists.
    """
    length = int(length)
    maximum = 1.0 - ((length - 1) * minimum)
    myRange = maximum - minimum
    rnn = [random.random() for i in range(length)]
    mySum = sum(rnn)
    # Scale the raw randoms so they sum to myRange, then add 'minimum'
    # to each; the total becomes myRange + length * minimum == 1.0.
    rnn = [minimum + (rn * myRange / mySum) for rn in rnn]
    # Sanity checks.  math.isclose with an explicit abs_tol is less
    # fragile than the old hard-coded 1.e-14 bound on the error.
    assert math.isclose(sum(rnn), 1.0, rel_tol=0.0, abs_tol=1.e-12)
    assert min(rnn) >= minimum
    return rnn

class MySPart(object):
    """Specification for one simulated data partition.

    dataType -- eg 'dna', 'protein', or 'standard'
    sLen     -- the sequence length (number of sites) of the part
    relRate  -- the relative rate of this part
    symbols  -- the character symbols; required only for the
                'standard' dataType, and ignored otherwise
    """

    def __init__(self, dataType, sLen, relRate, symbols=None):
        self.dataType = dataType
        self.sLen = sLen
        self.relRate = relRate
        self.symbols = None
        if dataType == 'standard':
            # 'standard' data has no implied alphabet, so the caller
            # must supply the symbols.  A bare assert would be stripped
            # under python -O, so validate explicitly.
            if not symbols:
                raise ValueError("a 'standard' dataType needs symbols")
            self.symbols = symbols

# The spec for each partition: dataType, length, relRate (and symbols
# for the standard part).
myParts = []
myParts.append(MySPart('dna', 231, 4.6))
myParts.append(MySPart('protein', 411, 0.3))
myParts.append(MySPart('standard', 197, 2.7, symbols='123456'))

nTax = 17
taxNames = ['t%i' % i for i in range(nTax)]

# Make one empty alignment per partition, then wrap them all in a
# single Data object, attached to a random tree.
aa = []
for mP in myParts:
    if mP.dataType != 'standard':
        a = func.newEmptyAlignment(dataType=mP.dataType, taxNames=taxNames, length=mP.sLen)
    else:
        a = func.newEmptyAlignment(dataType=mP.dataType, symbols=mP.symbols, taxNames=taxNames, length=mP.sLen)
    aa.append(a)
d = Data(aa)
t = func.randomTree(taxNames=taxNames)
t.data = d

# Make the data very hetero: every node gets its own random comp, and
# (except for the protein part) every branch gets its own random rMatrix.
for aNum in range(len(aa)):
    a = aa[aNum]
    mP = myParts[aNum]
    for myNode in t.iterNodes():
        newVal = randomNumbersThatSumTo1(a.dim, var.PIVEC_MIN)
        mt = t.newComp(partNum=aNum, free=1, spec='specified', val=newVal)
        t.setModelComponentOnNode(mt, myNode, clade=False)

    if a.dataType != 'protein':
        for myNode in t.iterNodesNoRoot():
            # Number of rMatrix params for a dim-state model.  Use
            # integer division; the old "/ 2" made nRates a float in
            # Python 3.
            nRates = ((a.dim * a.dim) - a.dim) // 2
            newVal = randomNumbersThatSumTo1(nRates, var.RATE_MIN)
            mt = t.newRMatrix(partNum=aNum, free=1, spec='specified', val=newVal)
            t.setModelComponentOnNode(mt, myNode, clade=False)
    else:
        # The protein part uses the empirical wag rMatrix instead.
        t.newRMatrix(partNum=aNum, free=0, spec='wag')
    t.setNGammaCat(partNum=aNum, nGammaCat=4)
    t.newGdasrv(partNum=aNum, free=1, val=0.5)
    t.setPInvar(partNum=aNum, free=0, val=0.0)
    t.setRelRate(partNum=aNum, val=mP.relRate)

t.model.relRatesAreFree = True
t.simulate()
t.name = 'simTree'

d.writeNexus('d.nex', writeDataBlock=True)
t.tPickle()