Scripts for common tasks¶
This is a cookbook of reminders, suggestions, and points of departure
for customization. You can copy and paste these into your own files,
or you can use the p4 recipes function, func.recipes(), to write new
files.
In these scripts I suggest variations, often by enclosing the variation
in an `if 1:` block to turn it on, or an `if 0:` block to turn it off.
Hopefully the meaning is clear and the (sometimes mutually exclusive)
possibilities are obvious.
Make a consensus tree, with variations¶
# Make a consensus tree, with variations.
mySkip = 500
tp = TreePartitions("mcmc_trees_0.nex", skip=mySkip)
# Read trees from a second MCMC run into the same TreePartitions object.
tp.read("mcmc_trees_1.nex", skip=mySkip)
t = tp.consensus()
# Re-root on the branch leading to the outgroup taxon.
t.reRoot(t.node('myOutTaxon').parent)
# Collapse poorly supported nodes (support below 70%).
toCollapse = [n for n in t.iterInternalsNoRoot() if n.br.support < 0.7]
for n in toCollapse:
    t.collapseNode(n)
# Transfer node.br.support to node.name as a percent, for display.
for n in t.iterInternalsNoRoot():
    n.name = "%.0f" % (100. * n.br.support)
t.draw()
t.writeNexus('cons01_70.nex')
Make a consensus tree, uncomplicated¶
# Make a consensus tree, uncomplicated.
tp = TreePartitions("mcmc_trees_0.nex", skip=500)
t = tp.consensus()
# Show the support as a percent in the node name.
for n in t.iterInternalsNoRoot():
    n.name = "%.0f" % (100. * n.br.support)
t.draw()
t.writeNexus('cons.nex')
Calculate a likelihood¶
# Calculate a likelihood.
# Load the alignment and wrap it in a Data object.
read("d.nex")
myData = Data()
# Load the tree and attach the data to it.
read("t.nex")
tree = var.trees[0]
tree.data = myData
# Model: free empirical composition, fixed rate matrix, 4-category
# gamma-distributed among-site rate variation, and a free pInvar.
tree.newComp(free=1, spec='empirical')
tree.newRMatrix(free=0, spec='ones')
tree.setNGammaCat(nGammaCat=4)
tree.newGdasrv(free=1, val=0.5)
tree.setPInvar(free=1, val=0.2)
# Optimize, then write out the optimized tree and show the model.
tree.optLogLike()
tree.writeNexus('optTree.nex')
tree.model.dump()
Calculate likelihood with more than one data partition¶
# Calculate likelihood with more than one data partition.
read("d.nex")
align = var.alignments[0]
# Split the alignment using the 'p1' charpartition defined in the nexus file.
align.setCharPartition('p1')
myData = Data()
#myData.dump()
read("t.nex")
tree = var.trees[0]
tree.data = myData
# First partition: fixed empirical comp, free rate matrix, gdasrv, pInvar.
partIndex = 0
tree.newComp(partNum=partIndex, free=0, spec='empirical')
tree.newRMatrix(partNum=partIndex, free=1, spec='ones')
tree.setNGammaCat(partNum=partIndex, nGammaCat=4)
tree.newGdasrv(partNum=partIndex, free=1, val=0.5)
tree.setPInvar(partNum=partIndex, free=1, val=0.1)
tree.setRelRate(partNum=partIndex, val=5.0)
# Second partition: as above, but with no among-site rate variation.
partIndex = 1
tree.newComp(partNum=partIndex, free=0, spec='empirical')
tree.newRMatrix(partNum=partIndex, free=1, spec='ones')
tree.setNGammaCat(partNum=partIndex, nGammaCat=1)
#tree.newGdasrv(partNum=partIndex, free=0, val=0.5)
tree.setPInvar(partNum=partIndex, free=1, val=0.1)
tree.setRelRate(partNum=partIndex, val=2.0)
# Let the relative rates of the partitions be optimized as well.
tree.model.relRatesAreFree = True
tree.optLogLike()
tree.model.dump()
tree.tPickle('opt')
Do an MCMC¶
# Do an MCMC.
# The defaults for these values are much smaller.  These suggested
# changes may not be needed all the time, but they seem to help for
# difficult data.
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15
# Assuming more than one run, we set the run number by calling this script as
# p4 sMcmc.py -- 0 # 0, 1, 2, ...
runNum = int(var.argvAfterDoubleDash[0])
# Read the alignment and start from a random tree.
read("d.nex")
myData = Data()
tree = func.randomTree(taxNames=myData.taxNames)
tree.data = myData
# Model: free empirical comp, free rate matrix, gamma rates, no pInvar.
tree.newComp(free=1, spec='empirical')
tree.newRMatrix(free=1, spec='ones')
tree.setNGammaCat(nGammaCat=4)
tree.newGdasrv(free=1, val=0.5)
tree.setPInvar(free=0, val=0.0)
# Take 2000 samples over a million generations, with 2 checkpoints.
nGen = 1000000
nSamples = 2000
sampleInterv = nGen // nSamples
nCheckPoints = 2
checkPointInterv = nGen // nCheckPoints  # or None
m = Mcmc(tree, nChains=4, runNum=runNum, sampleInterval=sampleInterv, checkPointInterval=checkPointInterv)
m.run(nGen)
# func.summarizeMcmcPrams(skip=500)
Do an MCMC with more than one data partition¶
# Do an MCMC with more than one data partition.
# The defaults for these values are much smaller.  These suggested
# changes may not be needed all the time, but they seem to help for
# difficult data.
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15
# Assuming more than one run, we set the run number by calling this script as
# p4 sMcmcMultiPart.py -- 0 # 0, 1, 2, ...
runNum = int(var.argvAfterDoubleDash[0])
# Read the alignment, partition it, and start from a random tree.
read("d.nex")
align = var.alignments[0]
align.setCharPartition('p1')
myData = Data()
tree = func.randomTree(taxNames=myData.taxNames)
tree.data = myData
# First partition: free empirical comp, free rate matrix, gdasrv, pInvar.
partIndex = 0
tree.newComp(partNum=partIndex, free=1, spec='empirical')
tree.newRMatrix(partNum=partIndex, free=1, spec='ones')
tree.setNGammaCat(partNum=partIndex, nGammaCat=4)
tree.newGdasrv(partNum=partIndex, free=1, val=0.5)
tree.setPInvar(partNum=partIndex, free=1, val=0.1)
tree.setRelRate(partNum=partIndex, val=5.0)
# Second partition: as above, but with no among-site rate variation.
partIndex = 1
tree.newComp(partNum=partIndex, free=1, spec='empirical')
tree.newRMatrix(partNum=partIndex, free=1, spec='ones')
tree.setNGammaCat(partNum=partIndex, nGammaCat=1)
#tree.newGdasrv(partNum=partIndex, free=0, val=0.5)
tree.setPInvar(partNum=partIndex, free=1, val=0.1)
tree.setRelRate(partNum=partIndex, val=2.0)
# Let the relative rates of the partitions be sampled as well.
tree.model.relRatesAreFree = True
# Take 2000 samples over a million generations, with 2 checkpoints.
nGen = 1000000
nSamples = 2000
sampleInterv = nGen // nSamples
nCheckPoints = 2
checkPointInterv = nGen // nCheckPoints  # or None
m = Mcmc(tree, nChains=4, runNum=runNum, sampleInterval=sampleInterv, checkPointInterval=checkPointInterv)
m.run(nGen)
# func.summarizeMcmcPrams(skip=200)
Read checkPoints from an MCMC¶
# Read checkPoints from an MCMC.
# Read them in ...
cpr = McmcCheckPointReader(theGlob='*')
# A table of acceptances
cpr.writeProposalAcceptances()
# Compare splits using average std deviation of split frequencies
#cpr.compareSplitsAll()
# or between only two checkpoints ...
#cpr.compareSplits(0, 1)
# How was swapping between heated chains?
#cpr.writeSwapVectors()
#cpr.writeProposalProbs()
# Get a single Mcmc object ..
if 0:
    m = cpr.mm[0]
Restart an MCMC¶
# Restart an MCMC from a checkpoint.
# Settings made to the var module in the original MCMC are not in
# the pickled file, so you will probably need to set them again ---
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15
# The Data object is also not pickled; re-make it from the alignment.
read("d.nex")
myData = Data()
# Assuming more than one run, we set the run number by calling this script as
# p4 sRestartMcmc.py -- 0 # eg 0, 1, 2, ...
runNum = int(var.argvAfterDoubleDash[0])
m = func.unPickleMcmc(runNum, myData)
# You probably don't want to do this, but
# it is possible to change the mcmc here.
# eg
# m.sampleInterval = 200
# Restart the run with a set number of generations, eg
# m.run(100000)
# Or set the number of gens in terms of the checkPointInterval
print("The checkpoint interval is currently %i" % m.checkPointInterval)
# For example, say to do two more checkpoints' worth of gens.
# m.run(m.checkPointInterval * 2)
Simulate data¶
# Simulate data
# Choose ONE way to get a Data object: read an existing alignment, or
# make a new empty one of the wanted size.
if 0:
    read("d.nex")
    d = Data()
if 1:
    nTax = 5
    taxNames = list(string.ascii_uppercase[:nTax])
    a = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=200)
    d = Data([a])
# Choose ONE way to get a tree: from a file, from a newick string, or random.
if 0:
    read("t.nex")
    t = var.trees[0]
    #t.taxNames = taxNames
if 0:
    read('(B:0.5, ((D:0.4, A:0.3), C:0.5), E:0.5);')
    t = var.trees[0]
    t.taxNames = taxNames
if 1:
    t = func.randomTree(taxNames=taxNames)
t.data = d
# Model with fixed ("specified") values, so the simulation is well defined.
t.newComp(free=0, spec='specified', val=[0.1, 0.2, 0.3])
t.newRMatrix(free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=0, val=0.5)
t.setPInvar(free=0, val=0.2)
t.simulate()
# Write the simulated data, as nexus or phylip.
if 1:
    d.writeNexus('d.nex', writeDataBlock=True)
if 0:
    d.alignments[0].writePhylip("d.phy")
Simulate data with more than one data partition¶
# Simulate data with more than one data partition.
# Make an empty dna alignment and partition it with an in-line charpartition.
nTax = 5
taxNames = list(string.ascii_uppercase[:nTax])
align = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=300)
read(r"#nexus begin sets; charpartition p1 = first:1-100, second:101-.; end;")
align.setCharPartition('p1')
myData = Data([align])
#myData.dump()
read("t.nex")
tree = var.trees[0]
#tree.taxNames = taxNames
tree.data = myData
# First partition: fixed, fully specified model, with gdasrv.
partIndex = 0
tree.newComp(partNum=partIndex, free=0, spec='specified', val=[0.2, 0.1, 0.3])
tree.newRMatrix(partNum=partIndex, free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
tree.setNGammaCat(partNum=partIndex, nGammaCat=4)
tree.newGdasrv(partNum=partIndex, free=0, val=1.2)
tree.setPInvar(partNum=partIndex, free=0, val=0.2)
tree.setRelRate(partNum=partIndex, val=1.5)
# Second partition: different comp and rates, no among-site rate variation.
partIndex = 1
tree.newComp(partNum=partIndex, free=0, spec='specified', val=[0.45, 0.05, 0.45])
tree.newRMatrix(partNum=partIndex, free=0, spec='specified', val=[1.2, 3.4, 0.4, 5.9, 0.6, 1.0])
tree.setNGammaCat(partNum=partIndex, nGammaCat=1)
#tree.newGdasrv(partNum=partIndex, free=0, val=0.5)
tree.setPInvar(partNum=partIndex, free=0, val=0.35)
tree.setRelRate(partNum=partIndex, val=0.3)
tree.simulate()
myData.writeNexus('d.nex')
Simulate very hetero data¶
# Simulate very hetero data
# Below you specify the dataType, length, and relRate of each data
# partition (and symbols, if it is standard dataType). There is one
# part per alignment, in this example. You also specify the number of
# taxa in the tree
import random
import math
def randomNumbersThatSumTo1(length, minimum):
    """Return `length` random floats that sum to 1.0, each >= `minimum`.

    The raw uniform draws are rescaled into the range that remains after
    reserving `minimum` for each slot, so the total is exactly 1.0
    (within floating-point tolerance).
    """
    maximum = 1.0 - ((length - 1) * minimum)
    myRange = maximum - minimum
    rnn = [random.random() for i in range(int(length))]
    mySum = sum(rnn)
    rnn = [minimum + (rn / (mySum / myRange)) for rn in rnn]
    # Sanity checks on the invariants the caller relies on.
    assert math.fabs(sum(rnn) - 1.0) < 1.0e-14
    assert min(rnn) >= minimum
    return rnn
class MySPart(object):
    """Specification for one simulated data partition.

    Holds the dataType ('dna', 'protein', or 'standard'), the sequence
    length, and the relative rate of the partition.  For 'standard' data
    the character symbols must also be given.
    """

    def __init__(self, dataType, sLen, relRate, symbols=None):
        self.dataType = dataType
        self.sLen = sLen
        self.relRate = relRate
        self.symbols = None
        if dataType == 'standard':
            # Standard data has no built-in alphabet, so symbols are required.
            assert symbols
            self.symbols = symbols
# Define the parts, then build one empty alignment per part.
myParts = []
myParts.append(MySPart('dna', 231, 4.6))
myParts.append(MySPart('protein', 411, 0.3))
myParts.append(MySPart('standard', 197, 2.7, symbols='123456'))
nTax = 17
taxNames = ['t%i' % i for i in range(nTax)]
aa = []
for mP in myParts:
    if mP.dataType != 'standard':
        a = func.newEmptyAlignment(dataType=mP.dataType, taxNames=taxNames, length=mP.sLen)
    else:
        a = func.newEmptyAlignment(dataType=mP.dataType, symbols=mP.symbols, taxNames=taxNames, length=mP.sLen)
    aa.append(a)
d = Data(aa)
t = func.randomTree(taxNames=taxNames)
t.data = d
# For each part, put a separate random composition on every node, and a
# separate random rate matrix on every branch (except protein, which
# gets a single fixed wag matrix).
for aNum in range(len(aa)):
    a = aa[aNum]
    mP = myParts[aNum]
    for myNode in t.iterNodes():
        newVal = randomNumbersThatSumTo1(a.dim, var.PIVEC_MIN)
        mt = t.newComp(partNum=aNum, free=1, spec='specified', val=newVal)
        t.setModelComponentOnNode(mt, myNode, clade=False)
    if a.dataType != 'protein':
        for myNode in t.iterNodesNoRoot():
            # Count of off-diagonal rate parameters.  Integer division:
            # the old "/ 2" produced a float under Python 3.
            nRates = ((a.dim * a.dim) - a.dim) // 2
            newVal = randomNumbersThatSumTo1(nRates, var.RATE_MIN)
            mt = t.newRMatrix(partNum=aNum, free=1, spec='specified', val=newVal)
            t.setModelComponentOnNode(mt, myNode, clade=False)
    else:
        t.newRMatrix(partNum=aNum, free=0, spec='wag')
    t.setNGammaCat(partNum=aNum, nGammaCat=4)
    t.newGdasrv(partNum=aNum, free=1, val=0.5)
    t.setPInvar(partNum=aNum, free=0, val=0.0)
    t.setRelRate(partNum=aNum, val=mP.relRate)
t.model.relRatesAreFree = True
t.simulate()
t.name = 'simTree'
d.writeNexus('d.nex', writeDataBlock=True)
t.tPickle()