Scripts for common tasks
2024-12-12.
This is a cookbook of reminders, suggestions, and points of departure for customization.
You can copy and paste these into your own files, or you can use the p4 recipes function,
func.recipes(), to write new files.
In these scripts I suggest variations, often by enclosing a variation in an if 1: block
to turn it on, or an if 0: block to turn it off.  Hopefully the meaning is clear and the
(sometimes mutually exclusive) possibilities are obvious.
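For example, a script might offer two mutually exclusive ways of getting a tree, with only one of them switched on.  This is a minimal sketch of the convention; the file name t.nex and the taxNames variable are just placeholders.

if 1:    # turned on: read a tree from a file
    read("t.nex")
    t = var.trees[0]
if 0:    # turned off: make a random tree instead
    t = func.randomTree(taxNames=taxNames)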
Maximum likelihood
Calculate a likelihood
# Calculate a likelihood.
read("d.nex")
d = Data()
read("t.nex")
t = var.trees[0]
t.data = d
t.newComp(free=1, spec='empirical')
t.newRMatrix(free=0, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=1, val=0.2)
t.optLogLike()
t.writeNexus('optTree.nex')
t.model.dump()
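If t.nex contains more than one candidate tree, a variation is to loop over var.trees and optimize each one with the same model setup, then compare the log likelihoods.  This is a minimal sketch; it assumes that optLogLike() leaves the optimized value in t.logLike and that the trees have names.

read("d.nex")
d = Data()
read("t.nex")                 # assuming this file holds several trees
for t in var.trees:
    t.data = d
    t.newComp(free=1, spec='empirical')
    t.newRMatrix(free=0, spec='ones')
    t.setNGammaCat(nGammaCat=4)
    t.newGdasrv(free=1, val=0.5)
    t.setPInvar(free=1, val=0.2)
    t.optLogLike()
    print(t.name, t.logLike)  # assumption: the optimized value is stored as t.logLike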
Calculate likelihood with more than one data partition
# Calculate likelihood with more than one data partition.
read("d.nex")
a = var.alignments[0]
a.setCharPartition('p1')
d = Data()
#d.dump()
read("t.nex")
t = var.trees[0]
t.data = d

pNum = 0
t.newComp(partNum=pNum, free=0, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=4)
t.newGdasrv(partNum=pNum, free=1, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=5.0)

pNum = 1
t.newComp(partNum=pNum, free=0, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=1)
#t.newGdasrv(partNum=pNum, free=0, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=2.0)

t.model.relRatesAreFree = True
t.optLogLike()
t.model.dump()
t.tPickle('opt')
Simulation
Simulate data
# Simulate data
import string       # for string.ascii_uppercase below

if 0:
    read("d.nex")
    d = Data()
if 1:
    nTax = 5
    taxNames = list(string.ascii_uppercase[:nTax])
    a = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=200)
    d = Data([a])

if 0:
    read("t.nex")
    t = var.trees[0]
    #t.taxNames = taxNames
if 0:
    read('(B:0.5, ((D:0.4, A:0.3), C:0.5), E:0.5);')
    t = var.trees[0]
    t.taxNames = taxNames
if 1:
    t = func.randomTree(taxNames=taxNames)

t.data = d
t.newComp(free=0, spec='specified', val=[0.1, 0.2, 0.3])
t.newRMatrix(free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=0, val=0.5)
t.setPInvar(free=0, val=0.2)
t.simulate()

if 1:
    d.writeNexus('d.nex')
if 0:
    d.alignments[0].writePhylip("d.phy")
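To make several simulated replicates under the same tree and model (for example for a parametric bootstrap), the end of the script above can be replaced with a loop.  This is a minimal sketch, assuming that simulate() can be called repeatedly on the same tree; the file name pattern is arbitrary.

nReplicates = 10
for rep in range(nReplicates):
    t.simulate()                           # re-simulate the data in place (assumed repeatable)
    d.writeNexus('sim_%02i.nex' % rep)     # one output file per replicate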
Simulate data with more than one data partition
# Simulate data with more than one data partition.
import string       # for string.ascii_uppercase below

nTax = 5
taxNames = list(string.ascii_uppercase[:nTax])
dnaAlign = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=300)
read(r"#nexus begin sets; charpartition p1 = first:1-100, second:101-.; end;")
dnaAlign.setCharPartition('p1')
d = Data([dnaAlign])
#d.dump()
read("t.nex")
t = var.trees[0]
#t.taxNames = taxNames
t.data = d

pNum = 0
t.newComp(partNum=pNum, free=0, spec='specified', val=[0.2, 0.1, 0.3])
t.newRMatrix(partNum=pNum, free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
t.setNGammaCat(partNum=pNum, nGammaCat=4)
t.newGdasrv(partNum=pNum, free=0, val=1.2)
t.setPInvar(partNum=pNum, free=0, val=0.2)
t.setRelRate(partNum=pNum, val=1.5)

pNum = 1
t.newComp(partNum=pNum, free=0, spec='specified', val=[0.45, 0.05, 0.45])
t.newRMatrix(partNum=pNum, free=0, spec='specified', val=[1.2, 3.4, 0.4, 5.9, 0.6, 1.0])
t.setNGammaCat(partNum=pNum, nGammaCat=1)
#t.newGdasrv(partNum=pNum, free=0, val=0.5)
t.setPInvar(partNum=pNum, free=0, val=0.35)
t.setRelRate(partNum=pNum, val=0.3)

t.simulate()
d.writeNexus('d.nex')
Simulate very hetero data
# Simulate very hetero data

# Below you specify the dataType, length, and relRate of each data
# partition (and symbols, if it is standard dataType).  There is one
# part per alignment, in this example.  You also specify the number of
# taxa in the tree.
import random
import math

def randomNumbersThatSumTo1(length, minimum):
    maximum = 1.0 - ((length - 1) * minimum)
    myRange = maximum - minimum
    rnn = [random.random() for i in range(int(length))]
    mySum = sum(rnn)
    rnn = [minimum + (rn / (mySum / myRange)) for rn in rnn]
    assert math.fabs(sum(rnn) - 1.0) < 1.0e-14
    assert min(rnn) >= minimum
    return rnn

class MySPart(object):
    def __init__(self, dataType, sLen, relRate, symbols=None):
        self.dataType = dataType
        self.sLen = sLen
        self.relRate = relRate
        self.symbols = None
        if dataType == 'standard':
            assert symbols
            self.symbols = symbols

myParts = []
myParts.append(MySPart('dna', 231, 4.6))
myParts.append(MySPart('protein', 411, 0.3))
myParts.append(MySPart('standard', 197, 2.7, symbols='123456'))

nTax = 17
taxNames = ['t%i' % i for i in range(nTax)]

aa = []
for mP in myParts:
    if mP.dataType != 'standard':
        a = func.newEmptyAlignment(dataType=mP.dataType, taxNames=taxNames,
                                   length=mP.sLen)
    else:
        a = func.newEmptyAlignment(dataType=mP.dataType, symbols=mP.symbols,
                                   taxNames=taxNames, length=mP.sLen)
    aa.append(a)
d = Data(aa)

t = func.randomTree(taxNames=taxNames)
t.data = d

for aNum in range(len(aa)):
    a = aa[aNum]
    mP = myParts[aNum]
    for myNode in t.iterNodes():
        newVal = randomNumbersThatSumTo1(a.dim, var.PIVEC_MIN)
        mt = t.newComp(partNum=aNum, free=1, spec='specified', val=newVal)
        t.setModelComponentOnNode(mt, myNode, clade=False)
    if a.dataType != 'protein':
        for myNode in t.iterNodesNoRoot():
            nRates = ((a.dim * a.dim) - a.dim) / 2
            newVal = randomNumbersThatSumTo1(nRates, var.RATE_MIN)
            mt = t.newRMatrix(partNum=aNum, free=1, spec='specified', val=newVal)
            t.setModelComponentOnNode(mt, myNode, clade=False)
    else:
        t.newRMatrix(partNum=aNum, free=0, spec='wag')
    t.setNGammaCat(partNum=aNum, nGammaCat=4)
    t.newGdasrv(partNum=aNum, free=1, val=0.5)
    t.setPInvar(partNum=aNum, free=0, val=0.0)
    t.setRelRate(partNum=aNum, val=mP.relRate)

t.model.relRatesAreFree = True
t.simulate()
t.name = 'simTree'
d.writeNexus('d.nex', writeDataBlock=True)
t.tPickle()
Consensus trees
Make a consensus tree, uncomplicated
# Make a consensus tree, uncomplicated.
tp = TreePartitions("mcmc_trees_0.nex", skip=500)
t = tp.consensus()
for n in t.iterInternalsNoRoot():
    n.name = "%.0f" % (100. * n.br.support)
t.draw()
t.writeNexus('cons.nex')
Make a consensus tree, with variations
# Make a consensus tree, with variations.
mySkip = 500
tp = TreePartitions("mcmc_trees_0.nex", skip=mySkip)
# read another file
tp.read("mcmc_trees_1.nex", skip=mySkip)
t = tp.consensus()

# Re-root
t.reRoot(t.node('myOutTaxon').parent)

# Collapse poorly supported nodes
toCollapse = [n for n in t.iterInternalsNoRoot() if n.br.support < 0.7]
for n in toCollapse:
    t.collapseNode(n)

# Transfer node.br.support to node.name as percent
for n in t.iterInternalsNoRoot():
    n.name = "%.0f" % (100. * n.br.support)

t.draw()
t.writeNexus('cons01_70.nex')
MCMC
Do an MCMC
# Do an MCMC.

# The defaults for these values are much smaller.  These suggested
# changes may not be needed all the time, but seem to help for
# difficult data.
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15

# Assuming more than one run, we set the run number by calling this script as
#     p4 sMcmc.py -- 0
# with 0, 1, 2, ...
rNum = int(var.argvAfterDoubleDash[0])

read("d.nex")
d = Data()

t = func.randomTree(taxNames=d.taxNames)
t.data = d
t.newComp(free=1, spec='empirical')
t.newRMatrix(free=1, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=0, val=0.0)

nGen = 1000000
nSamples = 2000
sInterv = int(nGen / nSamples)
nCheckPoints = 2
cpInterv = int(nGen / nCheckPoints)   # or None

m = Mcmc(t, nChains=4, runNum=rNum, sampleInterval=sInterv, checkPointInterval=cpInterv)
m.doCpo = True
m.cpo_startGen = 500000
m.run(nGen)

# func.summarizeMcmcPrams(skip=1000)
Do an MCMC with more than one data partition
# Do an MCMC with more than one data partition.

# The defaults for these values are much smaller.  These suggested
# changes may not be needed all the time, but seem to help for
# difficult data.
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15

# Assuming more than one run, we set the run number by calling this script as
#     p4 sMcmcMultiPart.py -- 0
# with 0, 1, 2, ...
rNum = int(var.argvAfterDoubleDash[0])

read("d.nex")
a = var.alignments[0]
# assuming we have a partition defined ...
a.setCharPartition('p1')
d = Data()

t = func.randomTree(taxNames=d.taxNames)
t.data = d

pNum = 0
t.newComp(partNum=pNum, free=1, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=4)
t.newGdasrv(partNum=pNum, free=1, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=5.0)

pNum = 1
t.newComp(partNum=pNum, free=1, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=1)
#t.newGdasrv(partNum=pNum, free=0, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=2.0)

if 1:
    t.model.relRatesAreFree = True
else:
    pass
    # set fixed part rates, using eg ml vals from iqtree
    # t.model.relRatesAreFree = False
    # t.model.parts[0].relRate = 0.89
    # t.model.parts[1].relRate = 1.07

nGen = 1000000
nSamples = 2000
sInterv = int(nGen / nSamples)
nCheckPoints = 2
cpInterv = int(nGen / nCheckPoints)   # or None

m = Mcmc(t, nChains=4, runNum=rNum, sampleInterval=sInterv, checkPointInterval=cpInterv)
m.doCpo = True
m.cpo_startGen = 500000
m.run(nGen)

# func.summarizeMcmcPrams(skip=500000)
Do an MCMC with the NDCH2 model
# Do an MCMC with the NDCH2 model
# Here I assume only one data partition, part 0

# The defaults for these values are much smaller.  These suggested
# changes may not be needed all the time, but seem to help for
# difficult data.
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15

# Assuming more than one run, we set the run number by calling this script as
#     p4 sMcmcNDCH2.py -- 0
# with 0, 1, 2, ...
rNum = int(var.argvAfterDoubleDash[0])

read("d.nex")
d = Data()

t = func.randomTree(taxNames=d.taxNames)
t.data = d

# Fully parameterize the comps, including a comp for the root
for n in t.iterNodes():
    c = t.newComp(free=1, spec='empirical', symbol='-')
    t.setModelComponentOnNode(c, node=n, clade=0)

# adjust these appropriately ...
t.newRMatrix(free=1, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=0, val=0.0)

# Turn on NDCH2
t.model.parts[0].ndch2 = True
t.model.parts[0].ndch2_writeComps = False   # Usually too many

# Set up the MCMC
nGen = 1000000
nSamples = 2000
sInterv = int(nGen / nSamples)
nCheckPoints = 2
cpInterv = int(nGen / nCheckPoints)   # or None

# Instantiate an Mcmc object.  Adjust nChains and runNum appropriately.
m = Mcmc(t, nChains=4, runNum=rNum, sampleInterval=sInterv, checkPointInterval=cpInterv)

# Check what proposals are turned on
# print(m.prob)

m.doCpo = True
m.cpo_startGen = 500000
m.run(nGen)
Read checkPoints from an MCMC
# Read checkPoints from an MCMC.

# Read them in ...
cpr = McmcCheckPointReader(theGlob='*')

# A table of acceptances
cpr.writeProposalAcceptances()

# Compare splits using average std deviation of split frequencies
#cpr.compareSplitsAll()
# or between only two checkpoints ...
#cpr.compareSplits(0, 1)

# How was swapping between heated chains?
#cpr.writeSwapVectors()

#cpr.writeProposalProbs()

# Get a single Mcmc object ..
if 0:
    m = cpr.mm[0]
Restart an MCMC from a checkpoint
# Restart an MCMC from a checkpoint.

# Settings made to the var module in the original MCMC are not in
# the pickled file, so you will probably need to set them again ---
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15

read("d.nex")
d = Data()

# Assuming more than one run, we set the run number by calling this script as
#     p4 sRestartMcmc.py -- 0
# with eg 0, 1, 2, ...
rNum = int(var.argvAfterDoubleDash[0])

m = func.unPickleMcmc(rNum, d)

# You probably don't want to do this, but it is possible to change
# the mcmc here, eg
# m.sampleInterval = 200

# Restart the run with a set number of generations, eg
# m.run(100000)

# Or set the number of gens in terms of the checkPointInterval
print("The checkpoint interval is currently %i" % m.checkPointInterval)

# For example, to do two more checkpoints' worth of gens ---
# m.run(m.checkPointInterval * 2)