Scripts for common tasks
2025-01-29.
This is a cookbook of reminders, suggestions, and points of departure
for customization. You can copy and paste these scripts into your own
files, or you can use the p4 recipes function, func.recipes(), to write
new files.
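A minimal sketch of the latter, from an interactive p4 session (exactly
how func.recipes() prompts you for which recipe to write may vary by
version):

    func.recipes()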
In these scripts I suggest variations, often by enclosing the variation
in an "if 1:" block to turn it on, or an "if 0:" block to turn it off,
as in the short example below. Hopefully the meaning is clear and the
(sometimes mutually exclusive) possibilities are obvious.
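For example, these two blocks (taken from the simulation recipe below)
offer mutually exclusive ways to get a tree; as written, the random tree
is turned on, and you would swap the 1 and the 0 to read the tree from a
file instead:

    if 0:
        read("t.nex")
        t = var.trees[0]
    if 1:
        t = func.randomTree(taxNames=taxNames)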
Maximum likelihood
Calculate a likelihood
    # Calculate a likelihood.
    read("d.nex")
    d = Data()
    read("t.nex")
    t = var.trees[0]
    t.data = d
    t.newComp(free=1, spec='empirical')
    t.newRMatrix(free=0, spec='ones')
    t.setNGammaCat(nGammaCat=4)
    t.newGdasrv(free=1, val=0.5)
    t.setPInvar(free=1, val=0.2)
    t.optLogLike()
    t.writeNexus('optTree.nex')
    t.model.dump()
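After t.optLogLike() the optimized log likelihood is reported, and it
should also be left on the tree, so you can keep it for later
comparisons. A minimal sketch, assuming the logLike attribute is set by
the optimization:

    # logLike is assumed to be set by t.optLogLike()
    print("optimized log likelihood: %f" % t.logLike)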
Calculate likelihood with more than one data partition
    # Calculate likelihood with more than one data partition.
    read("d.nex")
    a = var.alignments[0]
    a.setCharPartition('p1')
    d = Data()
    #d.dump()
    read("t.nex")
    t = var.trees[0]
    t.data = d

    pNum = 0
    t.newComp(partNum=pNum, free=0, spec='empirical')
    t.newRMatrix(partNum=pNum, free=1, spec='ones')
    t.setNGammaCat(partNum=pNum, nGammaCat=4)
    t.newGdasrv(partNum=pNum, free=1, val=0.5)
    t.setPInvar(partNum=pNum, free=1, val=0.1)
    t.setRelRate(partNum=pNum, val=5.0)

    pNum = 1
    t.newComp(partNum=pNum, free=0, spec='empirical')
    t.newRMatrix(partNum=pNum, free=1, spec='ones')
    t.setNGammaCat(partNum=pNum, nGammaCat=1)
    #t.newGdasrv(partNum=pNum, free=0, val=0.5)
    t.setPInvar(partNum=pNum, free=1, val=0.1)
    t.setRelRate(partNum=pNum, val=2.0)

    t.model.relRatesAreFree = True
    t.optLogLike()
    t.model.dump()
    t.tPickle('opt')
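This script assumes that the char partition 'p1' has been defined, for
example in a NEXUS sets block in d.nex. If it has not, you can define it
on the fly before calling setCharPartition(), following the pattern used
in the simulation recipe below (the character ranges here are
placeholders; adjust them to your data):

    # Hypothetical ranges; adjust 'first' and 'second' to your data
    read(r"#nexus begin sets; charpartition p1 = first:1-100, second:101-.; end;")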
Simulation
Simulate data
    # Simulate data

    import string

    if 0:
        read("d.nex")
        d = Data()
    if 1:
        nTax = 5
        taxNames = list(string.ascii_uppercase[:nTax])
        a = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=200)
        d = Data([a])

    if 0:
        read("t.nex")
        t = var.trees[0]
        #t.taxNames = taxNames
    if 0:
        read('(B:0.5, ((D:0.4, A:0.3), C:0.5), E:0.5);')
        t = var.trees[0]
        t.taxNames = taxNames
    if 1:
        t = func.randomTree(taxNames=taxNames)

    t.data = d
    t.newComp(free=0, spec='specified', val=[0.1, 0.2, 0.3])
    t.newRMatrix(free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
    t.setNGammaCat(nGammaCat=4)
    t.newGdasrv(free=0, val=0.5)
    t.setPInvar(free=0, val=0.2)

    t.simulate()

    if 1:
        d.writeNexus('d.nex')
    if 0:
        d.alignments[0].writePhylip("d.phy")
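If you simulate replicate data sets in separate runs of the same script,
note that the simulation has its own pseudo-random number generator, so
identical scripts can produce identical "random" results. A minimal
sketch of reseeding it per process, assuming func.reseedCRandomizer() is
available in your p4 version:

    import os

    # Give each process its own seed for the simulation randomizer
    func.reseedCRandomizer(os.getpid())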
Simulate data with more than one data partition
    # Simulate data with more than one data partition.

    import string

    nTax = 5
    taxNames = list(string.ascii_uppercase[:nTax])
    dnaAlign = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=300)
    read(r"#nexus begin sets; charpartition p1 = first:1-100, second:101-.; end;")
    dnaAlign.setCharPartition('p1')
    d = Data([dnaAlign])
    #d.dump()

    read("t.nex")
    t = var.trees[0]
    #t.taxNames = taxNames
    t.data = d

    pNum = 0
    t.newComp(partNum=pNum, free=0, spec='specified', val=[0.2, 0.1, 0.3])
    t.newRMatrix(partNum=pNum, free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
    t.setNGammaCat(partNum=pNum, nGammaCat=4)
    t.newGdasrv(partNum=pNum, free=0, val=1.2)
    t.setPInvar(partNum=pNum, free=0, val=0.2)
    t.setRelRate(partNum=pNum, val=1.5)

    pNum = 1
    t.newComp(partNum=pNum, free=0, spec='specified', val=[0.45, 0.05, 0.45])
    t.newRMatrix(partNum=pNum, free=0, spec='specified', val=[1.2, 3.4, 0.4, 5.9, 0.6, 1.0])
    t.setNGammaCat(partNum=pNum, nGammaCat=1)
    #t.newGdasrv(partNum=pNum, free=0, val=0.5)
    t.setPInvar(partNum=pNum, free=0, val=0.35)
    t.setRelRate(partNum=pNum, val=0.3)

    t.simulate()
    d.writeNexus('d.nex')
Simulate very hetero data
    # Simulate very hetero data.
    # Below you specify the dataType, length, and relRate of each data
    # partition (and symbols, if it is the standard dataType).  There is
    # one part per alignment, in this example.  You also specify the
    # number of taxa in the tree.

    import random
    import math

    def randomNumbersThatSumTo1(length, minimum):
        """Return 'length' random numbers, each at least 'minimum', summing to 1."""
        maximum = 1.0 - ((length - 1) * minimum)
        myRange = maximum - minimum
        rnn = [random.random() for i in range(int(length))]
        mySum = sum(rnn)
        rnn = [minimum + (rn / (mySum / myRange)) for rn in rnn]
        assert math.fabs(sum(rnn) - 1.0) < 1.0e-14
        assert min(rnn) >= minimum
        return rnn

    class MySPart(object):
        def __init__(self, dataType, sLen, relRate, symbols=None):
            self.dataType = dataType
            self.sLen = sLen
            self.relRate = relRate
            self.symbols = None
            if dataType == 'standard':
                assert symbols
                self.symbols = symbols

    myParts = []
    myParts.append(MySPart('dna', 231, 4.6))
    myParts.append(MySPart('protein', 411, 0.3))
    myParts.append(MySPart('standard', 197, 2.7, symbols='123456'))

    nTax = 17
    taxNames = ['t%i' % i for i in range(nTax)]

    # Make one empty alignment per part
    aa = []
    for mP in myParts:
        if mP.dataType != 'standard':
            a = func.newEmptyAlignment(dataType=mP.dataType, taxNames=taxNames,
                                       length=mP.sLen)
        else:
            a = func.newEmptyAlignment(dataType=mP.dataType, symbols=mP.symbols,
                                       taxNames=taxNames, length=mP.sLen)
        aa.append(a)
    d = Data(aa)

    t = func.randomTree(taxNames=taxNames)
    t.data = d

    for aNum in range(len(aa)):
        a = aa[aNum]
        mP = myParts[aNum]
        # A separate random comp on every node
        for myNode in t.iterNodes():
            newVal = randomNumbersThatSumTo1(a.dim, var.PIVEC_MIN)
            mt = t.newComp(partNum=aNum, free=1, spec='specified', val=newVal)
            t.setModelComponentOnNode(mt, myNode, clade=False)
        if a.dataType != 'protein':
            # A separate random rMatrix on every branch
            for myNode in t.iterNodesNoRoot():
                nRates = ((a.dim * a.dim) - a.dim) // 2
                newVal = randomNumbersThatSumTo1(nRates, var.RATE_MIN)
                mt = t.newRMatrix(partNum=aNum, free=1, spec='specified', val=newVal)
                t.setModelComponentOnNode(mt, myNode, clade=False)
        else:
            t.newRMatrix(partNum=aNum, free=0, spec='wag')
        t.setNGammaCat(partNum=aNum, nGammaCat=4)
        t.newGdasrv(partNum=aNum, free=1, val=0.5)
        t.setPInvar(partNum=aNum, free=0, val=0.0)
        t.setRelRate(partNum=aNum, val=mP.relRate)

    t.model.relRatesAreFree = True
    t.simulate()
    t.name = 'simTree'
    d.writeNexus('d.nex', writeDataBlock=True)
    t.tPickle()
Consensus trees
Make a consensus tree, uncomplicated
    # Make a consensus tree, uncomplicated.
    tp = TreePartitions("mcmc_trees_0.nex", skip=500)
    t = tp.consensus()
    for n in t.iterInternalsNoRoot():
        n.name = "%.0f" % (100. * n.br.support)
    t.draw()
    t.writeNexus('cons.nex')
Make a consensus tree, with variations
    # Make a consensus tree, with variations.
    mySkip = 500
    tp = TreePartitions("mcmc_trees_0.nex", skip=mySkip)

    # Read another file
    tp.read("mcmc_trees_1.nex", skip=mySkip)

    t = tp.consensus()

    # Re-root
    t.reRoot(t.node('myOutTaxon').parent)

    # Collapse poorly supported nodes
    toCollapse = [n for n in t.iterInternalsNoRoot() if n.br.support < 0.7]
    for n in toCollapse:
        t.collapseNode(n)

    # Transfer node.br.support to node.name, as a percentage
    for n in t.iterInternalsNoRoot():
        n.name = "%.0f" % (100. * n.br.support)

    t.draw()
    t.writeNexus('cons01_70.nex')
MCMC
Do an MCMC
    # Do an MCMC.

    # The defaults for these values are much smaller.  These suggested
    # changes may not be needed all the time, but they seem to help for
    # difficult data.
    var.PIVEC_MIN = 1.e-6
    var.RATE_MIN = 1.e-6
    var.BRLEN_MIN = 1.e-5
    var.GAMMA_SHAPE_MIN = 0.15

    # Assuming more than one run, we set the run number by calling this
    # script as
    #     p4 sMcmc.py -- 0
    # for run numbers 0, 1, 2, ...
    rNum = int(var.argvAfterDoubleDash[0])

    read("d.nex")
    d = Data()

    t = func.randomTree(taxNames=d.taxNames)
    t.data = d
    t.newComp(free=1, spec='empirical')
    t.newRMatrix(free=1, spec='ones')
    t.setNGammaCat(nGammaCat=4)
    t.newGdasrv(free=1, val=0.5)
    t.setPInvar(free=0, val=0.0)

    nGen = 1000000
    nSamples = 2000
    sInterv = int(nGen / nSamples)
    nCheckPoints = 2
    cpInterv = int(nGen / nCheckPoints)   # or None

    m = Mcmc(t, nChains=4, runNum=rNum, sampleInterval=sInterv,
             checkPointInterval=cpInterv)
    m.doCpo = True
    m.cpo_startGen = 500000
    m.run(nGen)

    # func.summarizeMcmcPrams(skip=1000)
Do an MCMC with more than one data partition
    # Do an MCMC with more than one data partition.

    # The defaults for these values are much smaller.  These suggested
    # changes may not be needed all the time, but they seem to help for
    # difficult data.
    var.PIVEC_MIN = 1.e-6
    var.RATE_MIN = 1.e-6
    var.BRLEN_MIN = 1.e-5
    var.GAMMA_SHAPE_MIN = 0.15

    # Assuming more than one run, we set the run number by calling this
    # script as
    #     p4 sMcmcMultiPart.py -- 0
    # for run numbers 0, 1, 2, ...
    rNum = int(var.argvAfterDoubleDash[0])

    read("d.nex")
    a = var.alignments[0]
    # Assuming we have a partition defined ...
    a.setCharPartition('p1')
    d = Data()

    t = func.randomTree(taxNames=d.taxNames)
    t.data = d

    pNum = 0
    t.newComp(partNum=pNum, free=1, spec='empirical')
    t.newRMatrix(partNum=pNum, free=1, spec='ones')
    t.setNGammaCat(partNum=pNum, nGammaCat=4)
    t.newGdasrv(partNum=pNum, free=1, val=0.5)
    t.setPInvar(partNum=pNum, free=1, val=0.1)
    t.setRelRate(partNum=pNum, val=5.0)

    pNum = 1
    t.newComp(partNum=pNum, free=1, spec='empirical')
    t.newRMatrix(partNum=pNum, free=1, spec='ones')
    t.setNGammaCat(partNum=pNum, nGammaCat=1)
    #t.newGdasrv(partNum=pNum, free=0, val=0.5)
    t.setPInvar(partNum=pNum, free=1, val=0.1)
    t.setRelRate(partNum=pNum, val=2.0)

    if 1:
        t.model.relRatesAreFree = True
    else:
        pass
        # # Set fixed part rates, using eg ML values from iqtree
        # t.model.relRatesAreFree = False
        # t.model.parts[0].relRate = 0.89
        # t.model.parts[1].relRate = 1.07

    nGen = 1000000
    nSamples = 2000
    sInterv = int(nGen / nSamples)
    nCheckPoints = 2
    cpInterv = int(nGen / nCheckPoints)   # or None

    m = Mcmc(t, nChains=4, runNum=rNum, sampleInterval=sInterv,
             checkPointInterval=cpInterv)
    m.doCpo = True
    m.cpo_startGen = 500000
    m.run(nGen)

    # func.summarizeMcmcPrams(skip=500000)
Do an MCMC with the NDCH2 model
    # Do an MCMC with the NDCH2 model.
    # Here I assume only one data partition, part 0.

    # The defaults for these values are much smaller.  These suggested
    # changes may not be needed all the time, but they seem to help for
    # difficult data.
    var.PIVEC_MIN = 1.e-6
    var.RATE_MIN = 1.e-6
    var.BRLEN_MIN = 1.e-5
    var.GAMMA_SHAPE_MIN = 0.15

    # Assuming more than one run, we set the run number by calling this
    # script as
    #     p4 sMcmcNDCH2.py -- 0
    # for run numbers 0, 1, 2, ...
    rNum = int(var.argvAfterDoubleDash[0])

    read("d.nex")
    d = Data()

    t = func.randomTree(taxNames=d.taxNames)
    t.data = d

    # Fully parameterize the comps, including a comp for the root
    for n in t.iterNodes():
        c = t.newComp(free=1, spec='empirical', symbol='-')
        t.setModelComponentOnNode(c, node=n, clade=0)

    # Adjust these appropriately ...
    t.newRMatrix(free=1, spec='ones')
    t.setNGammaCat(nGammaCat=4)
    t.newGdasrv(free=1, val=0.5)
    t.setPInvar(free=0, val=0.0)

    # Turn on NDCH2
    t.model.parts[0].ndch2 = True
    t.model.parts[0].ndch2_writeComps = False   # usually too many

    # Set up the MCMC
    nGen = 1000000
    nSamples = 2000
    sInterv = int(nGen / nSamples)
    nCheckPoints = 2
    cpInterv = int(nGen / nCheckPoints)   # or None

    # Instantiate an Mcmc object.  Adjust nChains and runNum appropriately.
    m = Mcmc(t, nChains=4, runNum=rNum, sampleInterval=sInterv,
             checkPointInterval=cpInterv)

    # Check which proposals are turned on
    # print(m.prob)

    m.doCpo = True
    m.cpo_startGen = 500000
    m.run(nGen)
Read checkPoints from an MCMC
    # Read checkPoints from an MCMC.

    # Read them in ...
    cpr = McmcCheckPointReader(theGlob='*')

    # A table of acceptances
    cpr.writeProposalAcceptances()

    # Compare splits using the average standard deviation of split frequencies
    # cpr.compareSplitsAll()
    # or between only two checkpoints ...
    # cpr.compareSplits(0, 1)

    # How was swapping between heated chains?
    # cpr.writeSwapVectors()

    # cpr.writeProposalProbs()

    # Get a single Mcmc object ...
    if 0:
        m = cpr.mm[0]
Restart an MCMC from a checkpoint
    # Restart an MCMC from a checkpoint.

    # Settings made to the var module in the original MCMC are not in
    # the pickled file, so you will probably need to set them again ---
    var.PIVEC_MIN = 1.e-6
    var.RATE_MIN = 1.e-6
    var.BRLEN_MIN = 1.e-5
    var.GAMMA_SHAPE_MIN = 0.15

    read("d.nex")
    d = Data()

    # Assuming more than one run, we set the run number by calling this
    # script as
    #     p4 sRestartMcmc.py -- 0
    # for run numbers eg 0, 1, 2, ...
    rNum = int(var.argvAfterDoubleDash[0])

    m = func.unPickleMcmc(rNum, d)

    # You probably don't want to do this, but it is possible to change
    # the mcmc here, eg
    # m.sampleInterval = 200

    # Restart the run with a set number of generations, eg
    # m.run(100000)

    # Or set the number of gens in terms of the checkPointInterval
    print("The checkpoint interval is currently %i" % m.checkPointInterval)
    # For example, to do two more checkpoints' worth of gens:
    # m.run(m.checkPointInterval * 2)