Scripts for common tasks
2024-12-12.
This is a cookbook of reminders, suggestions, and points of departure for customization.
You can copy and paste these into your own files, or you can use the p4 recipes function,
func.recipes(), to write new files.
In these scripts I suggest variations, often by enclosing a variation in an if 1: block
to turn it on, or an if 0: block to turn it off.  Hopefully the meaning is clear and the
(sometimes mutually exclusive) possibilities are obvious.
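For example, a script might offer two mutually exclusive ways of getting a tree, with only one of them switched on.  This is a minimal sketch of the convention; the file name t.nex and the taxNames variable are just placeholders.

if 1:    # turned on: read a tree from a file
    read("t.nex")
    t = var.trees[0]
if 0:    # turned off: make a random tree instead
    t = func.randomTree(taxNames=taxNames)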
Maximum likelihood
Calculate a likelihood
# Calculate a likelihood.
read("d.nex")
d = Data()
read("t.nex")
t = var.trees[0]
t.data = d
t.newComp(free=1, spec='empirical')
t.newRMatrix(free=0, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=1, val=0.2)
t.optLogLike()
t.writeNexus('optTree.nex')
t.model.dump()
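If t.nex contains more than one candidate tree, a variation is to loop over var.trees and optimize each one with the same model setup, then compare the log likelihoods.  This is a minimal sketch; it assumes that optLogLike() leaves the optimized value in t.logLike and that the trees have names.

read("d.nex")
d = Data()
read("t.nex")                 # assuming this file holds several trees
for t in var.trees:
    t.data = d
    t.newComp(free=1, spec='empirical')
    t.newRMatrix(free=0, spec='ones')
    t.setNGammaCat(nGammaCat=4)
    t.newGdasrv(free=1, val=0.5)
    t.setPInvar(free=1, val=0.2)
    t.optLogLike()
    print(t.name, t.logLike)  # assumption: the optimized value is stored as t.logLike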
Calculate likelihood with more than one data partition
# Calculate likelihood with more than one data partition.
read("d.nex")
a = var.alignments[0]
a.setCharPartition('p1')
d = Data()
#d.dump()
read("t.nex")
t = var.trees[0]
t.data = d

pNum = 0
t.newComp(partNum=pNum, free=0, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=4)
t.newGdasrv(partNum=pNum, free=1, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=5.0)

pNum = 1
t.newComp(partNum=pNum, free=0, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=1)
#t.newGdasrv(partNum=pNum, free=0, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=2.0)

t.model.relRatesAreFree = True
t.optLogLike()
t.model.dump()
t.tPickle('opt')
Simulation
Simulate data
# Simulate data
import string       # for string.ascii_uppercase below

if 0:
    read("d.nex")
    d = Data()
if 1:
    nTax = 5
    taxNames = list(string.ascii_uppercase[:nTax])
    a = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=200)
    d = Data([a])

if 0:
    read("t.nex")
    t = var.trees[0]
    #t.taxNames = taxNames
if 0:
    read('(B:0.5, ((D:0.4, A:0.3), C:0.5), E:0.5);')
    t = var.trees[0]
    t.taxNames = taxNames
if 1:
    t = func.randomTree(taxNames=taxNames)

t.data = d
t.newComp(free=0, spec='specified', val=[0.1, 0.2, 0.3])
t.newRMatrix(free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=0, val=0.5)
t.setPInvar(free=0, val=0.2)
t.simulate()

if 1:
    d.writeNexus('d.nex')
if 0:
    d.alignments[0].writePhylip("d.phy")
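To make several simulated replicates under the same tree and model (for example for a parametric bootstrap), the end of the script above can be replaced with a loop.  This is a minimal sketch, assuming that simulate() can be called repeatedly on the same tree; the file name pattern is arbitrary.

nReplicates = 10
for rep in range(nReplicates):
    t.simulate()                           # re-simulate the data in place (assumed repeatable)
    d.writeNexus('sim_%02i.nex' % rep)     # one output file per replicate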
Simulate data with more than one data partition
# Simulate data with more than one data partition.
import string       # for string.ascii_uppercase below

nTax = 5
taxNames = list(string.ascii_uppercase[:nTax])
dnaAlign = func.newEmptyAlignment(dataType='dna', taxNames=taxNames, length=300)
read(r"#nexus begin sets; charpartition p1 = first:1-100, second:101-.; end;")
dnaAlign.setCharPartition('p1')
d = Data([dnaAlign])
#d.dump()
read("t.nex")
t = var.trees[0]
#t.taxNames = taxNames
t.data = d

pNum = 0
t.newComp(partNum=pNum, free=0, spec='specified', val=[0.2, 0.1, 0.3])
t.newRMatrix(partNum=pNum, free=0, spec='specified', val=[2., 3., 4., 5., 6., 7.])
t.setNGammaCat(partNum=pNum, nGammaCat=4)
t.newGdasrv(partNum=pNum, free=0, val=1.2)
t.setPInvar(partNum=pNum, free=0, val=0.2)
t.setRelRate(partNum=pNum, val=1.5)

pNum = 1
t.newComp(partNum=pNum, free=0, spec='specified', val=[0.45, 0.05, 0.45])
t.newRMatrix(partNum=pNum, free=0, spec='specified', val=[1.2, 3.4, 0.4, 5.9, 0.6, 1.0])
t.setNGammaCat(partNum=pNum, nGammaCat=1)
#t.newGdasrv(partNum=pNum, free=0, val=0.5)
t.setPInvar(partNum=pNum, free=0, val=0.35)
t.setRelRate(partNum=pNum, val=0.3)

t.simulate()
d.writeNexus('d.nex')
Simulate very hetero data
# Simulate very hetero data

# Below you specify the dataType, length, and relRate of each data
# partition (and symbols, if it is standard dataType).  There is one
# part per alignment, in this example.  You also specify the number of
# taxa in the tree.
import random
import math

def randomNumbersThatSumTo1(length, minimum):
    maximum = 1.0 - ((length - 1) * minimum)
    myRange = maximum - minimum
    rnn = [random.random() for i in range(int(length))]
    mySum = sum(rnn)
    rnn = [minimum + (rn / (mySum / myRange)) for rn in rnn]
    assert math.fabs(sum(rnn) - 1.0) < 1.0e-14
    assert min(rnn) >= minimum
    return rnn

class MySPart(object):
    def __init__(self, dataType, sLen, relRate, symbols=None):
        self.dataType = dataType
        self.sLen = sLen
        self.relRate = relRate
        self.symbols = None
        if dataType == 'standard':
            assert symbols
            self.symbols = symbols

myParts = []
myParts.append(MySPart('dna', 231, 4.6))
myParts.append(MySPart('protein', 411, 0.3))
myParts.append(MySPart('standard', 197, 2.7, symbols='123456'))

nTax = 17
taxNames = ['t%i' % i for i in range(nTax)]

aa = []
for mP in myParts:
    if mP.dataType != 'standard':
        a = func.newEmptyAlignment(dataType=mP.dataType, taxNames=taxNames,
                                   length=mP.sLen)
    else:
        a = func.newEmptyAlignment(dataType=mP.dataType, symbols=mP.symbols,
                                   taxNames=taxNames, length=mP.sLen)
    aa.append(a)
d = Data(aa)

t = func.randomTree(taxNames=taxNames)
t.data = d

for aNum in range(len(aa)):
    a = aa[aNum]
    mP = myParts[aNum]
    for myNode in t.iterNodes():
        newVal = randomNumbersThatSumTo1(a.dim, var.PIVEC_MIN)
        mt = t.newComp(partNum=aNum, free=1, spec='specified', val=newVal)
        t.setModelComponentOnNode(mt, myNode, clade=False)
    if a.dataType != 'protein':
        for myNode in t.iterNodesNoRoot():
            nRates = ((a.dim * a.dim) - a.dim) / 2
            newVal = randomNumbersThatSumTo1(nRates, var.RATE_MIN)
            mt = t.newRMatrix(partNum=aNum, free=1, spec='specified', val=newVal)
            t.setModelComponentOnNode(mt, myNode, clade=False)
    else:
        t.newRMatrix(partNum=aNum, free=0, spec='wag')
    t.setNGammaCat(partNum=aNum, nGammaCat=4)
    t.newGdasrv(partNum=aNum, free=1, val=0.5)
    t.setPInvar(partNum=aNum, free=0, val=0.0)
    t.setRelRate(partNum=aNum, val=mP.relRate)

t.model.relRatesAreFree = True
t.simulate()
t.name = 'simTree'
d.writeNexus('d.nex', writeDataBlock=True)
t.tPickle()
Consensus trees
Make a consensus tree, uncomplicated
# Make a consensus tree, uncomplicated.
tp = TreePartitions("mcmc_trees_0.nex", skip=500)
t = tp.consensus()
for n in t.iterInternalsNoRoot():
    n.name = "%.0f" % (100. * n.br.support)
t.draw()
t.writeNexus('cons.nex')
Make a consensus tree, with variations
# Make a consensus tree, with variations.
mySkip = 500
tp = TreePartitions("mcmc_trees_0.nex", skip=mySkip)
# read another file
tp.read("mcmc_trees_1.nex", skip=mySkip)
t = tp.consensus()

# Re-root
t.reRoot(t.node('myOutTaxon').parent)

# Collapse poorly supported nodes
toCollapse = [n for n in t.iterInternalsNoRoot() if n.br.support < 0.7]
for n in toCollapse:
    t.collapseNode(n)

# Transfer node.br.support to node.name as percent
for n in t.iterInternalsNoRoot():
    n.name = "%.0f" % (100. * n.br.support)

t.draw()
t.writeNexus('cons01_70.nex')
MCMC
Do an MCMC
# Do an MCMC.

# The defaults for these values are much smaller.  These suggested
# changes may not be needed all the time, but seem to help for
# difficult data.
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15

# Assuming more than one run, we set the run number by calling this script as
#     p4 sMcmc.py -- 0
# with 0, 1, 2, ...
rNum = int(var.argvAfterDoubleDash[0])

read("d.nex")
d = Data()

t = func.randomTree(taxNames=d.taxNames)
t.data = d
t.newComp(free=1, spec='empirical')
t.newRMatrix(free=1, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=0, val=0.0)

nGen = 1000000
nSamples = 2000
sInterv = int(nGen / nSamples)
nCheckPoints = 2
cpInterv = int(nGen / nCheckPoints)   # or None

m = Mcmc(t, nChains=4, runNum=rNum, sampleInterval=sInterv, checkPointInterval=cpInterv)
m.doCpo = True
m.cpo_startGen = 500000
m.run(nGen)

# func.summarizeMcmcPrams(skip=1000)
Do an MCMC with more than one data partition
# Do an MCMC with more than one data partition.

# The defaults for these values are much smaller.  These suggested
# changes may not be needed all the time, but seem to help for
# difficult data.
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15

# Assuming more than one run, we set the run number by calling this script as
#     p4 sMcmcMultiPart.py -- 0
# with 0, 1, 2, ...
rNum = int(var.argvAfterDoubleDash[0])

read("d.nex")
a = var.alignments[0]
# assuming we have a partition defined ...
a.setCharPartition('p1')
d = Data()

t = func.randomTree(taxNames=d.taxNames)
t.data = d

pNum = 0
t.newComp(partNum=pNum, free=1, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=4)
t.newGdasrv(partNum=pNum, free=1, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=5.0)

pNum = 1
t.newComp(partNum=pNum, free=1, spec='empirical')
t.newRMatrix(partNum=pNum, free=1, spec='ones')
t.setNGammaCat(partNum=pNum, nGammaCat=1)
#t.newGdasrv(partNum=pNum, free=0, val=0.5)
t.setPInvar(partNum=pNum, free=1, val=0.1)
t.setRelRate(partNum=pNum, val=2.0)

if 1:
    t.model.relRatesAreFree = True
else:
    pass
    # set fixed part rates, using eg ml vals from iqtree
    # t.model.relRatesAreFree = False
    # t.model.parts[0].relRate = 0.89
    # t.model.parts[1].relRate = 1.07

nGen = 1000000
nSamples = 2000
sInterv = int(nGen / nSamples)
nCheckPoints = 2
cpInterv = int(nGen / nCheckPoints)   # or None

m = Mcmc(t, nChains=4, runNum=rNum, sampleInterval=sInterv, checkPointInterval=cpInterv)
m.doCpo = True
m.cpo_startGen = 500000
m.run(nGen)

# func.summarizeMcmcPrams(skip=500000)
Do an MCMC with the NDCH2 model
# Do an MCMC with the NDCH2 model
# Here I assume only one data partition, part 0

# The defaults for these values are much smaller.  These suggested
# changes may not be needed all the time, but seem to help for
# difficult data.
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15

# Assuming more than one run, we set the run number by calling this script as
#     p4 sMcmcNDCH2.py -- 0
# with 0, 1, 2, ...
rNum = int(var.argvAfterDoubleDash[0])

read("d.nex")
d = Data()

t = func.randomTree(taxNames=d.taxNames)
t.data = d

# Fully parameterize the comps, including a comp for the root
for n in t.iterNodes():
    c = t.newComp(free=1, spec='empirical', symbol='-')
    t.setModelComponentOnNode(c, node=n, clade=0)

# adjust these appropriately ...
t.newRMatrix(free=1, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=0, val=0.0)

# Turn on NDCH2
t.model.parts[0].ndch2 = True
t.model.parts[0].ndch2_writeComps = False   # Usually too many

# Set up the MCMC
nGen = 1000000
nSamples = 2000
sInterv = int(nGen / nSamples)
nCheckPoints = 2
cpInterv = int(nGen / nCheckPoints)   # or None

# Instantiate an Mcmc object.  Adjust nChains and runNum appropriately.
m = Mcmc(t, nChains=4, runNum=rNum, sampleInterval=sInterv, checkPointInterval=cpInterv)

# Check what proposals are turned on
# print(m.prob)

m.doCpo = True
m.cpo_startGen = 500000
m.run(nGen)
Read checkPoints from an MCMC
# Read checkPoints from an MCMC.

# Read them in ...
cpr = McmcCheckPointReader(theGlob='*')

# A table of acceptances
cpr.writeProposalAcceptances()

# Compare splits using average std deviation of split frequencies
#cpr.compareSplitsAll()
# or between only two checkpoints ...
#cpr.compareSplits(0, 1)

# How was swapping between heated chains?
#cpr.writeSwapVectors()

#cpr.writeProposalProbs()

# Get a single Mcmc object ..
if 0:
    m = cpr.mm[0]
Restart an MCMC from a checkpoint
# Restart an MCMC from a checkpoint.

# Settings made to the var module in the original MCMC are not in
# the pickled file, so you will probably need to set them again ---
var.PIVEC_MIN = 1.e-6
var.RATE_MIN = 1.e-6
var.BRLEN_MIN = 1.e-5
var.GAMMA_SHAPE_MIN = 0.15

read("d.nex")
d = Data()

# Assuming more than one run, we set the run number by calling this script as
#     p4 sRestartMcmc.py -- 0
# with eg 0, 1, 2, ...
rNum = int(var.argvAfterDoubleDash[0])

m = func.unPickleMcmc(rNum, d)

# You probably don't want to do this, but it is possible to change
# the mcmc here, eg
# m.sampleInterval = 200

# Restart the run with a set number of generations, eg
# m.run(100000)

# Or set the number of gens in terms of the checkPointInterval
print("The checkpoint interval is currently %i" % m.checkPointInterval)

# For example, to do two more checkpoints' worth of gens ---
# m.run(m.checkPointInterval * 2)