Source code for p4.posteriorsamples

from p4.p4exceptions import P4Error
from p4.tree import Tree
import p4.nexus
from p4.var import var

import os
import string
import io
import copy
import math
import sys

[docs]class PosteriorSamples(object): """A container for mcmc samples from files. This would be useful if you wanted to do eg posterior predictive simulations based on the posterior distribution from an already completed run. This reads in the tree and parameter files, and puts the two together to make p4 Tree objects with a model attached. For each, you would then attach a data object and do your simulation. After getting it instantiated, the only method is getSample(sampNum), which gets the zero-based sampNum'th sample. This will handle output from p4 or MrBayes. It will only do the MrBayes models that p4 can do as well. It will only do one 'run' at a time. It will do models on partitioned data, and p4 tree-hetero models. **Args** tree A p4 tree, with model attached. It must also have taxNames attached, in their proper order. runNum The run that you want to use. It would be zero-based for p4 files, and 1-based for MrBayes files. Eg for the p4 file mcmc_trees_0.nex you would say runNum=0, and for the MrBayes file foo.run2.p you would say runNum=2. program 'p4' or 'mrbayes' mbBaseName If you are trying to read MrBayes files, you need to supply the 'baseName' for the files. Eg if the run1 tree file is foo.run1.t and the run1 parameters file is foo.run1.p, you would supply mbBaseName='foo'. directory The directory where the tree files and parameter files are located. By default, it is '.', which is the current directory. verbose An integer, from 0-3. Zero is quiet. The bigger the number, the chattier it gets. **Example**:: # We need a tree with an attached model. Lets say that we have a one # partition model, with a GTR+G model. Do the usual p4 model setup. read('myTree.nex') t = var.trees[0] # Read in some data. read('myData.nex') d = Data() t.data = d t.newComp(free=1, spec='empirical') t.newRMatrix(free=1, spec='ones') t.setNGammaCat(nGammaCat=4) t.newGdasrv(free=1, val=0.5) t.setPInvar(free=0, val=0.0) # Check to make sure its all good to go. # (This step is needed for empirical comps, at least) t.data = d t.calcLogLike() # Instantiate ps = PosteriorSamples(t, runNum=1, program='mrbayes', mbBaseName='mbout', verbose=2) # Iterate over samples, saving your favourite test quantity from simulations as you go. myStats = [] for sampNum in range(500,1000): t2 = ps.getSample(sampNum) t2.data = d t2.simulate() myStats.append(t2.data.simpleBigXSquared()[0]) """ def __init__(self, tree, runNum, program='p4', mbBaseName=None, directory='.', verbose=1): gm = ["PosteriorSamples() init"] if not isinstance(tree, Tree) or not tree.model: gm.append("Instantiate with a p4 tree with a model attached.") raise P4Error(gm) if not tree.taxNames: gm.append( "The tree should have taxNames (in proper order!) attached.") raise P4Error(gm) self.tree = tree # Check the tree, by calculating the likelihood # self.tree.calcLogLike(verbose=False) self.model = copy.deepcopy(self.tree.model) for pNum in range(self.model.nParts): for compNum in range(self.model.parts[pNum].nComps): if self.model.parts[pNum].comps[compNum].spec == 'empirical': self.model.parts[pNum].comps[compNum].spec = 'specified' if not self.model.parts[pNum].comps[compNum].val[0]: gm.append( "Comp %i in partition %i has no val set." % (compNum, pNum)) gm.append("Maybe fix by calculating a likelihood before?") raise P4Error(gm) self.model.cModel = None self.runNum = int(runNum) self.goodPrograms = ['p4', 'mrbayes'] lowProgram = program.lower() if program not in self.goodPrograms: gm.append( "The program generating the files should be one of %s" % self.goodPrograms) raise P4Error(gm) self.program = lowProgram self.verbose = verbose assert os.path.isdir(directory) self.directory = directory if self.program == 'p4': self._readP4Files() elif self.program == 'mrbayes': self.mbBaseName = mbBaseName self._readMrBayesFiles() self.nSamples = len(self.tLines) if self.tree.model.nFreePrams: nPLines = len(self.pLines) if self.nSamples and self.nSamples == nPLines: if self.verbose >= 1: print("Got %i samples." % self.nSamples) else: gm.append( "Got %i tree samples, but %i parameter samples." % (self.nSamples, nPLines)) raise P4Error(gm) else: # print "Got %i samples. (no free parameters)" % self.nSamples pass
[docs] def getSample(self, sampNum): if self.program == 'p4': return self._getP4SampleTree(sampNum) elif self.program == 'mrbayes': return self._getMrBayesSampleTree(sampNum)
def _readP4Files(self): gm = ["PosteriorSamples._readP4Files()"] # Read in the trees fName = "mcmc_trees_%i.nex" % self.runNum if self.directory != '.': fName = os.path.join(self.directory, fName) try: f = open(fName) except IOError: gm.append("Can't find tree file '%s'" % fName) raise P4Error(gm) fLines = f.readlines() f.close() # Get the translate command lNum = 0 aLine = fLines[0].strip() translateLines = [] while not aLine.startswith("translate"): lNum += 1 aLine = fLines[lNum].strip() lNum += 1 aLine = fLines[lNum].strip() while not aLine.startswith(";"): translateLines.append(aLine) lNum += 1 aLine = fLines[lNum].strip() translateLines.append(aLine) translateFlob = io.StringIO(' '.join(translateLines)) nx = p4.nexus.Nexus() self.translationHash = nx.readTranslateCommand(translateFlob) #print(self.translationHash) # Get the models definition, if it exists. Move to the first tree # line. while not aLine.startswith("tree t_"): if aLine.startswith("[&&p4 models"): self.modelLine = aLine lNum += 1 aLine = fLines[lNum].strip() # Get the tree lines. self.tLines = [] while not aLine.startswith("end;"): self.tLines.append(aLine[5:]) lNum += 1 aLine = fLines[lNum].strip() if self.tree.model.nFreePrams: # Read in the prams fName = "mcmc_prams_%i" % self.runNum if self.directory != '.': fName = os.path.join(self.directory, fName) try: f = open(fName) except IOError: gm.append("Can't find prams file '%s'" % fName) raise P4Error(gm) fLines = f.readlines() f.close() lNum = 0 aLine = fLines[0].strip() while aLine[0] not in string.digits: lNum += 1 aLine = fLines[lNum].strip() self.pLines = [] while aLine: self.pLines.append(aLine) lNum += 1 try: aLine = fLines[lNum].strip() except IndexError: break # Read in the pramsProfile self.nPrams = None self.pramsProfile = None fName = "mcmc_pramsProfile_%i.py" % self.runNum if self.directory != '.': fName = os.path.join(self.directory, fName) try: loc = {} exec(open(fName).read(), {}, loc) # loc =locals() no workee. # print "loc = %s" % loc self.nPrams = loc['nPrams'] self.pramsProfile = loc['pramsProfile'] except IOError: print("The file '%s' cannot be found." % fName) def _getP4SampleTree(self, sampNum): tLine = self.tLines[sampNum] if self.verbose >= 3: print(tLine) if 1: if sys.version_info < (3,): f = io.BytesIO(tLine) else: f = io.StringIO(tLine) t = Tree() t.parseNexus(f, translationHash=self.translationHash, doModelComments=self.tree.model.nParts) t.taxNames = self.tree.taxNames for n in t.iterLeavesNoRoot(): n.seqNum = t.taxNames.index(n.name) t.model = copy.deepcopy(self.model) if self.tree.model.nFreePrams: pLine = self.pLines[sampNum] if self.verbose >= 3: print(pLine) splitPLine = pLine.split() pGenNum = int(splitPLine[0]) splitTName = t.name.split('_') tGenNum = int(splitTName[1]) if tGenNum != pGenNum: raise P4Error( "something wrong. tGenNum=%i, pGenNum=%i" % (tGenNum, pGenNum)) if self.verbose >= 2: print("(zero-based) sample %i is gen %i" % (sampNum, tGenNum)) # t.model.dump() splIndx = 1 for pNum in range(len(self.pramsProfile)): compNum = 0 rMatrixNum = 0 gdasrvNum = 0 for desc in self.pramsProfile[pNum]: if desc[0] == 'relRate': t.model.parts[pNum].relRate = float(splitPLine[splIndx]) splIndx += 1 elif desc[0] == 'comp': vv = [] for i in range(desc[1]): vv.append(float(splitPLine[splIndx])) splIndx += 1 for i in range(desc[1]): if vv[i] < var.PIVEC_MIN: vv[i] = var.PIVEC_MIN * 1.1 thisSum = sum(vv) factor = 1.0 / thisSum # must sum to one for i in range(desc[1]): t.model.parts[pNum].comps[ compNum].val[i] = vv[i] * factor compNum += 1 elif desc[0] == 'rMatrix': vv = [] for i in range(desc[1]): vv.append(float(splitPLine[splIndx])) splIndx += 1 if len(vv) == 1: # its a '2p' model, with a kappa t.model.parts[pNum].rMatrices[ rMatrixNum].val[0] = vv[0] else: # gtr for i in range(desc[1]): if vv[i] < var.RATE_MIN: vv[i] = var.RATE_MIN * 1.1 thisSum = sum(vv) factor = 1.0 / thisSum # must sum to one for i in range(desc[1]): t.model.parts[pNum].rMatrices[ rMatrixNum].val[i] = vv[i] * factor rMatrixNum += 1 elif desc[0] == 'gdasrv': t.model.parts[pNum].gdasrvs[gdasrvNum].val[ 0] = float(splitPLine[splIndx]) splIndx += 1 gdasrvNum += 1 elif desc[0] == 'pInvar': t.model.parts[pNum].pInvar.val = float( splitPLine[splIndx]) splIndx += 1 if splIndx != len(splitPLine): raise P4Error("Something is wrong. After reading, splIndx=%i, but len split pram line=%i" % ( splIndx, len(splitPLine))) return t def _readMrBayesFiles(self): gm = ["PosteriorSamples._readMrBayesFiles()"] # Read in the trees fName = "%s.run%i.t" % (self.mbBaseName, self.runNum) if self.directory != '.': fName = os.path.join(self.directory, fName) try: f = open(fName) except IOError: gm.append("Can't find tree file '%s'" % fName) raise P4Error(gm) fLines = f.readlines() f.close() # Get the translate command lNum = 0 aLine = fLines[0].strip() translateLines = [] while not aLine.startswith("translate"): lNum += 1 aLine = fLines[lNum].strip() lNum += 1 aLine = fLines[lNum].strip() while not aLine.endswith(";"): translateLines.append(aLine) lNum += 1 aLine = fLines[lNum].strip() translateLines.append(aLine) translateFlob = io.StringIO(' '.join(translateLines)) nx = p4.nexus.Nexus() self.translationHash = nx.readTranslateCommand(translateFlob) # print self.translationHash # Get the models definition, if it exists. Move to the first tree line. # MrBayes3.2 uses 'gen', 3.1.2 uses 'rep'. while not aLine.startswith("tree gen.") and not aLine.startswith("tree rep."): lNum += 1 aLine = fLines[lNum].strip() # Get the tree lines. self.tLines = [] while not aLine.startswith("end;"): self.tLines.append(aLine[5:]) lNum += 1 aLine = fLines[lNum].strip() # Read in the prams fName = "%s.run%i.p" % (self.mbBaseName, self.runNum) if self.directory != '.': fName = os.path.join(self.directory, fName) try: f = open(fName) except IOError: gm.append("Can't find prams file '%s'" % fName) raise P4Error(gm) fLines = f.readlines() f.close() # Get the header line, starts with Gen lNum = 1 aLine = fLines[lNum].strip() self.pramsHeader = aLine.split() if self.verbose >= 2: print("pramsHeader: %s" % self.pramsHeader) assert self.pramsHeader[0] == 'Gen' # Collect pram lines lNum += 1 aLine = fLines[lNum].strip() self.pLines = [] while aLine: self.pLines.append(aLine) lNum += 1 try: aLine = fLines[lNum].strip() except IndexError: break # print self.pLines self.nPrams = len(self.pramsHeader) if self.verbose >= 2: print("pram line length is %i" % self.nPrams) def _getMrBayesSampleTree(self, sampNum): tLine = self.tLines[sampNum] if self.verbose >= 3: print(tLine) if 1: if sys.version_info < (3,): f = io.BytesIO(tLine) else: f = io.StringIO(tLine) t = Tree() t.parseNexus(f, translationHash=self.translationHash, doModelComments=self.tree.model.nParts) # doModelComments is nParts t.taxNames = self.tree.taxNames for n in t.iterLeavesNoRoot(): n.seqNum = t.taxNames.index(n.name) t.model = copy.deepcopy(self.model) pLine = self.pLines[sampNum] if self.verbose >= 3: print(pLine) splitPLine = pLine.split() pGenNum = int(splitPLine[0]) splitTName = t.name.split('.') tGenNum = int(splitTName[1]) if tGenNum != pGenNum: raise P4Error("something wrong. tGenNum=%i, pGenNum=%i" % (tGenNum, pGenNum)) if self.verbose >= 2: print("(zero-based) sample %i is gen %i" % (sampNum, tGenNum)) # t.model.dump() splIndx = 3 # but could be Gen LnL LnPr TL ... while splIndx < self.nPrams: pNum = 0 #print("splIndx = %i, pramsHeader = %s" % (splIndx, self.pramsHeader[splIndx])) if self.pramsHeader[splIndx].startswith('r(A<->C)'): if self.tree.model.nParts > 1: try: splitPramHeader = self.pramsHeader[ splIndx].split('{')[1][:-1] pNum = int(splitPramHeader) pNum -= 1 except: raise P4Error("could not get the part number") thisSum = 0.0 for i in range(6): theFloat = float(splitPLine[splIndx]) t.model.parts[pNum].rMatrices[0].val[i] = theFloat thisSum += theFloat splIndx += 1 factor = 1.0 / thisSum # must sum to one for i in range(6): t.model.parts[pNum].rMatrices[0].val[i] *= factor elif self.pramsHeader[splIndx].startswith('pi(A)'): if self.tree.model.nParts > 1: try: splitPramHeader = self.pramsHeader[ splIndx].split('{')[1][:-1] pNum = int(splitPramHeader) pNum -= 1 except: raise P4Error("could not get the part number") thisSum = 0.0 theseComps = [] for i in range(4): theFloat = float(splitPLine[splIndx]) t.model.parts[pNum].comps[0].val[i] = theFloat thisSum += theFloat theseComps.append(theFloat) splIndx += 1 if math.fabs(1.0 - thisSum) > 1.e-6: gm = ["The current parameters line is ---"] gm.append(pLine) gm.append("Compositions %s in (zero-based) partition %i "\ "sum to %f; should sum to 1.0" % (theseComps, pNum, thisSum)) raise P4Error(gm) factor = 1.0 / thisSum # must sum to one for i in range(4): t.model.parts[pNum].comps[0].val[i] *= factor elif self.pramsHeader[splIndx].startswith('alpha'): if self.tree.model.nParts > 1: try: splitPramHeader = self.pramsHeader[ splIndx].split('{')[1][:-1] pNum = int(splitPramHeader) pNum -= 1 except: raise P4Error("could not get the part number") # print "got pNum = %i" % pNum # print "got splitPLine[%i] = %s" % (splIndx, # splitPLine[splIndx]) t.model.parts[pNum].gdasrvs[0].val[ 0] = float(splitPLine[splIndx]) splIndx += 1 elif self.pramsHeader[splIndx].startswith('pinvar'): if self.tree.model.nParts > 1: try: splitPramHeader = self.pramsHeader[splIndx].split('{')[1][:-1] print("pinvar", splitPramHeader) pNum = int(splitPramHeader) pNum -= 1 except: raise P4Error("could not get the part number") t.model.parts[pNum].pInvar.val = float(splitPLine[splIndx]) splIndx += 1 elif self.pramsHeader[splIndx].startswith('m'): if self.tree.model.nParts > 1: try: splitPramHeader = self.pramsHeader[ splIndx].split('{')[1][:-1] pNum = int(splitPramHeader) pNum -= 1 except: raise P4Error("could not get the part number") t.model.parts[pNum].relRate = float(splitPLine[splIndx]) splIndx += 1 elif self.pramsHeader[splIndx].startswith('TL'): splIndx += 1 else: print("splIndx=%i. Got unknown pram %s. Fix me!" % (splIndx, self.pramsHeader[splIndx])) splIndx += 1 if splIndx != len(splitPLine): raise P4Error("Something is wrong. After reading, splIndx=%i, but len split pram line=%i" % ( splIndx, len(splitPLine))) return t