Source code for p4.tree

import sys
import string
import io
import math
import copy
import os
import p4.func
import time
import glob
from p4.var import var
from p4.p4exceptions import P4Error
from p4.node import Node, NodePart, NodeBranch, NodeBranchPart
from p4.nexustoken import nextTok, safeNextTok
from p4.distancematrix import DistanceMatrix

import numpy
import p4.pf as pf
from p4.model import Model
from p4.data import Data
from p4.alignment import Part
from p4.treepicture import TreePicture
import random
import pickle

def _fixFileName(fName):
    if fName.count('.') or fName.count(' '):
        fName = list(fName)
        for i in range(len(fName)):
            theChar = fName[i]
            if theChar == '.' or theChar == ' ':
                fName[i] = '_'
        fName = ''.join(fName)
    return fName

longMessage1 = """

(Boring old) Chi-square test for compositional homogeneity
==========================================================

    The statistic is Sum[(Obs - Exp)^2 / Exp],
        where Exp comes from the data.
    The statistic is X^2 (X squared), in the sense used by
        Sokal & Rohlf in 'Biometry', to distinguish it from
        'chi square', which is a distribution, not a statistic.
    This is (mostly) the same as the test in PAUP, using the
        basefreq command.  There are small differences having
        to do with calculation of degrees of freedom.
    Significance is assessed by Chi-square.

"""

longMessage2 = """

Tree- and model-based composition fit test
==========================================

    The statistic is Sum[(Obs - Exp)^2 / Exp],
        like the statistic used in the Chi-squared test,
        except that Exp comes from the model, not from the data.
    I call the statistic X^2_m (X squared sub m) since it is like
        the X^2 statistic used in the chi-square test above, but uses
        the m subscript to say that the expected values come from the model.
    Significance is assessed by simulations on the tree and model.
    A critical point of 95% is used to decide if the data fits or not.
"""



    # def __del__(self, freeTree=pf.p4_freeTree, freeNode=pf.p4_freeNode):
    # def __del__(self, freeTree=pf.p4_freeTree, dp_freeTree = pf.dp_freeTree, mysys=sys):
    # def __del__(self, freeTree=pf.p4_freeTree, dp_freeTree = pf.dp_freeTree):
[docs] class RMatrix(object): def __init__(self): self.num = -1 self.partNum = None self.free = None self.spec = None self.symbol = None self.val = None self.nNodes = 0
[docs] class Gdasrv(object): def __init__(self): self.num = -1 self.partNum = None self.free = None self.symbol = None # self.val=None self._val = numpy.zeros(1, dtype=numpy.double) self.freqs = None self.rates = None self.nGammaCat = None self.c = None # a p4_gdasrvStruct, if it exists. It is used in calcRates, below. self.nNodes = 0 def _getVal(self): return self._val def _setVal(self, theVal): if theVal < 1.e-16: gm = ["Gdasrv._setVal()"] gm.append("Attempt to set Gdasrv.val (ie alpha) to %g" % theVal) gm.append( "However, we cannot calculate the discrete categories with a value so low.") raise P4Error(gm) self._val[0] = theVal self.calcRates() def _delVal(self): gm = ["Don't/Can't delete this Gdasrv property."] raise P4Error(gm) val = property(_getVal, _setVal, _delVal)
[docs] def calcRates(self): # Use either the p4_gdasrvStruct, or just use the NumPy # array vals (np = NumPy). # print("self.c = %s" % self.c) if self.c: pf.gdasrvCalcRates(self.c) else: pf.gdasrvCalcRates_np(self.nGammaCat, self._val[0], self.freqs, self.rates)
# print('xxx self.rates = %s, val=%s' % (self.rates, self._val[0]))
[docs] class Comp(object): def __init__(self): self.num = -1 self.partNum = None self.free = None self.spec = None self.symbol = None self._val = None self.nNodes = 0 def _getVal(self): return self._val def _setVal(self, theVal): if self._val is None: self._val = numpy.array(theVal) else: #print("Resetting comp val.") assert len(self._val) == len(theVal) for i in range(len(theVal)): self._val[i] = theVal[i] def _delVal(self): gm = ["Don't/Can't delete this Comp property."] raise P4Error(gm) val = property(_getVal, _setVal, _delVal)
[docs] class PInvar(object): def __init__(self): self.num = -1 self.partNum = None self.free = None self.val = None
[docs] class Tree(object): """A phylogenetic tree. **Some instance variables** * ``fName``, if the Tree was read from a file * ``name``, the name of the tree. * ``root``, the root node * ``nodes``, a list of nodes. * ``preOrder`` and ``postOrder``, lists of node numbers * ``recipWeight``, the weight, if it exists, is usually 1/something, so the reciprocal looks nicer ... * ``nexusSets``, if it exists, a NexusSets object. **Properties** * ``taxNames``, a list of names. Usually the order is important! * ``data``, a :class:`Data.Data` object * ``model``, a :class:`Model.Model` object * ``nTax``, the number of taxa * ``nInternalNodes``, the number of non-leaf nodes **The node method** You often will want to refer to a node in a tree. This can be done via its name, or its nodeNum, or as an object, via the method :meth:`~p4.tree.Tree.node`. For example, from a Tree ``t`` you can get node number 3 via:: n = t.node(3) and you can get the node that is the parent of the Mastodon via:: n = t.node('Mastodon').parent For many methods that require specifying a node, the method argument is *nodeSpecifier*, eg:: t.reRoot(23) ``reRoots``'s the tree to node number 23. **Describe, draw, and get information about the tree** .. autosummary:: :nosignatures: ~Tree.dump ~Tree.draw ~Tree.textDrawList ~Tree.tv ~Tree.btv ~Tree.isFullyBifurcating ~Tree.taxSetIsASplit ~Tree.getAllLeafNames ~Tree.getChildrenNums ~Tree.getDegree ~Tree.getLen ~Tree.getNodeNumsAbove ~Tree.getPreAndPostOrderAbove ~Tree.getPreAndPostOrderAboveRoot ~Tree.getSeqNumsAbove ~Tree.subTreeIsFullyBifurcating ~Tree.summarizeModelComponentsNNodes ~Tree.verifyIdentityWith **Write** .. autosummary:: :nosignatures: ~Tree.write ~Tree.writeNewick ~Tree.writeNexus ~Tree.writePhylip ~Tree.tPickle See also Trees methods :meth:`p4.trees.Trees.writeNexus` and :meth:`p4.trees.Trees.writeNewick` for doing trees by the bunch. **Iteration over the nodes** Sometimes you don't want to just iterate over the self.nodes list, because after some manipulations a node might be in self.nodes but not actually in the tree; using these 'iter' methods takes care of that, skipping such nodes. .. autosummary:: :nosignatures: ~Tree.iterInternals ~Tree.iterInternalsNoRoot ~Tree.iterInternalsNoRootPostOrder ~Tree.iterInternalsNoRootPreOrder ~Tree.iterInternalsPostOrder ~Tree.iterLeavesNoRoot ~Tree.iterLeavesPostOrder ~Tree.iterLeavesPreOrder ~Tree.iterNodes ~Tree.iterNodesNoRoot ~Tree.iterPostOrder ~Tree.iterPreOrder ~Tree.nextNode See also :class:`~p4.node.Node` methods that do similar things starting from a given node. **Copy** .. autosummary:: :nosignatures: ~Tree.dupe ~Tree.copyToTree ~Tree.dupeSubTree **In combination with Data and Model** .. autosummary:: :nosignatures: ~Tree.calcLogLike ~Tree.optLogLike ~Tree.simulate ~Tree.getSiteLikes ~Tree.ancestralStateDraw ~Tree.bigXSquaredSubM ~Tree.compStatFromCharFreqs ~Tree.compoTestUsingSimulations ~Tree.modelFitTests ~Tree.modelSanityCheck ~Tree.simsForModelFitTests ~Tree.getEuclideanDistanceFromSelfDataToExpectedComposition **Setting a model** .. autosummary:: :nosignatures: ~Tree.newComp ~Tree.newRMatrix ~Tree.newGdasrv ~Tree.setPInvar ~Tree.setRelRate ~Tree.setModelComponentOnNode ~Tree.setModelComponentsOnNodesRandomly ~Tree.setModelComponentsNNodes ~Tree.summarizeModelComponentsNNodes ~Tree.setNGammaCat ~Tree.setTextDrawSymbol **Tree manipulation** .. autosummary:: :nosignatures: ~Tree.addLeaf ~Tree.addNodeBetweenNodes ~Tree.addSibLeaf ~Tree.addSubTree ~Tree.allBiRootedTrees ~Tree.collapseNode ~Tree.collapseClade ~Tree.ladderize ~Tree.lineUpLeaves ~Tree.nni ~Tree.nni2 ~Tree.pruneSubTreeWithoutParent ~Tree.pruneSubTreeWithParent ~Tree.randomSpr ~Tree.randomizeTopology ~Tree.reRoot ~Tree.reconnectSubTreeWithoutParent ~Tree.reconnectSubTreeWithParent ~Tree.removeEverythingExceptCladeAtNode ~Tree.removeNode ~Tree.removeAboveNode ~Tree.removeRoot ~Tree.renameForPhylip ~Tree.resolve ~Tree.resolvePolytomyAtNode ~Tree.restoreDupeTaxa ~Tree.restoreNamesFromRenameForPhylip ~Tree.rotateAround ~Tree.spr ~Tree.stripBrLens **Misc** .. autosummary:: :nosignatures: ~Tree.checkDupedTaxonNames ~Tree.checkSplitKeys ~Tree.checkTaxNames ~Tree.checkThatAllSelfNodesAreInTheTree ~Tree.inputTreesToSuperTreeDistances ~Tree.makeSplitKeys ~Tree.readBipartitionsFromPaupLogFile ~Tree.recalculateSplitKeysOfNodeFromChildren ~Tree.setNexusSets ~Tree.topologyDistance ~Tree.tvTopologyCompare ~Tree.patristicDistanceMatrix """ def __init__(self): self.fName = None # The name of the file it came from self.name = None self.root = None self.nodes = [] # nodeNums of nodes root -> tips A numpy array self.preOrder = None # nodeNums of nodes tips -> root A numpy array self.postOrder = None self.preAndPostOrderAreValid = 0 # Usually weight is 1/N, so the reciprocal looks nicer self.recipWeight = None # self.weight = None # Only for floating point weights, # so not usually ... # An ordered list. self.taxNames is a property self._taxNames = [] # A Data object. self.data is a property self._data = None self.cTree = None # A pointer to a c-struct self.logLike = None self.partLikes = None # A Model object. self.model is a property self._model = None # self.nTax, a property self._nTax = 0 self._nInternalNodes = -1 # self.nInternalNodes, a property self.doDataPart = 0 self.nexusSets = None self.nodeForSplitKeyDict = None ######################################################### # Properties: data, model ######################################################### ######################################################### # Properties: taxNames, nTax, nInternalNodes ######################################################### @property def taxNames(self): """(property) taxNames""" return self._taxNames @taxNames.setter def taxNames(self, theTaxNames): gm = ['Tree.taxNames'] if not isinstance(theTaxNames, list): gm.append("You can only set property 'taxNames' to a list.") gm.append("Got attempt to set to '%s'" % theTaxNames) raise P4Error(gm) self._taxNames = theTaxNames if theTaxNames: self.checkTaxNames() @taxNames.deleter def taxNames(self): gm = ['Tree.taxNames'] gm.append(" Caught an attempt to delete self.taxNames, but") gm.append("self.taxNames is a property, so you can't delete it.") gm.append("But you can set it to an empty list if you like.") raise P4Error(gm) def _setTaxNamesFromLeaves(self): tax = [] for n in self.iterNodes(): if n.isLeaf and n.name: tax.append(n.name) # This next line should not be needed, as root leaves should be # leaves. # terminal root that has a taxName elif n == self.root and n.name and n.getNChildren() < 2: tax.append(n.name) tax.sort() self._taxNames = tax if self._taxNames: self.checkTaxNames() def _getNTax(self): # We can't rely on len(self.taxNames), cuz it might not exist. # if hasattr(self, '_nTax') and self._nTax: if self._nTax: return self._nTax else: nTax = 0 if self.nodes: for n in self.iterNodes(): if n.isLeaf: nTax += 1 self._nTax = nTax return nTax nTax = property(_getNTax) """(property) nTax""" def _getNInternalNodes(self): if self._nInternalNodes >= 0: return self._nInternalNodes elif not self.nodes: return 0 else: self._nInternalNodes = len([n for n in self.iterInternalsNoRoot()]) if not self.root.isLeaf: self._nInternalNodes += 1 return self._nInternalNodes def _setNInternalNodes(self, theNInternalNodes): gm = ['Tree._setNInternalNodes()'] gm.append("Caught an attempt to set self.nInternalNodes, but") gm.append( "self.nInternalNodes is a property, so you shouldn't do that.") raise P4Error(gm) def _delNInternalNodes(self): self._nInternalNodes = -1 nInternalNodes = property(_getNInternalNodes, _setNInternalNodes, _delNInternalNodes) """(property) nInternalNodes""" ################################################## ##################################################
[docs] def dupe(self): """Duplicates self, but with no c-pointers. And no data object. If there is a model, it is duped. Returns a copy of self. """ storedData = None if self.data: storedData = self.data # We don't want to have to copy a big data object, now do we? self.data = None dupe = copy.deepcopy(self) # print 'Tree.dupe() self.root=%s, dupe.root=%s' % (self.root, # dupe.root) # Delete cPointers for n in dupe.nodes: if n.cNode: n.cNode = None if dupe.cTree: dupe.cTree = None if dupe.model and dupe.model.cModel: dupe.model.cModel = None if storedData: self.data = storedData return dupe
[docs] def parseNexus(self, flob, translationHash=None, doModelComments=0): """Start parsing nexus format newick tree description. From just after the command word 'tree', to the first paren of the Newick part of the tree. Args: flob: an open file or file-like object translationHash (dict): associates short names or numbers with long proper taxon names doModelComments (bool): whether to parse p4-specific model command comments in the tree description Returns: None """ gm = ['Tree.parseNexus()'] # re-defined below if 0: print('Tree.parseNexus() translationHash = %s' % translationHash) print(' doModelComments = %s (nParts)' % doModelComments) print(' flob type is %s' % type(flob)) tok = safeNextTok(flob, 'Tree.parseNexus()') # print 'parseNexus() tok = %s' % tok tok = p4.func.nexusUnquoteName(tok) if tok == '*': print(gm[0]) print(" Ignoring '*' in tree description") tok = safeNextTok(flob, 'Tree.parseNexus()') if not p4.func.nexusCheckName(tok): gm.append("Bad tree name: '%s'" % tok) raise P4Error(gm) self.name = tok # print "got name: '%s'" % tok # print "%s" % tok gm = ["Tree.parseNexus() '%s'" % self.name] # re-defining tok = safeNextTok(flob, gm[0]) if tok != '=': gm.append("Tree name must be followed by '='") raise P4Error(gm) # Generally this is the beginning of the newick tree # description. But we have to look ahead to see if there is a # weight comment. savedPos = flob.tell() while 1: beforeSafeNextTokPosn = flob.tell() #print('mnop var.nexus_getAllCommandComments = %s' % var.nexus_getAllCommandComments) tok = safeNextTok(flob, gm[0]) # skips [&U] if there is one #print("parseNexus: tok after '=' is '%s'. Before pos %i, after pos %i" % ( # tok, beforeSafeNextTokPosn, flob.tell())) # This next bit will only happen if either var.nexus_getWeightCommandComments # or var nexus_getAllCommandComments is set. if tok[0] == '[': self.getWeightCommandComment(tok) elif tok == '(': flob.seek(beforeSafeNextTokPosn) #print("wxy var.nexus_getAllCommandComments is %s" % var.nexus_getAllCommandComments) self.parseNewick(flob, translationHash, doModelComments) # self._initFinish() break elif tok == ';': gm.append("Got ';' before any tree description.") raise P4Error(gm) elif tok[0] in string.ascii_letters + string.digits + '_' + "'": flob.seek(savedPos, 0) self.parseNewick(flob, translationHash, doModelComments) # self._initFinish() break else: gm.append('Expecting a newick tree description.') raise P4Error(gm) self._initFinish()
# print 'finished Tree.parseNexus()'
[docs] def getWeightCommandComment(self, tok): if 0: print('var.nexus_getWeightCommandComments = %s' % var.nexus_getWeightCommandComments) print('var.nexus_getAllCommandComments = %s' % var.nexus_getAllCommandComments) print("Got comment '%s' (type %s), checking if it is a 'weight' comment." % (tok, type(tok))) gm = ["Tree.getWeightCommandComment()"] cFlob = io.StringIO(tok) cFlob.seek(1) # The [ cTok = nextTok(cFlob) if not cTok: # print "no cTok -- returning nothing" return lowCTok = cTok.lower() if lowCTok in ['&r', '&u']: # print "got %s -- returning nothing" % cTok return if lowCTok != '&w': gm.append('Expecting a weight comment. Got %s' % tok) raise P4Error(gm) cTok = nextTok(cFlob) # It might be a float, or the more usual 1/something if ("." in cTok): # A float? try: self.weight = float(cTok) except: gm.append("I can't grok '%s' in weight comment %s" % (cTok, tok)) raise P4Error(gm) # Should check for scientific notation? else: try: theNumerator = int(cTok) if theNumerator != 1: gm.append( 'Expecting a numerator 1 in weight comment %s' % tok) raise P4Error(gm) # print 'got theNumerator %i' % theNumerator except ValueError: gm.append('Expecting a numerator 1 in weight comment %s' % tok) raise P4Error(gm) cTok = nextTok(cFlob) if cTok == '/': cTok = safeNextTok(cFlob, 'Getting weight comment %s' % tok) try: self.recipWeight = int(cTok) except ValueError: gm.append('Bad denominator in weight comment %s' % tok) raise P4Error(gm) elif cTok == ']': # self.recipWeight = theNumerator # ie 1, might as well leave # it as None pass else: gm.append("I can't grok '%s' in weight comment %s" % (cTok, tok)) raise P4Error(gm) cFlob.close()
# print 'got recipWeight = %s' % self.recipWeight # def printStack(self, theStack): # only used for debugging parseNewick() # print 'stack = ', # for n in theStack: # print "%i['%s'] " % (n.nodeNum, n.name), # print ''
[docs] def parseNewick(self, flob, translationHash, doModelComments=0): """Parse Newick tree descriptions. This is stack-based, and does not use recursion. """ if 0: print('parseNewick here. doModelComments=%s' % doModelComments) print(" translationHash=%s, self.taxNames=%s" % (translationHash, self.taxNames)) print(" flob type is %s, pos is %i" % (type(flob), flob.tell())) print("wyy var.nexus_getAllCommandComments is %s" % var.nexus_getAllCommandComments) print(f" var.punctuation is {var.punctuation}") if self.name: gm = ["Tree.parseNewick(), tree '%s'" % self.name] else: gm = ['Tree.parseNewick()'] if hasattr(flob, 'name') and flob.name: self.fName = flob.name gm[0] += ", File %s" % self.fName stack = [] isAfterParen = 1 # to start, even tho its not true isAfterComma = 0 parenNestLevel = 0 lastPopped = None tok = nextTok(flob) # skip over [&U] if there is one. # print("xx Got tok %s" % tok) if not tok: return isQuotedTok = False if tok.startswith("'"): isQuotedTok = True # Should generally be the opening paren, except if its a single-node # tree. tok = p4.func.nexusUnquoteName(tok) if doModelComments: # Turn on var.nexus_getAllCommandComments in order # to be able to read model info on nodes, eg [& c0.1] # restore at end # We need this after the safeNextTok() above, in case there is a [&U] to be ignored. savedP4Nexus_getAllCommandComments = var.nexus_getAllCommandComments var.nexus_getAllCommandComments = 1 while tok != ';': # print("top of loop tok '%s', isQuotedTok=%s, tok[0] is '%s'" % (tok, isQuotedTok, tok[0])) if tok == '(': # print "Got '(': new node (%i)." % len(self.nodes) if not (isAfterParen or isAfterComma): gm.append( 'Got badly-placed paren, not after a paren or comma.') raise P4Error(gm) newNode = Node() if doModelComments: for pNum in range(doModelComments): newNode.parts.append(NodePart()) newNode.br.parts.append(NodeBranchPart()) # self.printStack(stack) if len(stack): newNode.parent = stack[-1] if newNode.parent.leftChild == None: newNode.parent.leftChild = newNode else: newNode.parent.rightmostChild().sibling = newNode else: if len(self.nodes) == 0: self.root = newNode # Sometimes. Generally not true-- corrected at the end. newNode.isLeaf = 1 else: gm.append('Something is wrong. Stack is empty.') gm.append('Extra paren?') raise P4Error(gm) newNode.nodeNum = len(self.nodes) self.nodes.append(newNode) stack.append(newNode) isAfterParen = 1 parenNestLevel += 1 elif tok == ',': if isAfterParen: gm.append('Got comma after paren.') raise P4Error(gm) elif isAfterComma: gm.append('Got comma after comma.') raise P4Error(gm) # self.printStack(stack) try: lastPopped = stack.pop() except IndexError: gm.append('Empty stack. Out of place comma?') raise P4Error(gm) isAfterComma = 1 if len(stack) == 0: gm.append('Empty stack. Out of place comma?') raise P4Error(gm) elif tok == ')': try: lastPopped = stack.pop() except IndexError: gm.append('Empty stack. Out of place unparen?') raise P4Error(gm) isAfterParen = 0 isAfterComma = 0 parenNestLevel = parenNestLevel - 1 if parenNestLevel < 0: gm.append('Unmatched unparen.') raise P4Error(gm) if len(stack) == 0 and len(self.nodes) > 1: gm.append('Empty stack. Out of place unparen?') raise P4Error(gm) elif tok[0] in string.ascii_letters or tok[0] in string.digits or tok[0] in var.nexus_safeChars \ or isQuotedTok or tok[0] in ['_', '#', '\\', '/', '"', '(', ')']: # A single-node tree, not ()aName, rather just aName. # print(f"parseNewick() here ACE tok={tok}") if len(self.nodes) == 0: isAfterParen = 1 if not (isAfterParen or isAfterComma): # Probably a name of an internal node. # print(f"parseNewick() here ACG tok={tok} isAfterParen={isAfterParen} len(stack)={len(stack)}") if len(stack): # print(f"parseNewick() here ACI stack[-1].name={stack[-1].name}") # if stack[-1].isLeaf and stack[-1].name != '(': if stack[-1].name: if not var.newick_allowSpacesInNames: # a second name after a node name, eg (A foo, B) =>foo is bad # or eg (A, B)foo bar => bar is bad gm.append("Badly placed token '%s'." % tok) gm.append("Appears to be a second node name, after '%s'" % stack[-1].name) gm.append('Missing comma maybe? Or punctuation or spaces in an unquoted name?') gm.append("To allow reading Newick (or Nexus) with spaces, ") gm.append("turn var.newick_allowSpacesInNames on") raise P4Error(gm) else: stack[-1].name += ' ' stack[-1].name += tok else: # Usually this... # print("parseNewick() here ACK naming node %i as '%s'" % (stack[-1].nodeNum, tok)) # We allow bad names on internal nodes, ie we do # not nexusCheckName(tok) stack[-1].name = tok # We may have just named the root node, and it may be a leaf, and perhaps should have a translation. # But we do not know that yet. Checked at the end. else: # len(stack) == 0 if lastPopped and lastPopped.name == None: # ()A # print "naming lastPopped node %i with '%s'" % # (lastPopped.nodeNum, tok) lastPopped.isLeaf = 1 #lastPopped.label = tok lastPopped.name = tok else: gm.append("Badly placed token '%s' in tree description." % tok) raise P4Error(gm) else: # A new terminal node. if tok[0] in string.ascii_letters or tok[0] in ['_']: if translationHash and tok in translationHash: # print 'got key %s, val is %s' % (tok, # translationHash[tok]) tok = translationHash[tok] elif tok[0] in string.digits: if var.nexus_allowAllDigitNames: if translationHash and tok in translationHash: # print 'got key %s, val is %s' % (tok, # translationHash[tok]) tok = translationHash[tok] else: try: tok = int(tok) if translationHash and repr(tok) in translationHash: tok = translationHash[repr(tok)] elif translationHash and repr(tok) not in translationHash: gm.append("There is a 'translation' for this tree, but the") gm.append("number '%i' in the tree description" % tok) gm.append('is not included in that translate command.') raise P4Error(gm) elif self.taxNames: try: tok = self.taxNames[tok - 1] except IndexError: gm.append("Can't make sense out of token '%s' for a new terminal node." % tok) gm.append('There is no translate command, and the taxNames does not') gm.append('have a value for that number.') raise P4Error(gm) else: gm.append("We have a taxon name '%s', composed only of numerals." % tok) gm.append(" ") gm.append('The Nexus format allows tree specifications with no') gm.append('translate command to use integers to refer to taxa.') gm.append('That is possible because in a proper Nexus file the') gm.append('taxa are defined before the trees. P4, however, does') gm.append('not require definition of taxa before the trees, and in') gm.append('the present case no definition was made. Deal with it.') raise P4Error(gm) except ValueError: if translationHash and repr(tok) in translationHash: tok = translationHash[repr(tok)] # else: # starts with a digit, but it is not an int. # gm.append('Problem token %s' % tok) # raise P4Error(gm) #print("Got terminal node '%s'" % tok) newNode = Node() if doModelComments: for pNum in range(doModelComments): newNode.parts.append(NodePart()) newNode.br.parts.append(NodeBranchPart()) newNode.isLeaf = 1 if p4.func.nexusCheckName(tok): newNode.name = tok # print 'got newNode.name = %s' % tok else: gm.append("Bad name '%s'" % tok) raise P4Error(gm) if len(stack): newNode.parent = stack[-1] if newNode.parent.leftChild == None: newNode.parent.leftChild = newNode else: newNode.parent.rightmostChild().sibling = newNode newNode.nodeNum = len(self.nodes) if len(self.nodes) == 0: self.root = newNode self.nodes.append(newNode) stack.append(newNode) isAfterParen = 0 isAfterComma = 0 elif tok == ':': # Looking for a br.len number, which might be eg 0.234 or -1.23e-05 # It might be a multi-token operation. Accumulate tok's in # theNum theNum = nextTok(flob) if not theNum: gm.append('Tree description ended with a colon. Bad!') raise P4Error(gm) # print " Got token after colon: '%s'" % theNum if theNum == '-' or theNum == '+': tok = nextTok(flob) # print " Got tok: '%s' after '%s'" % (tok, theNum) if not tok: gm.append('Trying to deal with a branch length.') gm.append("It didn't work, tho.") gm.append("Got this after colon: '%s'" % theNum) gm.append('followed by nothing.') raise P4Error(gm) theNum += tok try: # If it is a simple number like 0.123 or -23, then we are # finished. # Won't work if it ends in 'e' stack[-1].br.len = float(theNum) #print(' Successfully got br.len %f' % stack[-1].br.len) except ValueError: # The first bit after the colon is hopefully something like # +1.23e if theNum[-1] not in ['e', 'E']: gm.append( 'Trying to deal with a branch length after a colon, but I am totally confused.') gm.append("Can't make sense out of '%s'" % theNum) raise P4Error(gm, 'newick_badBranchLength') try: float(theNum[:-1]) except ValueError: gm.append( 'Trying to deal with a branch length after a colon, but I am totally confused.') gm.append("Can't make sense out of '%s'" % theNum) raise P4Error(gm, 'newick_badBranchLength') # Now we are sure that the first bit *is* something like +1.23e # We do not allow spaces after the 'e', so we do not use nextTok(). # That introduces a bug, where comments inserted in the number don't get ignored. <<== unfixed bug! # The first thing must be a '+' or a '-'. c = flob.read(1) if not c: gm.append( 'Trying to deal with a branch length, possibly in scientific notation.') gm.append( "Got '%s' after the colon, but then nothing." % theNum) raise P4Error(gm) if c not in ['+', '-']: gm.append( 'Trying to deal with a branch length, possibly in scientific notation.') gm.append("Got '%s' after the colon." % theNum) gm.append( "Expecting a '+' or '-' after that (no spaces allowed).") gm.append("Got '%s'." % c) raise P4Error(gm) # Accumulate characters in 'theExp'. We need at least one # digit. theExp = c c = flob.read(1) if not c: gm.append('Trying to deal with a branch length, possibly in scientific notation.') gm.append("Got '%s%s' after the colon, but then nothing." % (theNum, theExp)) raise P4Error(gm) if c not in string.digits: gm.append("Trying to deal with a branch length, possibly in scientific notation.") gm.append("Got '%s%s' after the colon." % (theNum, theExp)) gm.append('Expecting one or more digits.') gm.append("Got '%s'" % c) raise P4Error(gm) theExp += c # So we got one good digit. Are there any more? while 1: positionBeforeRead = flob.tell() c = flob.read(1) if not c: gm.append('Trying to deal with a branch length, possibly in scientific notation.') gm.append("Got '%s%s' after the colon, but then nothing." % (theNum, theExp)) raise P4Error(gm) # We got something. If its a digit, add it to # theExp. If its anything else, back up one # space and then break if c in string.digits: theExp += c else: flob.seek(positionBeforeRead) break #print(" At this point, theNum='%s' and theExp='%s'" % (theNum, theExp)) try: # print " Trying to see if theExp '%s' can be # converted to an int." % theExp int(theExp) try: theBrLen = float(theNum + theExp) # print ' Successfully got br.len %g (from %s%s)' # % (theBrLen, theNum, theExp) stack[-1].br.len = theBrLen except ValueError: gm.append('Trying to deal with a branch length, possibly in scientific notation.') gm.append("It didn't work, tho.") gm.append("Got these after colon: '%s' and '%s'" % (theNum, theExp)) gm.append('And they could not be converted to an exponential float.') raise P4Error(gm) except ValueError: gm.append('Trying to deal with a branch length, possibly in scientific notation.') gm.append("It didn't work, tho.") gm.append("Got these after colon: '%s' and '%s'." % (theNum, theExp)) gm.append('And the latter does not appear to be an int.') raise P4Error(gm) elif tok[0] == '[': # print("openSquareBracket. Got tok '%s'" % tok) # if doModelComments is set, it should be set to nParts. if doModelComments: # eg [& c0.1 r0.0] n = stack[-1] # print('got comment %s, node %i' % (tok, n.nodeNum)) cFlob = io.StringIO(tok) cFlob.seek(2) tok2 = safeNextTok(cFlob) while 1: if tok2 == ']': break elif tok2[0] in ['c', 'r', 'g']: ending = tok2[1:] splitEnding = ending.split('.') try: firstNum = int(splitEnding[0]) secondNum = int(splitEnding[1]) except ValueError: gm.append('Bad command comment %s' % tok) raise P4Error(gm) if tok2[0] == 'c': n.parts[firstNum].compNum = secondNum if tok2[0] == 'r': n.br.parts[firstNum].rMatrixNum = secondNum if tok2[0] == 'g': n.br.parts[firstNum].gdasrvNum = secondNum else: gm.append('Bad command comment %s' % tok) raise P4Error(gm) tok2 = safeNextTok(cFlob) elif 0: # Ugly hack for RAxML trees with bootstrap # supports in square brackets after the br len, on # internal nodes. First modify the eg [100] to be # [&100], set var.nexus_getAllCommandComments = # True, and turn on this elif section. myNode = stack[-1] assert not myNode.isLeaf assert not myNode.name mySupportString = tok[2:-1] # print mySupportString myNode.name = mySupportString elif var.nexus_readBeastTreeCommandComments: n = stack[-1] i = 2 while i < (len(tok) - 1): j = i inBraces = False while 1: j += 1 # print tok[j] if tok[j] == ']': break if tok[j] == '{': inBraces = True if inBraces: if tok[j] == '}': inBraces = False else: if tok[j] == ',': break substring = tok[i:j] # print(substring) splSS = substring.split('=') theNameString = splSS[0].strip() for badChr in "%()": if badChr in theNameString: theNameString = theNameString.replace(badChr, '') theValString = splSS[1] if '{' in theValString: theValString = theValString.replace('{', '(') theValString = theValString.replace('}', ')') if theValString == 'true': theVal = True elif theValString == 'false': theVal = False else: theVal = eval(theValString) if 0: # Check type print(f"beast command comment theValString = {theValString}") if isinstance(theVal, (int, float, tuple, bool)): n.__setattr__(theNameString, theVal) else: print(f"reading beastTreeCommandComment; cannot parse value {theValString} for {theNameString}") n.__setattr__(theNameString, theVal) i = j + 1 elif 0: # branch colours, after the branch length, eg '[&!color=#8caaf4]' if tok.startswith("[&!color="): myNode = stack[-1] theColour = tok[9:-1] #print(theColour) myNode.br.colour = theColour else: gm.append("I can't make sense of the token '%s'" % tok) if len(tok) == 1: if tok[0] in var.punctuation: gm.append("The token is in var.punctuation. If you don't think it should") gm.append("be, you can modify what p4 thinks that punctuation is.") gm.append("So you might do this:") gm.append("var.punctuation = var.phylip_punctuation") gm.append("(or use your own definition -- see var.py)") gm.append("read('yourWackyTreeFile.phy')") gm.append("That might work.") if tok[0] not in var.punctuation: gm.append("The token is not in your current var.punctuation.") #gm.append("tok[0] is '%s'" % tok[0]) raise P4Error(gm) sTok = safeNextTok(flob, 'Tree init, reading tree string') if sTok.startswith("'"): isQuotedTok = True else: isQuotedTok = False tok = p4.func.nexusUnquoteName(sTok) #print('got tok for next round = %s' % tok) # This is the end of the "while tok != ';':" loop # print '\n*** Stack len = %i ***' % len(stack) if parenNestLevel > 0: gm.append('Unmatched paren.') raise P4Error(gm) elif parenNestLevel < 0: gm.append('Unmatched unparen.') raise P4Error(gm) if len(stack) == 0: if len(self.nodes) == 1: pass else: gm.append( "Got an oddly-placed ';' in the tree %s description." % self.name) self.dump(treeInfo=0, nodeInfo=1) raise P4Error(gm) elif len(stack) > 1: gm.append( "Got an oddly-placed ';' in the tree %s description." % self.name) # gm.append('(stack len = %i)' % len(stack) #self.dump(tree=0, node=1) raise P4Error(gm) if self.root.leftChild and self.root.leftChild.sibling: # usually this self.root.isLeaf = 0 else: # The root is a leaf assert self.root.isLeaf if self.root.name: if translationHash and self.root.name in translationHash: self.root.name = translationHash[self.root.name] # Should a root on a stick be a leaf? If it is just for # display purposes, then it should be ok to not be a leaf. # But if you are going to re-Root, then it will cause trouble. # So by default, a root on a stick should be a leaf. I think. # Hopefully if you are dealing with this, then you know what # you are doing and what you want, and how to modify things to # get it. # Uncomment this next line to make it always non-leaf, even if it is a # leaf. # self.root.isLeaf = 0 # do this to always have a non-leaf # root-on-a-stick <-- Potential Trouble!!! self.root.br = None # self.draw() #self.dump(tree=0, node=1, treeModel=0) if doModelComments: # restore the value of var.nexus_getAllCommandComments, which was # saved above. var.nexus_getAllCommandComments = savedP4Nexus_getAllCommandComments
def _initFinish(self): if self.name: gm = ["Tree._initFinish() tree '%s'" % self.name] else: gm = ['Tree._initFinish()'] # Checking for duped taxon names used to be here, but it was # pushed further ahead, so that self.taxNames can be corrected # also, if need be. At this point, self does not have # taxNames set. # Check that all terminal nodes have names for item in self.nodes: if item.isLeaf: # print 'leaf name %s' % item.name if not item.name: if item == self.root: if var.warnAboutTerminalRootWithNoName: print('Tree._initFinish()') print(' Non-fatal warning: the root is terminal, but has no name.') print(' This may be what you wanted. Or not?') print(' (To get rid of this warning, turn off var.warnAboutTerminalRootWithNoName)') else: gm.append('Got a terminal node with no name.') raise P4Error(gm) self.preOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) self.postOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) if len(self.nodes) > 1: self.setPreAndPostOrder()
[docs] def checkDupedTaxonNames(self): # Called by p4.func._tryToReadNexusFile() and p4.func._tryToReadPhylipFile() # Check for duped names if self.name: gm = ["Tree.checkDupedTaxonNames() tree '%s'" % self.name] else: gm = ['Tree.checkDupedTaxonNames()'] if self.fName: gm[0] += ' file=%s' % self.fName hasDupedName = 0 loNames = [] for n in self.nodes: if n.isLeaf and n.name: loNames.append(n.name.lower()) for loName in loNames: if loNames.count(loName) > 1: if var.allowDupedTaxonNames: pass elif not var.doRepairDupedTaxonNames: gm.append( "Got duplicated taxon (lowercased) name '%s'." % loName) gm.append( 'Since var.doRepairDupedTaxonNames is not turned on, p4 will not fix duplications.') gm.append('To repair duplications verbosely, set ') gm.append('var.doRepairDupedTaxonNames = 1') gm.append('To repair duplications silently, set') gm.append('var.doRepairDupedTaxonNames = 2') raise P4Error(gm) hasDupedName = 1 break if hasDupedName: # print self.name if var.allowDupedTaxonNames: # more hacking ... if var.allowDupedTaxonNames == 2: # ie silently. pass else: complainedAlready = [] for loName in loNames: if loNames.count(loName) > 1 and loName not in complainedAlready: if self.name: print(" Tree %s. Duped tax name (lowercased) '%s'" % ( self.name, loName)) else: print(" Duped tax name (lowercased) '%s'" % loName) complainedAlready.append(loName) elif var.doRepairDupedTaxonNames: repairedNames = [] for loName in loNames: if loNames.count(loName) > 1 and n not in repairedNames: repairCounter = 1 repairCounter2 = 1 for n in self.nodes: if n.isLeaf: if n.name and n.name.lower() == loName: newName = '%s_%i' % (n.name, repairCounter) if var.doRepairDupedTaxonNames == 1: if self.name: print(" Tree %s. Changing '%s' to '%s'" % ( self.name, n.name, newName)) else: print(" Changing '%s' to '%s'" % (n.name, newName)) n.name = newName repairedNames.append(loName) repairCounter += 1 if self.taxNames: for tNameNum in range(len(self.taxNames)): tName = self.taxNames[tNameNum] if tName.lower() == loName: newName = '%s_%i' % (tName, repairCounter2) self.taxNames[tNameNum] = newName repairCounter2 += 1 assert repairCounter == repairCounter2, "Got a problem with re-naming duped taxa."
############## ############## # # dump() # ############## ##############
[docs] def dump(self, tree=0, node=0, model=0, all=0): """Print rubbish about self. tree is the default, showing basic info about the tree. node shows info about all the nodes. model shows which model component number goes on which node. (which you can also get by drawing the tree) (If you want the info about the model itself, do a aTree.model.dump() instead.) """ if all: self._doTreeInfo() self._doNodeInfo() self._doNodeModelInfo() elif not tree and not node and not model: self._doTreeInfo() else: if tree: self._doTreeInfo() if node: self._doNodeInfo() if model: self._doNodeModelInfo()
def _doTreeInfo(self): if self.name: print("Tree '%s' dump" % self.name) else: print('Tree dump. No name.') if self.fName: print(" From file '%s'" % self.fName) else: print(" From an unknown file, or no file.") if self.root: print(' Node %i is root' % self.root.nodeNum) else: print(' There is no root') if self.recipWeight: print(' The tree recipWeight is %s' % self.recipWeight) else: print(' There is no recipWeight') print(' There are %i nodes' % len(self.nodes)) terminals = 0 for i in self.nodes: if i.isLeaf: terminals += 1 print(' of which %i are terminal nodes' % terminals) if self.data: print(' There is a data object, with %i parts.' % self.data.nParts) else: print(' There is no data object.') if self.data: print(' The data came from file(s):') for a in self.data.alignments: if a.fName: print(' %s' % a.fName) if self.model: print(' There is a model object, with %i parts.' % self.model.nParts) if self.model.cModel: print(' model.cModel is %i' % self.model.cModel) else: print(' There is no cModel.') else: print(' There is no model object.') if self.taxNames: print(' There is a taxNames list.') else: print(' There is no taxNames list.') if self.cTree: print(' cTree is %i' % self.cTree) else: print(' There is no cTree.')
[docs] def _doNodeInfo(self): """Basic rubbish about nodes.""" print('\n-------- nodes -----------------------------------------') print('%7s %6s %6s %6s %6s %7s %6s %4s' % ('nodeNum', 'isLeaf', 'parent', 'leftCh', 'siblng', 'br.len', 'seqNum', 'name')) for n in self.nodes: print('%7s %6s' % (n.nodeNum, n.isLeaf), end=' ') if n.parent: print('%6s' % n.parent.nodeNum, end=' ') else: print('%6s' % 'None', end=' ') if n.leftChild: print('%6s' % n.leftChild.nodeNum, end=' ') else: print('%6s' % 'None', end=' ') if n.sibling: print('%6s' % n.sibling.nodeNum, end=' ') else: print('%6s' % 'None', end=' ') if n.br and (n.br.len or n.br.len == 0.0): print('%7.3f' % n.br.len, end=' ') else: print('%7s' % 'None', end=' ') if n.seqNum or n.seqNum == 0: print('%6s' % n.seqNum, end=' ') else: print('%6s' % 'None', end=' ') if n.name: print(' %s' % n.name) else: print(' %s' % 'None') print('--------------------------------------------------------\n') doMore = 0 for n in self.iterNodesNoRoot(): if hasattr(n.br, 'name') and n.br.name: doMore = 1 break elif hasattr(n.br, 'uName') and n.br.uName: doMore = 1 break elif hasattr(n.br, 'support') and n.br.support: doMore = 1 break if doMore: print('\n-------- more node stuff -------------------------------') print('%7s %10s %10s %10s %4s' % ('nodeNum', 'br.name', 'br.uName', 'br.support', 'name')) for n in self.nodes: print('%7s' % n.nodeNum, end=' ') if n.br and hasattr(n.br, 'name') and n.br.name: print('%10s' % n.br.name, end=' ') else: print('%10s' % '-', end=' ') if n.br and hasattr(n.br, 'uName') and n.br.uName: print('%10s' % n.br.uName, end=' ') else: print('%10s' % '-', end=' ') if n.br and hasattr(n.br, 'support') and n.br.support: print('%10.4f' % n.br.support, end=' ') else: print('%10s' % '-', end=' ') if n.name: print(' %s' % n.name) else: print(' %s' % 'None') print('--------------------------------------------------------\n') doMore = 0 for n in self.nodes: if hasattr(n, 'rootCount') and n.rootCount: doMore = 1 break if n.br: if hasattr(n.br, 'color') and n.br.color: doMore = 1 break elif hasattr(n.br, 'biRootCount') and n.br.biRootCount: doMore = 1 break if doMore: print('\n-------- even more node stuff --------------------------') print('%7s %10s %14s %10s %4s' % ('nodeNum', 'br.color', 'br.biRootCount', 'rootCount', 'name')) for n in self.nodes: print('%7s' % n.nodeNum, end=' ') if n.br and hasattr(n.br, 'color') and n.br.color: print('%10s' % n.br.color, end=' ') else: print('%10s' % '-', end=' ') if n.br and hasattr(n.br, 'biRootCount') and n.br.biRootCount: print('%14s' % n.br.biRootCount, end=' ') else: print('%14s' % '-', end=' ') if hasattr(n, 'rootCount') and n.rootCount: print('%10s' % n.rootCount, end=' ') else: print('%10s' % '-', end=' ') if n.name: print(' %s' % n.name) else: print(' %s' % 'None') print('--------------------------------------------------------\n')
def _doNodeModelInfo(self): if not self.model: print('\n****** Node Model Info. No model.') if not self.data: print('(no data attached, either)') else: print('\n****** Node Model Info. nParts=%s' % self.model.nParts) if not self.data: print('no data') if not self.model.nParts: return print('\nComps in the nodes:') print(' %13s' % 'nodeNum', end=' ') for i in range(self.model.nParts): print(' %8s' % 'part%i' % i, end=' ') print('') for n in self.nodes: print(' %13i' % n.nodeNum, end=' ') for i in range(self.model.nParts): print('%8i' % n.parts[i].compNum, end=' ') print('') print('\nrMatrices in the nodes:') print(' %13s' % 'nodeNum', end=' ') for i in range(self.model.nParts): print(' %8s' % 'part%i' % i, end=' ') print('') for n in self.iterNodesNoRoot(): print(' %13i' % n.nodeNum, end=' ') for i in range(self.model.nParts): print('%8i' % n.br.parts[i].rMatrixNum, end=' ') print('') print('\ngdasrvs in the nodes:') print(' %13s' % '', end=' ') for i in range(self.model.nParts): print(' %8s' % 'part%i' % i, end=' ') print('') print(' %13s' % 'nGammaCats ->', end=' ') for i in range(self.model.nParts): print('%8i' % self.model.parts[i].nGammaCat, end=' ') print('\n') print(' %13s' % 'nodeNum', end=' ') for i in range(self.model.nParts): print(' %8s' % 'part%i' % i, end=' ') print('') for n in self.iterNodesNoRoot(): print(' %13i' % n.nodeNum, end=' ') for i in range(self.model.nParts): print('%8i' % n.br.parts[i].gdasrvNum, end=' ') print('') ########################### # # Set a NexusSets object, for taxSets ... # ###########################
[docs] def setNexusSets(self): """Set self.nexusSets from var.nexusSets. A deepcopy is made of var.nexusSets, only if it exists. If var.nexusSets does not yet exist, a new blank one is not made (cf this method in Alignment class, where it would be made). Important! This method depends on a correct taxNames. """ assert self.taxNames, "This method requires correct taxNames, in the correct order." gm = ["Tree.setNexusSets()"] if not var.nexusSets: return self.nexusSets = copy.deepcopy(var.nexusSets) self.nexusSets.taxNames = self.taxNames self.nexusSets.nTax = self.nTax self.nexusSets.constant = None self.nexusSets.gapped = None self.nexusSets.charSets = [] self.nexusSets.charPartitions = [] if self.nexusSets.taxSets: # print "%s. There are %i taxSets." % (gm[0], len(self.nexusSets.taxSets)) # Check that no taxSet name is a taxName lowSelfTaxNames = [txName.lower() for txName in self.taxNames] for ts in self.nexusSets.taxSets: if ts.lowName in lowSelfTaxNames: gm.append( "Can't have taxSet names that are the same (case-insensitive) as a tax name") gm.append( "Lowercased taxSet name '%s' is the same as a lowcased taxName." % ts.name) raise P4Error(gm) self.nexusSets.lowTaxNames = lowSelfTaxNames # If it is standard format, # convert triplets to numberTriplets, and then mask for ts in self.nexusSets.taxSets: if ts.format == 'standard': ts.setNumberTriplets() ts.setMask() elif ts.format == 'vector': assert ts.mask if len(ts.mask) != self.nTax: gm.append("taxSet %s" % ts.name) gm.append( "It is vector format, but the length is wrong.") gm.append( "taxSet mask is length %i, but self nTax is %i" % (len(ts.mask), self.nTax)) raise P4Error(gm) else: gm.append("taxSet %s" % ts.name) gm.append("unknown format %s" % ts.format) raise P4Error(gm) # Now set ts.taxNames from the mask ts.taxNames = [self.taxNames[i] for i,c in enumerate(ts.mask) if c == '1']
########################### # # Get lists of nodeNums ... # ###########################
[docs] def getNodeNumsAbove(self, nodeSpecifier, leavesOnly=0): """Gets a list of nodeNums, in postOrder, above nodeSpecifier. The node specified is not included. """ x, y = self.getPreAndPostOrderAbove(nodeSpecifier) if leavesOnly: tOnly = [] for i in y[:-1]: if self.nodes[i].isLeaf: tOnly.append(i) return tOnly return y[:-1]
[docs] def getSeqNumsAbove(self, nodeSpecifier): """Gets a list of seqNums above nodeSpecifier.""" x, y = self.getPreAndPostOrderAbove(nodeSpecifier) seqNums = [] for nNum in y[:-1]: n = self.nodes[nNum] if n.isLeaf: seqNums.append(n.seqNum) return seqNums
# def getAllChildrenNums(self, specifier): # """Returns a list of the nodeNums of all children of the specified node # Ambiguous, unused. # """ # theNode = self.node(specifier) # if not theNode: # gm = ['Tree.getChildrenNums()'] # gm.append('Bad node specifier') # raise P4Error(gm) # ret = [] # x, y = self.getPreAndPostOrderAbove(specifier) # for i in y[:-1]: # ret.append(self.nodes[i].nodeNum) # return ret
[docs] def getAllLeafNames(self, specifier): """Returns a list of the leaf names of all children """ theNode = self.node(specifier) if not theNode: gm = ['Tree.getChildrenNums()'] gm.append('Bad node specifier') raise P4Error(gm) ret = [] x, y = self.getPreAndPostOrderAbove(specifier) for i in y[:-1]: if self.nodes[i].isLeaf: ret.append(self.nodes[i].name) return ret
[docs] def getChildrenNums(self, specifier): """Returns a list of nodeNums of children of the specified node. See also Node.getNChildren()""" theNode = self.node(specifier) if not theNode: gm = ['Tree.getChildrenNums()'] gm.append('Bad node specifier') raise P4Error(gm) ret = [] c = theNode.leftChild while 1: if c: ret.append(c.nodeNum) c = c.sibling else: return ret
[docs] def setPreAndPostOrder(self): """Sets or re-sets self.preOrder and self.postOrder lists of node numbers. PreOrder starts from the root and goes to the tips; postOrder starts from the tips and goes to the root.""" self.getPreAndPostOrderAboveRoot() self.preAndPostOrderAreValid = 1
[docs] def getPreAndPostOrderAbove(self, nodeSpecifier): """Returns 2 lists of node numbers, preOrder and postOrder. This uses a stack, not recursion, so it should work for large trees without bumping into the recursion limit. The 2 lists are relative to the node specified, and include the node specified. PreOrder starts from theNode and goes to the tips; postOrder starts from the tips and goes to theNode.""" gm = ['Tree.getPreAndPostOrderAbove()'] theNode = self.node(nodeSpecifier) preOrder = [] # nodeNum's postOrder = [] if not theNode.leftChild: preOrder.append(theNode.nodeNum) postOrder.append(theNode.nodeNum) return preOrder, postOrder stack = [] # nodes stack.append(theNode) preOrder.append(theNode.nodeNum) while len(stack): if stack[-1].leftChild: # print 'leftChild: %i' % stack[-1].leftChild.nodeNum theNodeNum = stack[-1].leftChild.nodeNum stack.append(stack[-1].leftChild) preOrder.append(theNodeNum) elif stack[-1].sibling: # print 'sibling: %i' % stack[-1].sibling.nodeNum theNodeNum = stack[-1].sibling.nodeNum theSib = stack[-1].sibling # print ' postOrder appending u %i' % # stack[-1].nodeNum postOrder.append(stack[-1].nodeNum) stack.pop() stack.append(theSib) preOrder.append(theNodeNum) else: # print ' postOrder appending v %i' % # stack[-1].nodeNum postOrder.append(stack[-1].nodeNum) stack.pop() if len(stack) == 0: break # print ' postOrder appending w %i' % # stack[-1].nodeNum postOrder.append(stack[-1].nodeNum) theNode = stack.pop() while not theNode.sibling: if len(stack) == 0: break # print ' postOrder appending x %i' % # stack[-1].nodeNum postOrder.append(stack[-1].nodeNum) theNode = stack.pop() if len(stack) == 0: break if theNode.sibling: stack.append(theNode.sibling) preOrder.append(theNode.sibling.nodeNum) else: gm.append('Problemo.') gm.append('xxx got theNode %s' % theNode.nodeNum) raise P4Error(gm) return preOrder, postOrder
[docs] def getPreAndPostOrderAboveRoot(self): """Sets self.preOrder and self.postOrder. This uses a stack, not recursion, so it should work for large trees without bumping into the recursion limit. PreOrder starts from the root and goes to the tips; postOrder starts from the tips and goes to the root.""" gm = ['Tree.getPreAndPostOrderAboveRoot()'] theNode = self.root preOrdIndx = 0 postOrdIndx = 0 if self.preOrder is None or self.postOrder is None or len(self.preOrder) != len(self.nodes) or len(self.postOrder) != len(self.nodes): self.preOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) self.postOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) # print "xx self.preOrder=%s" % self.preOrder if not theNode.leftChild: self.preOrder[preOrdIndx] = theNode.nodeNum preOrdIndx += 1 self.postOrder[postOrdIndx] = theNode.nodeNum postOrdIndx += 1 else: stack = [] # nodes stack.append(theNode) self.preOrder[preOrdIndx] = theNode.nodeNum preOrdIndx += 1 while len(stack): if stack[-1].leftChild: # print 'leftChild: %i (%s)' % # (stack[-1].leftChild.nodeNum, [n.nodeNum for n in stack]) theNodeNum = stack[-1].leftChild.nodeNum stack.append(stack[-1].leftChild) self.preOrder[preOrdIndx] = theNodeNum preOrdIndx += 1 elif stack[-1].sibling: # print 'sibling: %i (%s)' % (stack[-1].sibling.nodeNum, # [n.nodeNum for n in stack]) theNodeNum = stack[-1].sibling.nodeNum theSib = stack[-1].sibling # print ' postOrder appending u %i' % # stack[-1].nodeNum self.postOrder[postOrdIndx] = stack[-1].nodeNum postOrdIndx += 1 stack.pop() stack.append(theSib) try: self.preOrder[preOrdIndx] = theNodeNum except IndexError: gm.append("preOrdIndx=%s, theNodeNum=%i" % (preOrdIndx, theNodeNum)) gm.append("preOrder = %s" % self.preOrder) raise P4Error(gm) preOrdIndx += 1 else: # print ' postOrder appending v %i' % # stack[-1].nodeNum self.postOrder[postOrdIndx] = stack[-1].nodeNum postOrdIndx += 1 stack.pop() if len(stack) == 0: break # print ' postOrder appending w %i' % # stack[-1].nodeNum self.postOrder[postOrdIndx] = stack[-1].nodeNum postOrdIndx += 1 theNode = stack.pop() while not theNode.sibling: if len(stack) == 0: break # print ' postOrder appending x %i' % # stack[-1].nodeNum self.postOrder[postOrdIndx] = stack[-1].nodeNum postOrdIndx += 1 theNode = stack.pop() if len(stack) == 0: break if theNode.sibling: stack.append(theNode.sibling) # print "self.preOrder = %s, preOrdIndx=%i" % # (self.preOrder, preOrdIndx) self.preOrder[preOrdIndx] = theNode.sibling.nodeNum preOrdIndx += 1 else: gm.append('Problemo.') gm.append('xxx got theNode %s' % theNode.nodeNum) raise P4Error(gm) if 1: assert preOrdIndx == postOrdIndx # print "a preOrdIndx = %i, len(self.nodes) = %i" % (preOrdIndx, # len(self.nodes)) if preOrdIndx != len(self.nodes): pOI = preOrdIndx for i in range(preOrdIndx, len(self.nodes)): self.preOrder[i] = var.NO_ORDER self.postOrder[i] = var.NO_ORDER preOrdIndx += 1 postOrdIndx += 1 # print "b preOrdIndx = %i, len(self.nodes) = %i" % (preOrdIndx, # len(self.nodes)) assert preOrdIndx == len(self.nodes) and postOrdIndx == len(self.nodes)
[docs] def iterPreOrder(self): """Node generator. Assumes preAndPostOrderAreValid.""" for i in self.preOrder: j = int(i) # this speeds things up a lot! Two-fold! if j != var.NO_ORDER: # yield self.nodes[int(i)] yield self.nodes[j]
[docs] def iterPostOrder(self): """Node generator. Assumes preAndPostOrderAreValid.""" for i in self.postOrder: j = int(i) if j != var.NO_ORDER: yield self.nodes[j]
[docs] def iterNodes(self): """Node generator, in preOrder. Assumes preAndPostOrderAreValid.""" for i in self.preOrder: j = int(i) if j != var.NO_ORDER: yield self.nodes[j]
[docs] def iterNodesNoRoot(self): """Node generator, skipping the root. PreOrder.""" for i in self.preOrder: j = int(i) if j != var.NO_ORDER: n = self.nodes[j] if n != self.root: yield n
[docs] def iterLeavesNoRoot(self): """Leaf node generator, skipping the root. PreOrder.""" for i in self.preOrder: j = int(i) if j != var.NO_ORDER: n = self.nodes[j] if n != self.root and n.isLeaf: yield n
[docs] def iterLeavesPreOrder(self): for i in self.preOrder: j = int(i) if j != var.NO_ORDER: n = self.nodes[j] if n.isLeaf: yield n
[docs] def iterLeavesPostOrder(self): for i in self.postOrder: j = int(i) if j != var.NO_ORDER: n = self.nodes[j] if n.isLeaf: yield n
[docs] def iterInternalsNoRootPreOrder(self): """Internal post order node generator, skipping the root. Assumes preAndPostOrderAreValid.""" for i in self.preOrder: j = int(i) if j != var.NO_ORDER: n = self.nodes[j] if n != self.root and not n.isLeaf: yield n
[docs] def iterInternalsNoRootPostOrder(self): """Internal post order node generator, skipping the root. Assumes preAndPostOrderAreValid.""" for i in self.postOrder: j = int(i) if j != var.NO_ORDER: n = self.nodes[j] if n != self.root and not n.isLeaf: yield n
[docs] def iterInternalsPostOrder(self): """Internal post order node generator. Assumes preAndPostOrderAreValid.""" for i in self.postOrder: j = int(i) if j != var.NO_ORDER: n = self.nodes[j] if not n.isLeaf: yield n
[docs] def iterInternalsNoRoot(self): """Internal node generator, skipping the root. PreOrder""" for i in self.preOrder: j = int(i) if j != var.NO_ORDER: n = self.nodes[j] if n != self.root and not n.isLeaf: yield n
[docs] def iterInternals(self): """Internal node generator. PreOrder. Including the root, if it is internal.""" for i in self.preOrder: j = int(i) if j != var.NO_ORDER: n = self.nodes[j] if not n.isLeaf: yield n
################################################ #
[docs] def stripBrLens(self): """Sets all node.br.len's to 0.1, the default in p4. Then, if you were to write it out in Nexus or Newick format, no branch lengths would be printed. """ for n in self.iterNodesNoRoot(): n.br.len = 0.1
[docs] def getLen(self): """Return the sum of all br.len's.""" if self.cTree: # About 0.01 msec for a tree with 1000 leaves return pf.p4_getTreeLen(self.cTree) else: # About 2 msec for a tree with 1000 leaves. total = 0.0 for n in self.iterNodesNoRoot(): total += n.br.len return total
# def lenInternals(self): # """Return the sum of all internal br.len's.""" # total = 0.0 # for n in self.iterInternalsNoRoot(): # total += n.br.len # return total # def stemminess(self): # """Ratio of internal branches to overall tree length. # Also called 'treeness'. Via Phillips and Penny, MPE 2003, but # they cite Lanyon 1988.""" # total = self.len() # internals = self.lenInternals() # return internals/total
[docs] def _makeRCSplitKeys(self, splitList=None): """Make long integer-valued split keys. This is dependent on that the leaf bitkeys are already set, ie this method should only be used by the reduced consensus supertree class """ if not self.preAndPostOrderAreValid: self.setPreAndPostOrder() # self.draw() # allOnes = 2**(self.nTax) - 1 # print 'nTax = %i, allOnes = %i' % (self.nTax, allOnes) # for n in self.iterInternalsPostOrder(): for n in self.iterInternalsNoRootPostOrder(): # if n != self.root: # print 'doing node %s' % n.nodeNum # if not n.isLeaf(): if n.leftChild: childrenNums = self.getChildrenNums(n) # x = childrenNums[0] x = self.nodes[childrenNums[0]].br.rc for i in childrenNums[1:]: x = x | self.nodes[i].br.rc # x = x | i n.br.rc = x if splitList != None: splitList.append([x, 0]) n.br.rcList = [n.br.rc]
[docs] def makeSplitKeys(self, makeNodeForSplitKeyDict=True): """Make long integer-valued split keys. This needs to have self.taxNames set. We make 2 kinds of split keys-- rawSplitKeys and splitKeys. Both are attributes of node.br, so we have eg node.br.splitKey. Raw split keys for terminal nodes are 2**n, where n is the index of the taxon name. Eg for the first taxon, the rawSplitKey will be 1, for the 3rd taxon the rawSplitKey will be 4. RawSplitKeys for internal nodes are the rawSplitKey's for the children, bitwise OR'ed together. SplitKeys, cf rawSplitKeys, are in 'standard form', where the numbers are even, ie do not contain the 1-bit. Having it in standard form means that you can compare splits among trees. If the rawSplitKey is even, then the splitKey is simply that, unchanged. If, however, the rawSplitKey is odd, then the splitKey is the rawSplitKey bit-flipped. For example, if there are 5 taxa, and one of the rawSplitKeys is 9 (odd), we can calculate the splitKey by bit-flipping, as:: 01001 = 9 rawSplitKey 10110 = 22 splitKey (Bit-flipping is done by exclusive-or'ing (xor) with 11111.) The splitKey is readily converted to a splitString for display, as 22 becomes '.**.*' (note the '1' bit is now on the left). It is conventional that the first taxon, on the left, is always a dot. (I don't know where the convention comes from.) The root has no rawSplitKey or splitKey. For example, the tree:: +-------2:B (rawSplitKey = 2) +---1 | +---------3:C (rawSplitKey = 4) | 0-------------4:E (rawSplitKey = 16) | | +-----6:A (rawSplitKey = 1) +----5 +-----------7:D (rawSplitKey = 8) has 2 internal splits, on nodes 1 and 5. :: Node n.br.rawSplitKey n.br.splitKey 1 6 6 5 9 22 There should be no duplicated rawSplitKeys, but if the tree has a bifurcating root then there will be a duped splitKey. This method will fail for trees with internal nodes that have only one child, because that will make duplicated splits. If arg *makeNodeForSplitKeyDict* is set, then it will make a dictionary ``nodeForSplitKeyDict`` where the keys are the splitKeys and the values are the corresponding nodes. """ gm = ['Tree.makeSplitKeys()'] #raise P4Error(gm) if not self.taxNames: gm.append('No taxNames.') raise P4Error(gm) if not self.preAndPostOrderAreValid: self.setPreAndPostOrder() # self.draw() if makeNodeForSplitKeyDict: self.nodeForSplitKeyDict = {} allOnes = 2 ** (self.nTax) - 1 # print 'nTax = %i, allOnes = %i' % (self.nTax, allOnes) for n in self.iterPostOrder(): if n != self.root: # print 'doing node %s' % n.nodeNum if not n.leftChild: try: n.br.rawSplitKey = 1 << self.taxNames.index( n.name) # "<<" is left-shift except ValueError: gm.append('node.name %s' % n.name) gm.append('is not in taxNames %s' % self.taxNames) raise P4Error(gm) # n.br.rawSplitKey = 1 << self.taxNames.index(n.name) # # "<<" is left-shift if n.br.rawSplitKey == 1: n.br.splitKey = allOnes - 1 else: n.br.splitKey = n.br.rawSplitKey # print 'upward leaf node %s, rawSplitKey %s, splitKey # %s' % (n.nodeNum, n.br.rawSplitKey, n.br.splitKey) else: childrenNums = self.getChildrenNums(n) if len(childrenNums) == 1: gm.append( 'Internal node has only one child. That will make a duped split.') raise P4Error(gm) x = self.nodes[childrenNums[0]].br.rawSplitKey for i in childrenNums[1:]: y = self.nodes[i].br.rawSplitKey x = x | y # '|' is bitwise "OR". n.br.rawSplitKey = x # Make node.br.splitKey's in "standard form", ie # without the first taxon, ie without a 1. To do that # we use the '&' operator, to bitwise "and" with 1. # Ie "Does rawSplitKey contain a 1?" or "Is rawSplitKey # odd?" if 1 & n.br.rawSplitKey: # "^" is xor, a bit-flipper. n.br.splitKey = allOnes ^ n.br.rawSplitKey else: n.br.splitKey = n.br.rawSplitKey # print 'intern node %s, rawSplitKey %s, splitKey %s' % # (n.nodeNum, n.br.rawSplitKey, n.br.splitKey) if makeNodeForSplitKeyDict: self.nodeForSplitKeyDict[n.br.splitKey] = n if 0: # There should be no duped rawSplitKeys theKeys = [] for n in self.iterNodesNoRoot(): theKeys.append(n.br.rawSplitKey) for k in theKeys: if theKeys.count(k) > 1: gm.append('Duped rawSplitKey %i.' % k) for n in self.nodes: if n != self.root: print('%7s %4s %4s' % (n.nodeNum, n.br.rawSplitKey, n.br.splitKey)) raise P4Error(gm) # Any duped splitKeys? There will be if the tree is bi-Rooted. if 0: theKeys = [] for n in self.iterNodesNoRoot(): theKeys.append(n.br.splitKey) for k in theKeys: if theKeys.count(k) > 1: gm.append('Duped splitKey %i.' % k) for n in self.iterNodesNoRoot(): print('%7s %4s %4s' % (n.nodeNum, n.br.rawSplitKey, n.br.splitKey)) raise P4Error(gm) if 0: print(gm[0]) print(self.taxNames) print(f"root is node {self.root.nodeNum}") print('nodeNum rawSplitKey splitKey') # getSplitsFromKey would usually use self.nTax. # Changed below to accommodate tempRoot for bi-rooted trees. for n in self.iterNodesNoRoot(): print('%7s %4s %4s %s' % ( n.nodeNum, n.br.rawSplitKey, n.br.splitKey, p4.func.getSplitStringFromKey(n.br.splitKey, len(self.taxNames))))
[docs] def recalculateSplitKeysOfNodeFromChildren(self, aNode, allOnes): children = [n for n in aNode.iterChildren()] x = children[0].br.rawSplitKey for n in children[1:]: x = x | n.br.rawSplitKey # '|' is bitwise "OR". aNode.br.rawSplitKey = x # Ie "Does rawSplitKey contain a 1?" or "Is rawSplitKey odd?" if 1 & aNode.br.rawSplitKey: # "^" is xor, a bit-flipper. aNode.br.splitKey = allOnes ^ aNode.br.rawSplitKey else: aNode.br.splitKey = aNode.br.rawSplitKey
[docs] def checkSplitKeys(self, useOldName=False, glitch=True, verbose=True): gm = ['Tree.checkSplitKeys()'] allOnes = 2 ** (self.nTax) - 1 # print 'nTax = %i, allOnes = %i' % (self.nTax, allOnes) isBad = False for n in self.iterPostOrder(): if n != self.root: # print 'doing node %s' % n.nodeNum if not n.leftChild: # A long int, eg 1, has no upper limit on its value try: if useOldName: rawSplitKey = 1 << self.taxNames.index( n.oldName) # "<<" is left-shift else: rawSplitKey = 1 << self.taxNames.index( n.name) # "<<" is left-shift except ValueError: if useOldName: gm.append('node.name %s' % n.oldName) else: gm.append('node.name %s' % n.name) gm.append('is not in taxNames %s' % self.taxNames) raise P4Error(gm) # n.br.rawSplitKey = 1 << self.taxNames.index(n.name) # # "<<" is left-shift if rawSplitKey == 1: splitKey = allOnes - 1 else: splitKey = rawSplitKey # print 'upward leaf node %s, rawSplitKey %s, splitKey # %s' % (n.nodeNum, n.br.rawSplitKey, n.br.splitKey) else: childrenNums = self.getChildrenNums(n) if len(childrenNums) == 1: gm.append( 'Internal node has only one child. That will make a duped split.') raise P4Error(gm) x = self.nodes[childrenNums[0]].br.rawSplitKey for i in childrenNums[1:]: y = self.nodes[i].br.rawSplitKey x = x | y # '|' is bitwise "OR". rawSplitKey = x # Make node.br.splitKey's in "standard form", ie # without the first taxon, ie without a 1. To do that # we use the '&' operator, to bitwise "and" with 1. # Ie "Does rawSplitKey contain a 1?" or "Is rawSplitKey # odd?" if 1 & rawSplitKey: # "^" is xor, a bit-flipper. splitKey = allOnes ^ rawSplitKey else: splitKey = rawSplitKey # print 'intern node %s, rawSplitKey %s, splitKey %s' % # (n.nodeNum, n.br.rawSplitKey, n.br.splitKey) if n.br.rawSplitKey != rawSplitKey: print("checkSplitKeys node %2i rawSplitKey: existing %s, calculated %s" % (n.nodeNum, n.br.rawSplitKey, rawSplitKey)) isBad = True if n.br.splitKey != splitKey: print("checkSplitKeys node %2i splitKey: existing %s, calculated %s" % (n.nodeNum, n.br.splitKey, splitKey)) isBad = True if glitch and isBad: raise P4Error(gm) if verbose and not isBad: print("checkSplitKeys(). ok")
[docs] def taxSetIsASplit(self, taxSetName): """Asks whether a nexus taxset is a split in the tree. Args: taxSetName (str): The name of the taxset. Case does not matter. Returns: Node: the node in self if the taxset is a split, or else None """ gm = ["Tree.taxSetIsASplit(%s)" % taxSetName] assert self.nexusSets assert self.taxNames assert self.nexusSets.taxSets lowArgTaxSetName = taxSetName.lower() theTS = None for ts in self.nexusSets.taxSets: if ts.lowName == lowArgTaxSetName: theTS = ts break assert theTS, "Could not find the taxSet named %s" % taxSetName # theTS.dump() assert theTS.mask # Make sure that self splitKey has been set. Check the first internal node. needsDoing = False for n in self.iterInternalsNoRoot(): if not n.br.splitKey: needsDoing = True break if needsDoing: self.makeSplitKeys() rawSplitKey = 0 for i in range(len(theTS.mask)): # print i, theTS.mask[i] if theTS.mask[i] == '1': rawSplitKey += (1 << i) # Ie "Does rawSplitKey contain a 1?" or "Is rawSplitKey odd?" if 1 & rawSplitKey: allOnes = 2 ** (self.nTax) - 1 splitKey = allOnes ^ rawSplitKey # "^" is xor, a bit-flipper. else: splitKey = rawSplitKey # print "got splitKey %s" % splitKey for n in self.nodes: if n.br and not n.isLeaf: assert n.br.splitKey, "Needs node.br.splitKey on all internal nodes." # print " %2i %s %s" % (n.nodeNum, n.br.splitKey, splitKey) if n.br.splitKey == splitKey: # self.draw() # return n.nodeNum return n # Was -1 before, when n.nodeNum was returned for hits. Now a node is # returned. return None
[docs] def checkTaxNames(self): """Check that all taxNames are in the tree, and vice versa.""" # If var.allowTreesWithDifferingTaxonSets is set to True we will not check # the taxnames. This is primarily used to read trees for supertree and # leafstability calcuations. # Peter comments that Tobias added this, but it is not needed, # and messes other things up -- so comment out until it is # sorted. # if var.allowTreesWithDifferingTaxonSets: # return # self.taxNames is a property, so when it is set, it calls this method if self.name: gm = ['Tree.checkTaxNames() tree %s' % self.name] else: gm = ['Tree.checkTaxNames()'] if self.fName: gm.append("From file '%s' " % self.fName) if not self.taxNames: gm.append('No taxNames.') raise P4Error(gm) tax = [] for n in self.iterNodes(): if n.isLeaf and n.name: tax.append(n.name) # This next line should not be needed, as root leaves should be # leaves. # terminal root that has a taxName elif n == self.root and n.name and n.getNChildren() < 2: tax.append(n.name) # print 'tax from tree = %s' % tax # print 'self.taxNames = %s' % self.taxNames # Check for same number of taxa if len(tax) != len(self.taxNames): # self.draw() # self.dump(node=1) gm.append('Mismatch. Length of self.taxNames is wrong.') gm.append('The tree has %i leaves with names, but len(self.taxNames) = %i' % ( len(tax), len(self.taxNames))) gm.append('leaves on the tree = %s' % tax) gm.append('self.taxNames = %s' % self.taxNames) gm.append('symmetric_difference is %s' % set(tax).symmetric_difference(set(self.taxNames))) # print ' Setting invalid taxNames to an empty list.' #self.taxNames = [] raise P4Error(gm, 'tree_badTaxNamesLength') # Check for mis-matched taxNames isBad = 0 taxSet = set(tax) selfTaxNamesSet = set(self.taxNames) s = taxSet.difference(selfTaxNamesSet) if len(s): isBad = 1 print(gm[0]) print('TaxName mismatch between the tree and self.taxNames.') print('These taxa are found in the tree but not in self.taxNames:') print(s) s = selfTaxNamesSet.difference(taxSet) if len(s): isBad = 1 print(gm[0]) print('TaxName mismatch between the tree and self.taxNames.') print('These taxa are found in the self.taxNames but not in the tree:') print(s) if isBad: raise P4Error(gm, 'tree_taxNamesMisMatch')
################################################ # Copy and Verify
[docs] def copyToTree(self, otherTree): gm = ['Tree.copyToTree()'] if len(self.nodes) != len(otherTree.nodes): gm.append('Different number of nodes.') raise P4Error(gm) # node relations (parent, child, sib) for nNum in range(len(self.nodes)): selfNode = self.nodes[nNum] otherNode = otherTree.nodes[nNum] # parent if selfNode.parent: otherNode.parent = otherTree.nodes[selfNode.parent.nodeNum] else: # print "otherNode.parent", otherNode.parent otherNode.parent = None # leftChild if selfNode.leftChild: otherNode.leftChild = otherTree.nodes[ selfNode.leftChild.nodeNum] else: # print "otherNode.leftChild", otherNode.leftChild otherNode.leftChild = None # sibling if selfNode.sibling: otherNode.sibling = otherTree.nodes[selfNode.sibling.nodeNum] else: # print "otherNode.sibling", otherNode.sibling otherNode.sibling = None # root otherTree.root.br = otherTree.nodes[self.root.nodeNum].br otherTree.nodes[self.root.nodeNum].br = None otherTree.root = otherTree.nodes[self.root.nodeNum] # brLens and splitKeys for nNum in range(len(self.nodes)): if self.nodes[nNum] != self.root: otherTree.nodes[nNum].br.len = self.nodes[nNum].br.len otherTree.nodes[nNum].br.lenChanged = self.nodes[nNum].br.lenChanged otherTree.nodes[nNum].flag = self.nodes[nNum].flag otherTree.nodes[nNum].br.splitKey = self.nodes[nNum].br.splitKey otherTree.nodes[nNum].br.rawSplitKey = self.nodes[nNum].br.rawSplitKey if 0: print("Tree.copyToTree()") for nNum in range(len(self.nodes)): if self.nodes[nNum] != self.root: print(f"{nNum:3} {self.nodes[nNum].br.splitKey} {otherTree.nodes[nNum].br.splitKey}") else: print(f"{nNum:3} is root") # model usage numbers if self.model: for nNum in range(len(self.nodes)): selfNode = self.nodes[nNum] otherNode = otherTree.nodes[nNum] if selfNode.parts and otherNode.parts: for pNum in range(self.model.nParts): otherNode.parts[pNum].compNum = selfNode.parts[pNum].compNum if selfNode != self.root: if selfNode.br.parts and otherNode.br.parts: otherNode.br.parts[pNum].rMatrixNum = selfNode.br.parts[pNum].rMatrixNum otherNode.br.parts[pNum].gdasrvNum = selfNode.br.parts[pNum].gdasrvNum # pre- and postOrder for i in range(len(self.preOrder)): otherTree.preOrder[i] = self.preOrder[i] otherTree.postOrder[i] = self.postOrder[i] # partLikes if self.model: for pNum in range(self.model.nParts): otherTree.partLikes[pNum] = self.partLikes[pNum] otherTree._nInternalNodes = self._nInternalNodes
[docs] def verifyIdentityWith(self, otherTree, doSplitKeys=False, doMore=False): """For MCMC debugging. Verifies that two trees are identical.""" complaintHead = '\nTree.verifyIdentityWith()' # keep if len(self.nodes) != len(otherTree.nodes): print(complaintHead) print(' Different number of nodes.') return var.DIFFERENT # check node relations (parent, child, sib) isBad = 0 for nNum in range(len(self.nodes)): selfNode = self.nodes[nNum] otherNode = otherTree.nodes[nNum] # parent if selfNode.parent: if otherNode.parent: if otherNode.parent.nodeNum != selfNode.parent.nodeNum: isBad = 1 else: isBad = 1 else: if otherNode.parent: isBad = 1 # leftChild if selfNode.leftChild: if otherNode.leftChild: if otherNode.leftChild.nodeNum != selfNode.leftChild.nodeNum: isBad = 1 else: isBad = 1 else: if otherNode.leftChild: isBad = 1 # sibling if selfNode.sibling: if otherNode.sibling: if otherNode.sibling.nodeNum != selfNode.sibling.nodeNum: isBad = 1 else: isBad = 1 else: if otherNode.sibling: isBad = 1 if isBad: print(complaintHead) print(' Node %i, relations differ.' % nNum) self.write() otherTree.write() return var.DIFFERENT if self.root.nodeNum != otherTree.root.nodeNum: print(complaintHead) print(' Roots differ.') return var.DIFFERENT # brLens, lenChanged, and node.flag. and splitKeys if 0: print("Tree.verifyIdentityWith() flag") for nNum in range(len(self.nodes)): if self.nodes[nNum] != self.root: print(f"{nNum:3} {self.nodes[nNum].flag} {otherTree.nodes[nNum].flag}") else: print(f"{nNum:3} is root") for nNum in range(len(self.nodes)): if self.nodes[nNum] != self.root: # if self.nodes[nNum].br.len != otherTree.nodes[nNum].br.len: if math.fabs(self.nodes[nNum].br.len - otherTree.nodes[nNum].br.len) > 1.e-8: print(complaintHead) print(' BrLens differ.') return var.DIFFERENT if self.nodes[nNum].br.lenChanged != otherTree.nodes[nNum].br.lenChanged: print(complaintHead) print(' br.lenChanged differs.') return var.DIFFERENT if self.nodes[nNum].flag != otherTree.nodes[nNum].flag: print(complaintHead) print(' flag differs, nodeNum %i. %s vs %s' % (nNum, self.nodes[nNum].flag, otherTree.nodes[nNum].flag)) return var.DIFFERENT if doSplitKeys: if self.nodes[nNum].br.splitKey != otherTree.nodes[nNum].br.splitKey: print(complaintHead) print(f' SplitKeys differ. nodeNum {nNum}') return var.DIFFERENT if self.nodes[nNum].br.rawSplitKey != otherTree.nodes[nNum].br.rawSplitKey: print(complaintHead) print(f' rawSplitKeys differ. nodeNum {nNum}') return var.DIFFERENT if self.model: # model usage numbers isBad = 0 for pNum in range(self.model.nParts): for nNum in range(len(self.nodes)): selfNode = self.nodes[nNum] otherNode = otherTree.nodes[nNum] if selfNode.parts and otherNode.parts: if selfNode.parts[pNum].compNum != otherNode.parts[pNum].compNum: isBad = 1 if self.nodes[nNum] != self.root: if selfNode.br.parts and otherNode.br.parts: if selfNode.br.parts[pNum].rMatrixNum != otherNode.br.parts[pNum].rMatrixNum: isBad = 1 elif selfNode.br.parts[pNum].gdasrvNum != otherNode.br.parts[pNum].gdasrvNum: isBad = 1 if isBad: print(complaintHead) print(' Node %i, model usage info does not match.' % nNum) return var.DIFFERENT # pre- and postOrder isBad = 0 for i in range(len(self.preOrder)): if self.preOrder[i] != otherTree.preOrder[i]: isBad = 1 break elif self.postOrder[i] != otherTree.postOrder[i]: isBad = 1 break if isBad: print(complaintHead) print(' Pre- or postOrder do not match.') return var.DIFFERENT if self.nInternalNodes != otherTree.nInternalNodes: print(complaintHead) print(' nInternalNodes differ.') return var.DIFFERENT # partLikes for pNum in range(self.model.nParts): # if otherTree.partLikes[pNum] != self.partLikes[pNum]: myDiff = math.fabs(otherTree.partLikes[pNum] - self.partLikes[pNum]) if myDiff > 1.e-5: print(complaintHead) print(" partLikes differ by %.8f (%g). (%.8f, (%g) %.8f (%g)" % ( myDiff, myDiff, otherTree.partLikes[pNum], otherTree.partLikes[pNum], self.partLikes[pNum], self.partLikes[pNum])) return var.DIFFERENT if doMore: # some more for nNum in range(len(self.nodes)): selfNode = self.nodes[nNum] otherNode = otherTree.nodes[nNum] if selfNode.nodeNum != otherNode.nodeNum: print(complaintHead) print(' nodeNum differs') return var.DIFFERENT if selfNode.seqNum != otherNode.seqNum: print(complaintHead) print(' seqNum differs') return var.DIFFERENT if selfNode.name != otherNode.name: print(complaintHead) print(' name differs') return var.DIFFERENT if selfNode.isLeaf != otherNode.isLeaf: print(complaintHead) print(' isLeaf differs') return var.DIFFERENT return var.SAME
############################################
[docs] def isFullyBifurcating(self, verbose=False, biRoot=False): """Returns True if the tree is fully bifurcating. Else False. If arg biRoot is True, then it is required that the tree be biRoot'ed (ie have a bifurcating root) If arg biRoot is False, then biRoot'ed trees are not allowed, but trees rooted on a leaf are allowed. """ rootNChildren = self.root.getNChildren() if rootNChildren > 3: if verbose: print("isFullyBifurcating() returning False, due to root with %i children." % rootNChildren) return False if not biRoot: if rootNChildren == 2: if verbose: print("isFullyBifurcating() returning False, due to root with %i children." % rootNChildren) return False elif rootNChildren == 1 or rootNChildren == 3: # rooting on a leaf is OK if rootNChildren == 1: if not self.root.isLeaf: if verbose: print("isFullyBifurcating()", end=" ") print("returning False, because the single-child root is not a leaf.") return False for n in self.iterInternalsNoRoot(): if n.leftChild and n.leftChild.sibling: if n.leftChild.sibling.sibling: if verbose: print("isFullyBifurcating()", end=" ") print("returning False, due to node %i having 3 or more children." % n.nodeNum) return False else: if verbose: print("isFullyBifurcating()", end=" ") print("returning False, due to non-leaf node %i having 1 or fewer children." % n.nodeNum) return False return True else: raise P4Error("Tree.isFullyBifurcating() --- this should not happen; fix me.") if biRoot: # root on a leaf not allowed on a biRoot tree, as the root is explicitly not a leaf. if rootNChildren != 2: if verbose: print("isFullyBifurcating(biRoot=True)", end=" ") print("returning False, due to root with %i children." % rootNChildren) return False for n in self.iterInternalsNoRoot(): if n.leftChild and n.leftChild.sibling: if n.leftChild.sibling.sibling: if verbose: print("isFullyBifurcating(biRoot=True)", end=" ") print("returning False, due to node %i having 3 or more children." % n.nodeNum) return False else: if verbose: print("isFullyBifurcating(biRoot=True)", end=" ") print("returning False, due to non-leaf node %i having 1 or fewer children." % n.nodeNum) return False return True
# These next two are for the eTBR implementation that I got from Jason # Evans' Crux. Thanks Jason!
[docs] def getDegree(self, nodeSpecifier): n = self.node(nodeSpecifier) if n.isLeaf: if n.parent: return 1 else: assert n == self.root if n.leftChild: # a tree-on-a-stick return 1 else: # a single isolated node return 0 else: #assert n.leftChild deg = 1 # the leftChild if n.parent: deg += 1 s = n.leftChild.sibling while s: deg += 1 s = s.sibling return deg
[docs] def nextNode(self, spokeSpecifier, hubSpecifier): """Get next node cycling around a hub node. A bit of a hack to make a p4 Node behave sorta like a Felsenstein node. Imagine cycling around the branches emanating from a node like spokes on a hub, starting from anywhere, with no end. The hub node would usually be the parent of the spoke, or the spoke would be the hub itself. Usually self.nextNode(spoke,hub) delivers spoke.sibling. What happens when the siblings run out is that self.nextNode(rightmostSibling, hub) delivers hub itself, and of course its branch (spoke) points toward the hub.parent. (Unless hub is the root, of course, in which case self.nextNode(rightmostSibling, hub) delivers hub.leftChild.) In the usual case of the hub not being the root, the next node to be delivered by nextNode(spokeIsHub, hub) is usually the leftChild of the hub. Round and round, clockwise. """ spoke = self.node(spokeSpecifier) hub = self.node(hubSpecifier) if spoke.parent == hub or hub == spoke: if spoke == hub: if spoke.leftChild: return spoke.leftChild else: return hub else: if spoke.sibling: return spoke.sibling else: if hub.parent: return hub else: return hub.leftChild else: print("*=" * 25) self.draw() gm = ["Tree.nextNode() spoke=%i, hub=%i" % (spoke.nodeNum, hub.nodeNum)] gm.append( "Need to have either spoke.parent == hub or hub == spoke.") raise P4Error(gm)
[docs] def topologyDistance(self, tree2, metric='sd', resetSplitKeySet=True): """Compares the topology of self with tree2. The complete list of metrics is given in var.topologyDistanceMetrics For most metrics using this method, taxNames needs to be set, to the same in the two trees. If the taxa differ, this method simply returns -1 The 'metric' can be one of 'sd' (symmetric difference), 'wrf' (weighted Robinson-Foulds), 'bld' (Felsenstein's branch- length distance), or 'diffs'. The unweighted Robinson-Foulds metric would be the same as the symmetric difference. There is also an experimental scqdist, but that needs the scqdist.so module, in the QDist directory. See Felsenstein 2004 Inferring Phylogenies, Pg 529. The default metric is the very simple 'sd', symmetric difference. Using this metric, if the 2 trees share the same set of splits, they are deemed to be the same topology; branch lengths are not compared. This method returns the number of splits that are in self that are not in tree2 plus the number of splits that are in tree2 that are not in self. So it would return 0 for trees that are the same. The 'wrf' and 'bld' metrics take branch lengths into account. Bifurcating roots complicate things, so they are not allowed for weighted distance calculations. In the unweighted case (ie metric='sd'), whether the trees compared have bifurcating roots or not is ignored. So the trees (A,B,(C,D)) and ((A,B),(C,D)) will be deemed to have the same topology, since they have the same splits. The measurement 'diffs', which returns a tuple of 2 numbers -- both are set differences. The first is the number of splits in self that are not in tree2, and the second is the number of splits in tree2 that are not in self. (Consider it as the the symmetric difference split into its 2 parts.) If you calculate a distance and then make a topology change, a subsequent sd topologyDistance calculation will be wrong, as it uses previous splits. So then you need to 'resetSplitKeySet'. The 'scqdist' metric calculates quartet distances. The code was written by Anders Kabell Kristensen for his Masters degree at Aarhus University, 2010. http://www.cs.au.dk/~dalko/thesis/ It has two versions -- a pure Python version (that needs scipy) that I do not include here, and a fast C++ version, that I wrapped in python. Its speedy -- the 'sc' in 'scqdist' is for 'sub-cubic', ie better than O(n^3). I have also incorporated the tqDist v1.0.2 code, from 2014, also for quartet distance calculations, from Christian N. S. Pedersen and his group at BiRC in Aarhus. See https://users-cs.au.dk/cstorm/software/tqdist/ 2014. It is available here via the 'tqdist' metric. """ gm = ['Tree.topologyDistance()'] if metric not in var.topologyDistanceMetrics: gm.append("Got a request for unknown metric '%s'" % metric) gm.append("The 'metric' arg should be one of %s" % var.topologyDistanceMetrics) raise P4Error(gm) if metric == 'scqdist': # no need for taxNames try: import p4.scqdist as scqdist except ImportError: gm.append("Could not find the 'scqdist' module needed for this metric.") gm.append("See the instructions for making it in the p4 source, in the Qdist directory.") raise P4Error(gm) tsSelf = self.writeNewick(toString=True) tsTree2 = tree2.writeNewick(toString=True) return scqdist.qdist(tsSelf, tsTree2) elif metric == 'tqdist': # no need for taxNames try: import p4.pytqdist as pytqdist except ImportError: gm.append("Could not find the 'tqdist' module needed for this metric.") gm.append("See the instructions for making it in the p4 source, in the tqDist directory.") raise P4Error(gm) tsSelf = self.writeNewick(toString=True, spaceAfterComma=False) tsTree2 = tree2.writeNewick(toString=True, spaceAfterComma=False) return pytqdist.qdistFromStrings(tsSelf, tsTree2) if not self.taxNames or not tree2.taxNames: gm.append("This method requires taxNames to be set.") raise P4Error(gm) if self.taxNames != tree2.taxNames: gm.append("The taxNames are different for the two trees.") gm.append("Self: %s" % self.taxNames) gm.append("tree2: %s" % tree2.taxNames) raise P4Error(gm) if (self.root.getNChildren() == 2 or tree2.root.getNChildren() == 2) and (metric in ['wrf', 'bld']): gm.append('One of the input trees has a bifurcating root.') gm.append( 'Weighted tree distance calculations do not work on bi-rooted trees.') raise P4Error(gm) # I might be doing a lot of these, and I don't want to make # splitKeys and make the splitKeys set over and over again. # So store it as a Tree.attribute. if resetSplitKeySet: if hasattr(self, 'splitKeySet'): del(self.splitKeySet) if hasattr(tree2, 'splitKeySet'): del(tree2.splitKeySet) if not hasattr(self, 'splitKeySet'): self.makeSplitKeys() self.splitKeySet = set( [n.br.splitKey for n in self.iterNodesNoRoot()]) if not hasattr(tree2, 'splitKeySet'): tree2.makeSplitKeys() tree2.splitKeySet = set( [n.br.splitKey for n in tree2.iterNodesNoRoot()]) if metric == 'sd': # Symmetric difference. The symmetric_difference method # returns all elements that are in exactly one of the sets. theSD = len(self.splitKeySet.symmetric_difference(tree2.splitKeySet)) return theSD # The difference method returns the difference of two sets as # a new Set. I.e. all elements that are in self and not in # the other. selfHasButTree2DoesNot = self.splitKeySet.difference(tree2.splitKeySet) tree2HasButSelfDoesNot = tree2.splitKeySet.difference(self.splitKeySet) if metric == 'diffs': return len(selfHasButTree2DoesNot), len(tree2HasButSelfDoesNot) if metric in ['wrf', 'bld']: self.splitKeyHash = {} for n in self.iterNodesNoRoot(): self.splitKeyHash[n.br.splitKey] = n tree2.splitKeyHash = {} for n in tree2.iterNodesNoRoot(): tree2.splitKeyHash[n.br.splitKey] = n if metric == 'wrf': theSum = 0.0 for k in self.splitKeySet.intersection(tree2.splitKeySet): # print '%s - %s' % (self.splitKeyHash[k].br.len, # tree2.splitKeyHash[k].br.len) theSum += abs(self.splitKeyHash[k].br.len - tree2.splitKeyHash[k].br.len) for k in selfHasButTree2DoesNot: # print 'x %s' % self.splitKeyHash[k].br.len theSum += self.splitKeyHash[k].br.len for k in tree2HasButSelfDoesNot: # print 'y %s' % tree2.splitKeyHash[k].br.len theSum += tree2.splitKeyHash[k].br.len return theSum elif metric == 'bld': theSum = 0.0 for k in self.splitKeySet.intersection(tree2.splitKeySet): theDiff = self.splitKeyHash[ k].br.len - tree2.splitKeyHash[k].br.len theSum += theDiff * theDiff for k in selfHasButTree2DoesNot: theSum += self.splitKeyHash[k].br.len * \ self.splitKeyHash[k].br.len for k in tree2HasButSelfDoesNot: theSum += tree2.splitKeyHash[k].br.len * \ tree2.splitKeyHash[k].br.len # print 'branch score =', theSum return math.sqrt(theSum)
##################################################
[docs] def tv(self): """Tree Viewer. Show the tree in a gui window. Needs Tkinter. If you have nexus taxsets defined, you can show them. """ from p4.btv import TV #import os #os.environ['PYTHONINSPECT'] = '1' TV(self)
[docs] def btv(self): """Big Tree Viewer. Show the tree in a gui window. This is for looking at big trees. The viewer has 2 panels -- one for an overall view of the whole tree, and one for a zoomed view, controlled by a selection rectangle on the whole tree view. Needs Tkinter. If you have nexus taxsets defined, you can show them. """ from p4.btv import BTV #import os #os.environ['PYTHONINSPECT'] = '1' BTV(self)
[docs] def tvTopologyCompare(self, treeB): """Graphically show topology differences. The taxNames need to be set, and need to be the same for both trees. (If the red lines don't show up right away, try adjusting the size of the windows slightly.) """ sd = self.topologyDistance(treeB) if sd == 0: print("The trees are the same. No tv.") return # for sk in self.splitKeyHash.keys(): # if sk not in treeB.splitKeyHash: # print "self has sk self.splitKeyHash = {} for n in self.iterInternalsNoRoot(): self.splitKeyHash[n.br.splitKey] = n treeB.splitKeyHash = {} for n in treeB.iterNodesNoRoot(): treeB.splitKeyHash[n.br.splitKey] = n assert self.splitKeySet assert treeB.splitKeySet selfHasButTreeBDoesnt = self.splitKeySet.difference(treeB.splitKeySet) treeBHasButSelfDoesnt = treeB.splitKeySet.difference(self.splitKeySet) from p4.btv import TV #import os #os.environ['PYTHONINSPECT'] = '1' TV(self, title='TV self') TV(treeB, title='TV treeB') for spl in selfHasButTreeBDoesnt: n = self.splitKeyHash[spl] n.br.color = 'red' for spl in treeBHasButSelfDoesnt: n = treeB.splitKeyHash[spl] n.br.color = 'orange'
##################################################
[docs] def drawTopologyCompare(self, treeB, showNodeNums=False): """Graphically show topology differences between two trees. The two trees (self and treeB) are drawn as text, with differences highlighted with a thick branch. The taxNames need to be set, and need to be the same for both trees. """ sd = self.topologyDistance(treeB) if sd == 0: print("The trees are the same. ") return self.splitKeyHash = {} for n in self.iterInternalsNoRoot(): self.splitKeyHash[n.br.splitKey] = n treeB.splitKeyHash = {} for n in treeB.iterNodesNoRoot(): treeB.splitKeyHash[n.br.splitKey] = n assert self.splitKeySet assert treeB.splitKeySet selfHasButTreeBDoesnt = self.splitKeySet.difference(treeB.splitKeySet) treeBHasButSelfDoesnt = treeB.splitKeySet.difference(self.splitKeySet) for spl in selfHasButTreeBDoesnt: n = self.splitKeyHash[spl] n.br.textDrawSymbol = '=' for spl in treeBHasButSelfDoesnt: n = treeB.splitKeyHash[spl] n.br.textDrawSymbol = '=' self.draw(showNodeNums=showNodeNums) treeB.draw(showNodeNums=showNodeNums)
[docs] def getNodeOnReferenceTreeCorrespondingToSelfRoot(self, refTree, verbose=True): """Find, on a ref tree, a node corresponding to where the self root is. This works for both bifurcating and non-bifurcating roots (of self). The refTree should not be bi-rooted. To facilitate doing a lot of (self) trees, a counter in the corresponding node in the refTree is incremented. The refTree is rooted on the first taxon. For bi-rooted trees, the node.br.biRootCount is incremented, and for non-bi-rooted trees the node.rootCount is incremented. The root counts should therefore be immune to reRoot()'ing. """ gm = ["Tree.getNodeOnReferenceTreeCorrespondingToSelfRoot()"] assert self is not refTree assert self.taxNames assert refTree.taxNames assert self.taxNames == refTree.taxNames isBiRoot = False # sr = self.root; srnChildren = nChildren of self.root srnChildren = self.root.getNChildren() if srnChildren == 1: gm.append("Self root has only one child; does not work") raise P4Error(gm) elif srnChildren == 2: isBiRoot = True for n in refTree.iterNodesNoRoot(): if not hasattr(n.br, "biRootCount"): n.br.biRootCount = 0 else: for n in refTree.iterNodes(): if not hasattr(n, "rootCount"): n.rootCount = 0 #self.draw() refTree.reRoot(self.taxNames[0]) #refTree.draw() self.makeSplitKeys() # default makeNodeForSplitKeyDict=True refTree.makeSplitKeys() for ch in self.root.iterChildren(): if ch.br.rawSplitKey & 1: break # ch clade has the first taxon, ch.parent is self.root #print("self root child %i clade has the first taxon" % ch.nodeNum) refNode = refTree.nodeForSplitKeyDict.get(ch.br.splitKey) if refNode: if isBiRoot: refNode.br.biRootCount += 1 if verbose: #print("When the refTree is rooted on the first taxon,") #print("the bi-root is on the branch on refTree node %i" % refNode.nodeNum) print("Incrementing refTree node %i br.biRootCount by 1" % refNode.nodeNum) else: refNode.rootCount += 1 if verbose: #print("When the refTree is rooted on the first taxon,") #print("the root is at refTree node %i" % refNode.nodeNum) print("Incrementing refTree node %i rootCount by 1" % refNode.nodeNum) return refNode else: if verbose: print("There is no node in the refTree corresponding to the self.root") return None
[docs] def readPhyloXmlFile(self, fName, verbose=False): """Start with an empty Tree, read in a phyloxml file""" assert not self.nodes, "The tree should be empty, with no nodes" import xml.etree.ElementTree as ET xAll = ET.parse(fName) for el in xAll.iter(): if '}' in el.tag: el.tag = el.tag.split('}', 1)[1] # strip all namespaces xroot = xAll.getroot() xphylogeny = xroot.find('phylogeny') assert xphylogeny rootclade = xphylogeny.find('clade') assert rootclade def recurseClade(theClade): global currentNode for el in theClade: #print(el.tag) if el.tag == 'clade': n = Node() n.nodeNum = len(self.nodes) el.attrib['p4Node'] = n n.parent = theClade.attrib['p4Node'] if n.parent: if n.parent.leftChild == None: n.parent.leftChild = n else: rmCh = n.parent.rightmostChild() assert rmCh, "Can't get rightmostChild" rmCh.sibling = n self.nodes.append(n) currentNode = n recurseClade(el) elif el.tag == "branch_length": currentNode.br.len = float(el.text) elif el.tag == 'name': #print(el.text) currentNode.name = el.text if verbose: print("Got node name", el.text) elif el.tag == 'width': currentNode.br.width = float(el.text) if verbose: print("Got node width, as node.br.width", el.text) elif el.tag == 'color': thiscol = {} for el2 in el: thiscol[el2.tag] = int(el2.text) cols = ['red', 'green', 'blue'] for col in cols: assert thiscol.get(col) myHexString = "" for col in cols: bit = thiscol.get(col) hx = hex(bit)[2:].upper() myHexString += hx #print(myHexString) currentNode.br.hexColor = myHexString if verbose: print("Got node color (as attribute node.br.hexColor)", currentNode.br.hexColor) else: print("Fixme. Unhandled/unused element with tag %s" % el.tag) n = Node() n.nodeNum = 0 n.br = None self.root = n self.nodes.append(n) rootclade.attrib['p4Node'] = n recurseClade(rootclade) for n in self.nodes: if not n.leftChild: n.isLeaf = 1 self.setPreAndPostOrder()
[docs] def attachRooter(self): rNode = self.nodes[-1] assert rNode.name == 'tempRooter' assert rNode.nodeNum == var.NO_ORDER rtMostCh = self.root.rightmostChild() rtMostCh.sibling = rNode rNode.sibling = None rNode.parent = self.root rNode.nodeNum = len(self.nodes) - 1 if self.taxNames: # taxnames can be shared among trees, so check whether it has already been added if rNode.name not in self.taxNames: self.taxNames.append(rNode.name) self.preAndPostOrderAreValid = False self.setPreAndPostOrder() #if self._nTax: # self._nTax += 1 return rNode
[docs] def detachRooter(self): rNode = self.nodes[-1] assert rNode.parent == self.root safety = 0 while self.root.leftChild.sibling.sibling != rNode: self.rotateAround(self.root) safety += 1 if safety >= 5: self.draw() raise P4Error("Tree.detachRooter() Too many rotations.") assert self.root.leftChild.sibling.sibling == rNode self.root.leftChild.sibling.sibling = None rNode.parent = None rNode.nodeNum = var.NO_ORDER self.preAndPostOrderAreValid = False self.setPreAndPostOrder() if self.taxNames: # taxnames can be shared among trees, so check whether it is in if rNode.name == self.taxNames[-1]: self.taxNames.pop()
#if self._nTax: # self._nTax -= 1
[docs] def isBiRoot(self): """Answers whether self has a bifurcating root""" rootNChildren = self.root.getNChildren() if rootNChildren == 2: return True return False
[docs] def isTriRoot(self): """Answers whether self has a trifurcating root""" rootNChildren = self.root.getNChildren() if rootNChildren == 3: return True return False
[docs] def isCompatibleWith(self, otherTree): """Determines whether self is compatible with otherTree Returns True or False. """ assert otherTree.nTax == self.nTax assert otherTree.taxNames == self.taxNames # It is easier with sets, so make sets from splitKeys self.makeSplitKeys() for n in self.iterInternalsNoRoot(): ss = set() for i in range(self.nTax): tester = 2 ** i if tester & n.br.splitKey: ss.add(tester) n.br.splSet = ss otherTree.makeSplitKeys() for n in otherTree.iterInternalsNoRoot(): ss = set() for i in range(self.nTax): tester = 2 ** i if tester & n.br.splitKey: ss.add(tester) n.br.splSet = ss isCompatible = True for nA in self.iterInternalsNoRoot(): for nB in otherTree.iterInternalsNoRoot(): if len(nA.br.splSet.intersection(nB.br.splSet)) \ and len(nA.br.splSet.difference(nB.br.splSet)) \ and len(nB.br.splSet.difference(nA.br.splSet)): isCompatible = False break if not isCompatible: break # print(f"isCompatible is {isCompatible}") return isCompatible
#################################################### manip ####################################################
[docs] def node(self, specifier): """Get a node based on a specifier. The *specifier* can be a nodeNum, name, or node object. """ nodeNum = None gm = ['Tree.node()'] if isinstance(specifier, int): nodeNum = specifier elif isinstance(specifier, numpy.int32): nodeNum = specifier elif isinstance(specifier, Node): if specifier in self.nodes: return specifier else: gm.append( "The specifier is a node object, but is not part of self.") raise P4Error(gm) elif isinstance(specifier, str): for n in self.iterNodes(): if n.name == specifier: return n # if we haven't found a node matching the specier... if nodeNum == None: gm.append("Specifier string '%s' is not a node name. What gives?" % specifier) raise P4Error(gm) else: gm.append("I don't understand the specifier '%s', type '%s'." % ( specifier, type(specifier))) raise P4Error(gm) if nodeNum < 0 or nodeNum >= len(self.nodes): gm.append("The requested node number %i is out of range (of %i nodes)." % (nodeNum, len(self.nodes))) raise P4Error(gm) return self.nodes[nodeNum]
[docs] def rotateAround(self, specifier): """Rotate a clade around a node. The *specifier* can be a nodeNum, name, or node object. """ gm = ['Tree.rotateAround()'] rotateNode = self.node(specifier) if rotateNode.isLeaf: print(gm[0]) print(" The rotateNode is a terminal node. Not doing anything ...") return if rotateNode.getNChildren() == 1: print(gm[0]) print(" The rotateNode has only one child. Not doing anything...") return # set up to unattach the rightmost child, and reattach at the left oldLeftChild = rotateNode.leftChild oldRightmostChild = rotateNode.rightmostChild() # we need to know the node second from the right, which will become the # newRightmostChild newRightmostChild = rotateNode.leftChild # as a first guess... while newRightmostChild.sibling != oldRightmostChild: newRightmostChild = newRightmostChild.sibling # now the newRightmostChild is indeed the second from the right # now unattach the rightmost child, and reattach at the left newRightmostChild.sibling = None oldRightmostChild.sibling = oldLeftChild oldLeftChild.parent.leftChild = oldRightmostChild self.preAndPostOrderAreValid = 0
[docs] def reRoot(self, specifier, moveInternalName=True, stompRootName=True, checkBiRoot=True, fixRawSplitKeys=False): """Re-root the tree to the node described by the specifier. It does it in-place, and returns None. The *specifier* can be a node.nodeNum, node.name, or node object. Here is a potential problem. Lets say you start with this tree, with split support as shown:: (((A, B)99, C)70, D, E); +--------3:A +---------2:99 +--------1:70 +--------4:B | | | +---------5:C 0 |--------6:D | +--------7:E Now we want to reRoot it to node 2. So we do that. Often, node.name's are really there to label the branch, not the node; you may be using node.name's to label your branches (splits), eg with support values. If that is the case, then you want to keep the node name with the branch (not the node) as you reRoot(). Ie you want node labels to behave like branch labels. If that is the case, we set moveInternalName=True; that is the default. When that is done in the example above, we get:: (A, B, (C, (D, E)70)99); +--------3:A | |--------4:B 2 | +---------5:C +--------1:99 | +--------6:D +---------0:70 +--------7:E Another possibility is that the node names actually are there to name the node, not the branch, and you want to keep the node name with the node during the re-rooting process. That can be done by setting moveInternalName=False, and the tree below is what happens when you do that. Each internal node.name stays with its node in the re-rooting process:: (A, B, (C, (D, E))70)99; +--------3:A | |--------4:B 99:2 | +---------5:C +--------1:70 | +--------6:D +---------0 +--------7:E Now if you had the default moveInternalName=True and you had a node name on the root, that would not work (draw it out to convince yourself ...). So in that case you probably want stompRootName=True as well --- Both are default. If you set stompRootName=True, it gives you a little warning as it does it. If you set stompRootName=2, it will do it silently. If moveInternalName is not set, then stompRootName is not used. You probably do not want to ``reRoot()`` if the current root is bifurcating. If you do that, you will get a node in the tree with a single child, which is strictly ok, but useless and confusing. So by default I checkBiRoot=True, and throw a P4Error if there is one. If you want to draw such a pathological tree with a node with a single child, set checkBiRoot=False, and it will allow it. """ gm = ['Tree.reRoot()'] # The user probably does not want to reRoot if the current root is # bifurcating. So check that. if checkBiRoot: if self.root.getNChildren() == 2: gm.append("The tree has a bifurcating root, so you probably do not") gm.append("want to reRoot() it. You can remove the bifurcating root") gm.append("with yourTree.removeRoot(). If you really want to reRoot()") gm.append("with a bifurcating root, set checkBiRoot=False in the reRoot() args.") raise P4Error(gm) if self.root.isLeaf and not self.root.name: gm.append("The root is a leaf, but has no name.") gm.append("So when you reRoot() it, some other leaf will have no name.") gm.append("That is a recipe for trouble, and is not allowed.") raise P4Error(gm) newRoot = self.node(specifier) oldRoot = self.root if newRoot == oldRoot: return if moveInternalName: if self.root.name and not self.root.isLeaf: if stompRootName: if stompRootName != 2: print("Notice. Tree.reRoot(stompRootName) is set, so the root name '%s' is being zapped..." % self.root.name) print("(Set stompRootName=2 to do this silently ...)") self.root.name = None else: gm.append("Setting 'moveInternalName' implies keeping node names with their branches.") gm.append("The root in this tree has a name, but has no branch.") gm.append("So that does not work.") gm.append("Set arg stompRootName to work around this.") raise P4Error(gm) if self.root.isLeaf and self.root.leftChild.name: assert self.root.name self.draw() gm.append("The current root is a leaf, with a name.") gm.append("Its sole child has a name also, which is an 'internal' node name.") gm.append("Arg moveInternalName is turned on.") gm.append("So when the tree gets re-rooted, that internal node name should stay with its branch, not its node.") gm.append("But the rerooted branch will already have a name-- the current root name.") gm.append("So that does not work.") raise P4Error(gm) path = [newRoot] theParent = newRoot.parent while theParent != oldRoot: path.append(theParent) theParent = theParent.parent if 0: print("path from the newRoot to the oldRoot:") for i in path: print(" %i" % i.nodeNum) while len(path): # We reverse the path above. Its last entry is a child of the old root. # Start there-- its to be called the 'swivelNode' # print "Rooting on node %i..." % path[-1].nodeNum # +----------2:A # +----------1 # | +----------3:B # 0 # +----------4:C # | # +----------5:D # +----------2:A # | # 1----------3:B # | # | +----------4:C # +----------0 # +----------5:D # in case this is not the first time around this loop... oldRoot = self.root swivelNode = path.pop() # Move the old root around the swivel node to the right. oldRoot.parent = swivelNode oldRoot.br = swivelNode.br swivelNode.br = None # If we are keeping the node.name with the branch, then do so now. if moveInternalName: if swivelNode.isLeaf: pass elif oldRoot.isLeaf: pass else: oldRoot.name = swivelNode.name swivelNode.name = None swivelNode.parent = None # as befits a root if swivelNode.leftChild: swivelNode.rightmostChild().sibling = oldRoot else: swivelNode.leftChild = oldRoot # What we do next depends on whether the swivelNode is the # left, middle, or right child of the oldRoot if oldRoot.leftChild == swivelNode: oldRoot.leftChild = swivelNode.sibling else: leftSib = oldRoot.leftChild while leftSib.sibling != swivelNode: leftSib = leftSib.sibling # At this point leftSib is the node that was left sib of the swivelNode # But the swivelNode is no longer there, so skip to the other side of it # (which may be None, if the swivelNode was rightmost child of the oldRoot). leftSib.sibling = swivelNode.sibling swivelNode.sibling = None # as befits a root self.root = swivelNode # The splitKey is still good, but the rawSplitKey needs updating. if fixRawSplitKeys and oldRoot.br.rawSplitKey: oldRoot.br.rawSplitKey = 0 if oldRoot.leftChild: for n in oldRoot.iterChildren(): oldRoot.br.rawSplitKey += n.br.rawSplitKey else: oldRoot.br.rawSplitKey = 1 << self.taxNames.index( oldRoot.name) # "<<" is left-shift self.preAndPostOrderAreValid = 0
[docs] def removeRoot(self): """Removes the root if self.root is mono- or bifurcating. This removes the root node if the tree is rooted on a terminal node, or if the tree is rooted on a bifurcating node. Otherwise, it refuses to do anything. In the usual case of removing a bifurcating root, the branch length of one fork of the bifurcation is added to the other fork, so the tree length is preserved. In the unusual case of removing a monofurcating root (a root that is a terminal node, a tree-on-a-stick) then its branch length disappears. """ gm = ['\nTree.removeRoot()'] oldRoot = self.root newRoot = None if not oldRoot.leftChild: gm.append("The root has no children.") raise P4Error(gm) self.deleteCStuff() nRootChildren = self.root.getNChildren() if nRootChildren == 1: # The root has only one child. Its rooted on a terminal node. # Its like this: # +---A # 0---1 # +---B newRoot = oldRoot.leftChild newRoot.parent = None newRoot.br = None elif nRootChildren == 2: # The root has exactly two children. This would be the usual, # expected, case. We want to find the first child (of the # oldRoot) with more than one child (of the child). newRoot = oldRoot.leftChild # Try this one first: # Ideally, and usually, the new root should have two or more # children. Imagine what would happen if this tree # # +-----A # 0 +----B # +---2 # +----C # # had its root removed, and then # had A for its new root. We get # +---B # A---2 # +---C # It would be better in this case if the new root was 2. So we # want to search for candidate newRoots that have at least 2 # children. So is the current candidate, newRoot, good # enough? while newRoot: if newRoot.getNChildren() >= 2: break # its good enough, use it newRoot = newRoot.sibling # its not good, try its sib # If we got this far an failed to find a good root, then # newRoot is None. We are dealing with a tree where all the # root children are either leaves or have only one child. Ok, # its unusual, but we do need a new root, even if it is a # terminal node. So arbitrarily choose the leftChild to root # on. if not newRoot: newRoot = oldRoot.leftChild # print "x Re-rooting on node number %i" % newRoot.nodeNum if newRoot.leftChild: newCh = newRoot.leftChild newCh.sibling = newRoot.sibling else: newCh = newRoot.sibling newRoot.leftChild = newCh newRoot.parent = None newRoot.sibling = None newCh.parent = newRoot newCh.br.len = newCh.br.len + newRoot.br.len newRoot.br = None else: # We found a good new root, with two or more children # print "y reRooting on nodeNum %i" % newRoot.nodeNum if newRoot == oldRoot.leftChild: newRoot.sibling.br.len = newRoot.sibling.br.len + \ newRoot.br.len newRoot.br = None newRoot.rightmostChild().sibling = newRoot.sibling newRoot.sibling.parent = newRoot newRoot.parent = None newRoot.sibling = None else: # the new root is the right child of the old oldLeftCh = newRoot.leftChild oldRoot.leftChild.br.len += newRoot.br.len newRoot.br = None newRoot.leftChild = oldRoot.leftChild newRoot.leftChild.parent = newRoot newRoot.leftChild.sibling = oldLeftCh newRoot.parent = None newRoot.sibling = None else: gm.append("The root has more than two children.") gm.append( "Removing the root with more than two children is not implemented.") gm.append("Are you even sure you want to do that?") raise P4Error(gm) oldRoot.wipe() self.nodes.remove(oldRoot) del oldRoot if newRoot: self.root = newRoot else: gm.append("No newRoot? Programming error.") raise P4Error(gm) for i in range(len(self.nodes)): self.nodes[i].nodeNum = i self.preOrder = None self.postOrder = None self.preAndPostOrderAreValid = 0 self.setPreAndPostOrder() self._nTax = 0
[docs] def removeNode(self, specifier, alsoRemoveSingleChildParentNode=True, alsoRemoveBiRoot=True, alsoRemoveSingleChildRoot=True, deleteCStuff=True): """Remove a node, together with everything above it. Arg *specifier* can be a nodeNum, name, or node object. So lets say that we have a tree like this:: +-------1:A 0 | +--------3:B +-------2 +--------4:C and we remove node 4. When it is removed, node 2 ends up having only one child. Generally you would want to remove it as well (so that the parent of node 3 is node 0), so the option *alsoRemoveSingleChildParentNode* is turned on by default. If *alsoRemoveSingleChildParentNode* is turned off, nodes like node 2 get left in the tree. Removal of a node might cause the creation of a bifurcating root. I assume that is not desired, so alsoRemoveBiRoot is turned on by default. In the example above, if I were to remove node 4, by default node 2 would also disappear, but by default node 0 would also disappear because it would then be a tree with a bifurcating root node. So starting with a 5-node tree, by removing 1 node you would end up with a 2-node tree, with 1 branch. In the example here, if I were to simply remove node 1:: +--------1:A | 0 +---------3:B +--------2 | +--------5:C +---------4 +--------6:D Then node 0 would remain, as:: +---------2:B 0--------1 | +--------4:C +---------3 +--------5:D Presumably that is not wanted, so the arg *alsoRemoveSingleChildRoot* is turned on by default. When the root is removed, we are left with a bi-root, which (if the arg alsoRemoveBiRoot is set) would also be removed. The resulting tree would be:: +-------0:B | 1-------2:C | +-------3:D The deleted nodes are really deleted, and do not remain in self.nodes. """ gm = ['Tree.removeNode()'] rNode = self.node(specifier) if rNode == self.root: print(gm[0]) print(" The specified node appears to be the root.") print(" Removing everything above the root would leave nothing.") print(" I assume that you do not want to do that.") print(" So I'm not doing that.") # self.draw() # print "the specifier was %s" % specifier # print "the specified node was node number %i" % rNode.nodeNum #raise P4Error return rNodeParnt = rNode.parent if deleteCStuff: self.deleteCStuff() # For cases where the tree is originally with a single child root # -- we don't want to then delete that root below. assert self.root.leftChild isOriginallySingleChildRoot = False if not self.root.leftChild.sibling: isOriginallySingleChildRoot = True # For cases where we originally have a bi-Root, that we would like to # keep. isOriginallyBiRoot = False if self.root.getNChildren() == 2: isOriginallyBiRoot = True # print "Removing node number", rNode.nodeNum #hitList = [] #self.recursivelyListNodeIndicesDownTo(hitList, rNode) # getNodeNumsAbove does not require setting # self.preAndPostOrderAreValid = 0. Its irrelevant. # does not include rNode hitList = self.getNodeNumsAbove(rNode.nodeNum) hitList.append(rNode.nodeNum) # hitList.sort() # hitList.reverse() # print "Hit list is", hitList hitNodes = [] for i in hitList: hitNodes.append(self.nodes[i]) # Disconnect it from the tree. # the rNode is the left, middle, or right child of the parent if rNodeParnt.leftChild == rNode: # its the left child rNodeParnt.leftChild = rNode.sibling rNode.sibling = None else: # its a middle or a right child # If its a right child, then rNode.sibling = None leftSib = rNode.leftSibling() if 1: if leftSib: # print "leftSib is node %i" % leftSib.nodeNum leftSib.sibling = rNode.sibling else: gm.append("leftSib is None. This shouldn't happen") gm.append("Programming error?") raise P4Error(gm) haveRemovedSingleChildRoot = False if not isOriginallySingleChildRoot: # The tree may have been bifurcating, and left a single-child root. if alsoRemoveSingleChildRoot and self.root.leftChild and not self.root.leftChild.sibling: hitNodes.append(self.root) self.root = self.root.leftChild self.root.parent = None haveRemovedSingleChildRoot = True for n in hitNodes: n.wipe() self.nodes.remove(n) if n.isLeaf and self.taxNames and n.name and n.name in self.taxNames: self.taxNames.remove(n.name) del n self._nTax = 0 if 1: for i in range(len(self.nodes)): self.nodes[i].nodeNum = i # self.dump(node=1) self.preOrder = None self.postOrder = None self.preAndPostOrderAreValid = 0 # self.draw() # This won't work unless preAndPostOrderAreValid set # to 0 # the parent of the removed node may now only have one child, # in which case it (the parent) should be removed if alsoRemoveSingleChildParentNode or alsoRemoveBiRoot or haveRemovedSingleChildRoot: ignoreBrLens = True self.preOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) self.postOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) if len(self.nodes) > 1: self.setPreAndPostOrder() for n in self.iterNodesNoRoot(): if n.br.len != 0.1: ignoreBrLens = False break if alsoRemoveSingleChildParentNode: if rNodeParnt and rNodeParnt.leftChild and not rNodeParnt.leftChild.sibling: if rNodeParnt.parent: # rNodeParnt is a left, middle, or right child if rNodeParnt.parent.leftChild == rNodeParnt: # print 'rNodeParnt is a left child' rNodeParnt.parent.leftChild = rNodeParnt.leftChild else: # print 'rNodeParnt is a middle or right child' rNodeParnt.leftSibling().sibling = rNodeParnt.leftChild rNodeParnt.leftChild.sibling = rNodeParnt.sibling if not ignoreBrLens: rNodeParnt.leftChild.br.len += rNodeParnt.br.len rNodeParnt.leftChild.parent = rNodeParnt.parent rNodeParnt.wipe() self.nodes.remove(rNodeParnt) del rNodeParnt if not isOriginallyBiRoot: if self.root.getNChildren() == 2 and alsoRemoveBiRoot: self.removeRoot() if ignoreBrLens: for ch in self.root.iterChildren(): ch.br.len = 0.1 for i in range(len(self.nodes)): self.nodes[i].nodeNum = i self.preOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) self.postOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) if len(self.nodes) > 1: self.setPreAndPostOrder()
[docs] def removeAboveNode(self, specifier, newName): """Remove everything above an internal node, making it a leaf, and so needing a new name. """ rNode = self.node(specifier) assert rNode != self.root assert not rNode.isLeaf toDelete = [n for n in rNode.iterPostOrder() if n != rNode] # print [n.nodeNum for n in toDelete] for n in toDelete: # print 'deleting node %i' % n.nodeNum self.removeNode(n, alsoRemoveSingleChildParentNode=False) rNode.name = newName rNode.isLeaf = 1 if self.taxNames: self.taxNames.append(newName)
[docs] def collapseNode(self, specifier): """Collapse the specified node to make a polytomy, and remove it from the tree. Arg *specifier*, as usual, can be a node, node number, or node name. The specified node remains in self.nodes, and is returned. """ theNode = self.node(specifier) assert theNode in self.nodes, "The specified Node is not in the tree." #assert theNode.leftChild and theNode.leftChild.sibling, "The specified node must have at least 2 children." assert not theNode.isLeaf, "The specified node must not be a leaf." assert theNode is not self.root, "The specified node must not be the tree root." # print "Collapsing node %i" % theNode.nodeNum theNewParent = theNode.parent theRightmostChild = theNode.rightmostChild() theLeftSib = theNode.leftSibling() if theLeftSib: theLeftSib.sibling = theNode.leftChild else: theNewParent.leftChild = theNode.leftChild for n in theNode.iterChildren(): n.parent = theNewParent theRightmostChild.sibling = theNode.sibling theNode.wipe() # The following does not work well. # self.nodes.remove(theNode) # del(theNode) self.setPreAndPostOrder() self._nInternalNodes -= 1
[docs] def collapseClade(self, specifier): """Collapse all nodes within a clade, but not the clade itself Arg *specifier*, as usual, can be a node, node number, or node name. It calls Tree.collapseNode repeatedly until the clade is a comb. Should be useful for making constraint trees. """ rNode = self.node(specifier) assert rNode != self.root assert not rNode.isLeaf toCollapse = [n for n in rNode.iterPostOrder() if n != rNode and not n.isLeaf] for n in toCollapse: # print(f'collapsing node {n.nodeNum}') self.collapseNode(n)
[docs] def pruneSubTreeWithoutParent(self, specifier, allowSingleChildNode=False): """Remove and return a node, together with everything above it. Arg *specifier* can be a nodeNum, name, or node object. By default, the arg allowSingleChildNode is turned off, and is for those cases where the parent of the node has more than 2 children. So when the subTree is removed, the parent node that is left behind has more than one child. The stuff that is removed is returned. The nodes are left in self; the idea being that the subTree will be added back to the tree again (via reconnectSubTreeWithoutParent()). """ gm = ['Tree.pruneSubTreeWithoutParent()'] rNode = self.node(specifier) if rNode == self.root: gm.append("The specified node is the root.") raise P4Error(gm) rNodeParnt = rNode.parent if not allowSingleChildNode: if rNodeParnt.getNChildren() < 3: # self.draw() gm.append("The arg allowSingleChildNode is turned off.") gm.append( "This would be for those cases where the parent of the subTree has more than 2 children.") raise P4Error(gm) # self.deleteCStuff() # print "rNode is node %i" % rNode.nodeNum # print "rNodeParnt is node %i" % rNodeParnt.nodeNum # Disconnect it from the tree. # the rNode is the left, middle, or right child of the parent if rNodeParnt.leftChild == rNode: # print "its the left child" rNodeParnt.leftChild = rNode.sibling rNode.sibling = None rNode.parent = None else: # print "its a middle or a right child" leftSib = rNode.leftSibling() assert leftSib leftSib.sibling = rNode.sibling rNode.sibling = None rNode.parent = None self.preAndPostOrderAreValid = 0 self._nTax = 0 return rNode
[docs] def reconnectSubTreeWithoutParent(self, stNode, newParent, beforeNode=None): """Attach subtree stNode to the rest of the tree at newParent. The beforeNode is by default None, and then the subtree is reconnected as the rightmost child of the new parent. However, if you want it somewhere else, for example as the leftmost child, or between two existing child nodes, specify a beforeNode (specified as usual as a node, nodeNumber, or node name) and the subtree will be inserted there. """ gm = ["Tree.reconnectSubTreeWithoutParent()"] newParent = self.node(newParent) if not newParent.leftChild: gm.append("Can't attach to a leaf.") raise P4Error(gm) stNode.parent = newParent if beforeNode == None: # easy -- just add it to the rightmost child. theRMChildOfNewParent = newParent.rightmostChild() theRMChildOfNewParent.sibling = stNode else: bNode = self.node(beforeNode) if bNode.parent != newParent: gm.append( "The parent of the 'beforeNode' should be the newParent.") raise P4Error(gm) if newParent.leftChild == bNode: newParent.leftChild = stNode else: lSib = newParent.leftChild while lSib.sibling != bNode: lSib = lSib.sibling lSib.sibling = stNode stNode.sibling = bNode self._nTax = 0
[docs] def pruneSubTreeWithParent(self, specifier): """Remove and return a subtree, with its parent node and branch Args: specifier: Arg ``specifier`` can be a nodeNum, node name, or node object. Returns: The node that is the parent of the specified node is returned. The nodes are left in self (ie they are not removed from the self.nodes list; the idea being that the subtree will be added back to the tree again (via :meth:`~p4.tree.Tree.reconnectSubTreeWithParent()`). Let's say that we want to detach the subtree composed of taxa E, F, and G, below:: # +--------1:A # | # |--------2:B # | # 0 +---------4:C # | | # | | +--------6:D # | | | # | |---------5 +--------8:E # +--------3 | | # | +--------7 +---------10:F # | +--------9 # | +---------11:G # | # +---------12:H This really means that we want to remove nodes 5 (yes!) and 7 as well, and their branches. To specify that subtree, we can use node 7 as the specifier when we call this method. The node that is returned in this case is node 5 :: stNode = t.pruneSubTreeWithParent(7) print(f"The returned object is a {stNode}, nodeNum {stNode.nodeNum}") # The returned object is a <p4.node.Node object at 0x107748eb8>, nodeNum 5 At this point we will have the tree as shown here:: # +-------1:A # | # |-------2:B # 0 # | +--------4:C # | | # +-------3--------6:D # | # +--------10:G Now we would use the companion method :meth:`~p4.tree.Tree.reconnectSubTreeWithParent` to reconnect the subtree. For example we can reconnect it in the same place that it was, by specifying node 6 as the arg attachNode :: t.reconnectSubTreeWithParent(stNode, 6) That restores the original tree, as shown above. The subtree can be a single leaf if you wish. In that case the subtree is specified by the leaf, and is composed of the leaf and its parent, including both their branches. At the moment it does not work for nodes where the parent is not bifurcating. """ gm = ['Tree.pruneSubTreeWithParent()'] rNode = self.node(specifier) if rNode == self.root: gm.append("The specified node is the root.") raise P4Error(gm) rNodeParnt = rNode.parent if rNodeParnt.getNChildren() != 2: # self.draw() gm.append("The parent of the specified node must have exactly two children") raise P4Error(gm) if rNodeParnt == self.root: gm.append("The parent of the specified node may not be the root") raise P4Error(gm) theSib = rNode.sibling # may be None theLeftSib = rNode.leftSibling() # may be None theGrandParent = rNodeParnt.parent rNodeParntLeftSib = rNodeParnt.leftSibling() rNodeParntSib = rNodeParnt.sibling theGrandParentLeftChild = theGrandParent.leftChild if theSib: rNode.sibling = None rNodeParnt.parent = None theSib.parent = theGrandParent if rNodeParntLeftSib: rNodeParntLeftSib.sibling = theSib if rNodeParntSib: theSib.sibling = rNodeParntSib if theGrandParentLeftChild == rNodeParnt: theGrandParent.leftChild = theSib elif theLeftSib: theLeftSib.sibling = None rNodeParnt.parent = None theLeftSib.parent = theGrandParent if rNodeParntLeftSib: rNodeParntLeftSib.sibling = theLeftSib if rNodeParntSib: theLeftSib.sibling = rNodeParntSib if theGrandParentLeftChild == rNodeParnt: theGrandParent.leftChild = theLeftSib rNodeParnt.leftChild = rNode self.preAndPostOrderAreValid = 0 self._nTax = 0 return rNodeParnt
[docs] def reconnectSubTreeWithParent(self, stNode, attachNode): """Attach subtree stNode to the branch on attachNode This is a companion method to Tree.pruneSubTreeWithParent() The subtree is attached on the branch on attachNode. """ gm = ["Tree.reconnectSubTreeWithParent()"] myAttachNode = self.node(attachNode) if not myAttachNode.br: gm.append("The attachNode must have a branch") myAttachNodeParent = myAttachNode.parent myAttachNodeSib = myAttachNode.sibling # may be None myAttachNodeLeftSib = myAttachNode.leftSibling() # may be None stNodeLeftChild = stNode.leftChild # stNode has only one child if 0: print("myAttachNodeParent", myAttachNodeParent.nodeNum) if myAttachNodeSib: print("myAttachNodeSib", myAttachNodeSib.nodeNum) else: print("myAttachNodeSib", myAttachNodeSib) if myAttachNodeLeftSib: print("myAttachNodeLeftSib", myAttachNodeLeftSib.nodeNum) else: print("myAttachNodeLeftSib", myAttachNodeLeftSib) print("stNodeLeftChild", stNodeLeftChild.nodeNum) myAttachNode.parent = stNode stNode.leftChild = myAttachNode myAttachNode.sibling = stNodeLeftChild stNode.parent = myAttachNodeParent if myAttachNodeSib: stNode.sibling = myAttachNodeSib else: stNode.sibling = None if myAttachNodeLeftSib: myAttachNodeLeftSib.sibling = stNode else: myAttachNodeParent.leftChild = stNode # self.dump(node=True) self.preAndPostOrderAreValid = 0 self._nTax = 0
[docs] def addNodeBetweenNodes(self, specifier1, specifier2): """Add a node between 2 exisiting nodes, which should be parent-child. The *specifier* can be a nodeNum, name, or node object. Returns the new node object. """ gm = ['Tree.addNodeBetweenNodes()'] aNode1 = self.node(specifier1) aNode2 = self.node(specifier2) # aNode1 should be the parent of aNode2 if aNode1 == aNode2.parent: pass elif aNode1 == aNode2: gm.append("The two specified nodes are the same.") raise P4Error(gm) elif aNode2 == aNode1.parent: temp = aNode1 aNode1 = aNode2 aNode2 = temp else: gm.append( "The 2 specified nodes should have a parent-child relationship") raise P4Error(gm) self.deleteCStuff() hasBrLens = False for n in self.iterNodes(): if n.br and math.fabs(n.br.len - 0.1) > 1.e-15: hasBrLens = True break newNode = copy.deepcopy(aNode2) newNode.br.len = 0.1 newNode.name = None newNode.isLeaf = False newNode.nodeNum = len(self.nodes) self.nodes.append(newNode) newNode.parent = aNode1 newNode.leftChild = aNode2 aNode2.parent = newNode if aNode1.leftChild == aNode2: aNode1.leftChild = newNode else: oldCh = aNode1.leftChild while oldCh.sibling != aNode2: oldCh = oldCh.sibling oldCh.sibling = newNode if aNode2.sibling: newNode.sibling = aNode2.sibling aNode2.sibling = None if hasBrLens: halfBrLen = aNode2.br.len / 2.0 aNode2.br.len = halfBrLen newNode.br.len = halfBrLen if 1: self.preOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) self.postOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) if len(self.nodes) > 1: self.setPreAndPostOrder() return newNode
[docs] def allBiRootedTrees(self): """Returns a Trees object containing all possible bi-rootings of self. Self should have a root node of degree > 2, but need not be fully resolved. Self needs a taxNames. """ gm = ['Tree.allBiRootedTrees()'] if self.root.getNChildren() < 3: gm.append("Self root should be of degree > 2.") raise P4Error(gm) if not self.taxNames: gm.append("Self (ie the tree) needs to have taxNames set.") raise P4Error(gm) tList = [] for i in range(len(self.nodes)): if self.nodes[i] == self.root: pass else: t = self.dupe() n = t.nodes[i] x = t.addNodeBetweenNodes(n.parent, n) t.reRoot(x, moveInternalName=False) t.name = 'r%i' % n.nodeNum tList.append(t) # This import needs to be here --- if it is up top as usual, it leads to circular grief. from p4.trees import Trees tt = Trees(trees=tList, taxNames=self.taxNames) return tt
[docs] def ladderize(self, biggerGroupsOnBottom=True): """Rotate nodes for a staircase effect. This method, in its default biggerGroupsOnBottom way, will take a tree like this:: +---------4:A +--------3 +---------2 +---------5:B | | | +--------6:C +--------1 | | +--------8:D | +---------7 | +--------9:E 0 |--------10:F | | +---------12:G +--------11 +---------13:H and rearranges it so that it is like ... :: +--------10:F | | +---------12:G |--------11 | +---------13:H 0 | +--------8:D | +---------7 | | +--------9:E +--------1 | +--------6:C +---------2 | +---------4:A +--------3 +---------5:B Note that for each node, the more populated group is on the bottom, the secondmost populated second, and so on. To get it with the bigger groups on top, set biggerGroupsOnBottom=False. I made the default with the bigger groups on the bottom so that it often makes room for a scale bar. The setting biggerGroupsOnBottom, the default here, would equivalent to set torder=right in paup; torder=left puts the bigger groups on the top. """ # self.draw() self.root._ladderize(biggerGroupsOnBottom) self.preAndPostOrderAreValid = 0
# self.draw()
[docs] def randomizeTopology(self, randomBrLens=True): gm = ["Tree.randomizeTopology()"] if self.root.getNChildren() != 3 or not self.isFullyBifurcating(): gm.append( "Should be a fully bifurcating tree, this week. Fix me?") raise P4Error(gm) if self.cTree: self.deleteCStuff() nTax = self.nTax oldRoot = self.root leaves = [] internals = [] for n in self.nodes: if n.isLeaf: leaves.append(n) else: internals.append(n) random.shuffle(leaves) random.shuffle(internals) lIndx = 0 iIndx = 0 self.nodes = [] nodeNum = 0 # new root self.root = internals[iIndx] iIndx += 1 self.nodes.append(self.root) self.root.parent = None self.root.leftChild = None self.root.sibling = None if self.root == oldRoot: pass else: oldRoot.br = self.root.br self.root.br = None self.root.nodeNum = nodeNum nodeNum += 1 # add a leaf as left child to the root n = leaves[lIndx] lIndx += 1 self.nodes.append(n) n.sibling = None n.nodeNum = nodeNum nodeNum += 1 self.root.leftChild = n n.parent = self.root previousNode = n # add the rest of the leaves while lIndx < nTax: n = leaves[lIndx] lIndx += 1 self.nodes.append(n) n.sibling = None n.nodeNum = nodeNum nodeNum += 1 previousNode.sibling = n previousNode = n n.parent = self.root # Now we have a star tree. Now add internal nodes until it is all # resolved, which needs nTax - 3 nodes for i in range(nTax - 3): ssNodes = [] for n in self.nodes: if n.sibling and n.sibling.sibling: ssNodes.append(n) lChild = random.choice(ssNodes) # print "lChild = node %i" % lChild.nodeNum # +----------1:oldLeftSib # | # +----------2:lChild # 0 # +----------3:lChildSib # | # +----------4:oldLChildSibSib # +----------1:oldLeftSib # | # | +----------3:lChild # 0----------2(n) # | +----------4:lChildSib # | # +----------5:oldLChildSibSib n = internals[iIndx] iIndx += 1 n.parent = None n.sibling = None n.leftChild = None n.nodeNum = nodeNum nodeNum += 1 lChildSib = lChild.sibling # guarranteed to have one oldLChildSibSib = lChildSib.sibling # ditto oldLeftSib = lChild.parent.leftChild # first guess ... if oldLeftSib != lChild: while oldLeftSib.sibling != lChild: oldLeftSib = oldLeftSib.sibling else: oldLeftSib = None if 0: if oldLeftSib: print("oldLeftSib = %i" % oldLeftSib.nodeNum) else: print("oldLeftSib = None") print("lChildSib = %i" % lChildSib.nodeNum) if oldLChildSibSib: print("oldLChildSibSib = %i" % oldLChildSibSib.nodeNum) else: print("oldLChildSibSib = None") if oldLeftSib: oldLeftSib.sibling = n else: lChild.parent.leftChild = n n.parent = lChild.parent lChild.parent = n n.leftChild = lChild lChildSib.parent = n n.sibling = oldLChildSibSib lChildSib.sibling = None self.nodes.append(n) # self.dump(all=True) # self.draw() # The way it is now, the root rightmost child is always a # leaf. Not really random, then, right? So choose a random # internal node, and re-root it there. # print "nTax=%i, len(t.nodes)=%i" % (nTax, len(t.nodes)) if nTax > 3: n = self.nodes[random.randrange(nTax + 1, len(self.nodes))] self.reRoot(n, moveInternalName=False) # The default is to have randomBrLens, where internal nodes get # brLens of 0.02 - 0.05, and terminal nodes get brLens of 0.2 - # 0.5. Branch lengths are all 0.1 if randomBrLens is turned # off. if randomBrLens: for n in self.nodes: if n != self.root: if n.isLeaf: n.br.len = 0.02 + (random.random() * 0.48) else: n.br.len = 0.02 + (random.random() * 0.03) self.preAndPostOrderAreValid = 0
# self.dump(all=True)
[docs] def readBipartitionsFromPaupLogFile(self, thePaupLogFileName): """Assigns support to the tree, from the PAUP bipartitions table. This needs to have self.taxNames set. This is useful if you want to make a consensus tree using PAUP, and get the support values. When you make a cons tree with PAUP, the support values, usually bootstrap values, are unfortunately not saved with the tree. That information is in the Bipartitions table, which can be saved to a PAUP log file so that p4 can get it. This method will read thePaupLogFileName and extract the split (tree bipartition) supports, and assign those supports to self as node.br.support's (as a float, not a string). It also returns a hash with the split strings as keys and the split support as values, if you need it. """ gm = ['Tree.readBipartitionsFromPaupLogFile()'] if not self.taxNames: gm.append("This method needs self.taxNames.") raise P4Error(gm) self.checkTaxNames() f = open(thePaupLogFileName, 'r') while 1: aLine = f.readline() if not aLine: gm.append("No Bipartitions line in %s?" % thePaupLogFileName) raise P4Error(gm) if aLine.startswith('Bipartitions found'): # print aLine break # The splits table might be all in one piece, or it might be split # into sections (CYCLEs). Only the last section has the Freq or # percent. If the first section is the only section, then it is # the LAST_CYCLE, so that we get the Freq or percent. # Surprise-- the PAUP output contains both Freq and % support, # unless the sum of the weights (or if there are no weights, the # number of trees) is 100, in which case only the Freq is given. FIRST_CYCLE = -1 MIDDLE_CYCLE = 0 LAST_CYCLE = 1 cycleType = None thisKeyLength = None accumulatedKeyLength = 0 nKeys = 0 keyCounter = 0 theHash = {} theKeys = [] hasFreqOnly = None while 1: prevLine = aLine aLine = f.readline() # print "a Got Line ==>%s<==" % aLine if not aLine: gm.append('Unexpected end of file.') raise P4Error(gm) aLine = aLine.rstrip() # print "b Got Line ==>%s<==" % aLine if len(aLine): if aLine[0] == '-': # print prevLine # print aLine if prevLine.endswith('Freq') or prevLine.endswith('%'): if prevLine.endswith('Freq'): hasFreqOnly = 1 elif prevLine.endswith('%'): hasFreqOnly = 0 cycleType = LAST_CYCLE # print "We are now in the last cycle. hasFreqOnly=%s" # % hasFreqOnly splitLine = prevLine.split() thisKeyLength = len(splitLine[0]) keyCounter = 0 elif cycleType == MIDDLE_CYCLE: # print "We are now in a middle cycle" thisKeyLength = len(prevLine) keyCounter = 0 elif cycleType == FIRST_CYCLE: gm.append('This should never happen.') raise P4Error(gm) else: cycleType = FIRST_CYCLE # print "We are now in the first cycle" thisKeyLength = len(prevLine) elif aLine[0] in ['.', '*']: if cycleType in [FIRST_CYCLE, MIDDLE_CYCLE] and len(aLine) != thisKeyLength: gm.append("Unequal key lengths?!?") gm.append("thisKeyLength = %i" % thisKeyLength) gm.append("%s" % aLine) raise P4Error(gm) if cycleType == FIRST_CYCLE: theKeys.append(aLine) elif cycleType == MIDDLE_CYCLE: theKeys[keyCounter] = theKeys[keyCounter] + aLine keyCounter = keyCounter + 1 if keyCounter > nKeys: gm.append("Too many keys.") raise P4Error(gm) elif cycleType == LAST_CYCLE: # It will usually be a line like: ...*....*. 67.83 67.9% # But it will sometimes be a line like: ...*....*. # 68 splitLine = aLine.split() theKey = splitLine[0] if hasFreqOnly: theSupport = float(splitLine[-1]) theSupport /= 100.0 else: # don't read the % sign theSupport = float(splitLine[-1][:-1]) theSupport /= 100.0 if len(theKey) != thisKeyLength: gm.append("Unequal key lengths?!?") gm.append("thisKeyLength = %i" % thisKeyLength) gm.append("%s" % theKey) raise P4Error(gm) if accumulatedKeyLength: theKeys[keyCounter] = theKeys[keyCounter] + theKey theHash[theKeys[keyCounter]] = theSupport keyCounter = keyCounter + 1 if keyCounter > nKeys: gm.append("Too many keys.") raise P4Error(gm) # LAST_CYCLE is also the FIRST_CYCLE, ie the table is # in one section. else: theKeys.append(theKey) theHash[theKey] = theSupport else: gm.append("This should never happen.") raise P4Error(gm) else: pass # Skip lines with numbers else: # a blank line if cycleType == FIRST_CYCLE: cycleType = MIDDLE_CYCLE nKeys = len(theKeys) # print "At the end of the first cycle, got %i keys" % # nKeys accumulatedKeyLength = thisKeyLength elif cycleType == MIDDLE_CYCLE: accumulatedKeyLength += thisKeyLength elif cycleType == LAST_CYCLE: accumulatedKeyLength += thisKeyLength nKeys = len(theKeys) # print "finished" # print theKeys[0] break else: pass f.close() if 0: print("Finished getting split strings, and supports") print("Got %i items in the hash" % len(theHash)) print("accumulatedKeyLength = %i" % accumulatedKeyLength) print("nKeys = %i" % nKeys) # self.draw() # I will need to make splitStrings (in dot-star notation) from # splitKeys. To do that, I can use the # p4.func.getSplitStringFromKey(theKey, nTax) function. self.makeSplitKeys() for n in self.nodes: if n != self.root: if not n.isLeaf: theNodeSplitString = p4.func.getSplitStringFromKey( n.br.splitKey, self.nTax) if theNodeSplitString in theHash: if hasattr(n.br, 'support') and n.br.support is not None: gm.append("Node %i already has a br.support." % n.nodeNum) gm.append("I am refusing to clobber it with the split support.") gm.append("Either fix the tree or fix this method.") raise P4Error(gm) n.br.support = float(theHash[theNodeSplitString]) return theHash
[docs] def renameForPhylip(self, dictFName='p4_renameForPhylip_dict.py'): """Rename with phylip-friendly short boring names. It saves the old names (together with the new) in a python dictionary, in a file by default named p4_renameForPhylip_dict.py If self does not have taxNames set, it does not write originalNames to that file-- which may cause problems restoring names. If you want to avoid that, be sure to set self.taxNames before you do this method. This method does not deal with internal node names, at all. They are silently ignored. If they are too long for phylip, they are still silently ignored, which might cause problems. """ gm = ['Tree.renameForPhylip()'] if os.path.exists(dictFName): gm.append("The dictionary file '%s' already exists." % dictFName) raise P4Error(gm) d = {} if self.taxNames: d2 = {} originalNames = self.taxNames[:] for i in range(len(self.taxNames)): oldName = self._taxNames[i] newName = 's%i' % i d[newName] = oldName d2[oldName] = newName self._taxNames[i] = newName for n in self.iterLeavesNoRoot(): n.name = d2[n.name] if self.root.isLeaf and self.root.name: self.root.name = d2[self.root.name] else: originalNames = None i = 0 for n in self.iterLeavesNoRoot(): oldName = n.name newName = 's%i' % i d[newName] = oldName n.name = newName i += 1 if self.root.isLeaf and self.root.name: oldName = self.root.name newName = 's%i' % i d[newName] = oldName self.root.name = newName f = open(dictFName, 'w') f.write("p4_renameForPhylip_originalNames = %s\np4_renameForPhylip_dict = %s\n" % ( originalNames, d)) f.close()
[docs] def restoreNamesFromRenameForPhylip(self, dictFName='p4_renameForPhylip_dict.py'): """Given the dictionary file, restore proper names. The renaming is done by the Alignment method renameForPhylip(), which makes the dictionary file. The dictionary file is by default named p4_renameForPhylip_dict.py """ gm = ["Tree.restoreNamesFromRenameForPhylip()"] if os.path.exists(dictFName): loc = {} exec(open(dictFName).read(), {}, loc) try: p4_renameForPhylip_dict = loc['p4_renameForPhylip_dict'] p4_renameForPhylip_originalNames = loc['p4_renameForPhylip_originalNames'] except KeyError: gm.append("Dict file %s exists, but I can't read it correctly." % dictFName) raise P4Error(gm) else: gm.append("The dictionary file '%s' can't be found." % dictFName) raise P4Error(gm) for n in self.iterNodes(): if n.isLeaf: if n.name in p4_renameForPhylip_dict: n.name = p4_renameForPhylip_dict[n.name] else: gm.append("The dictionary does not contain a key for '%s'." % n.name) raise P4Error(gm) if p4_renameForPhylip_originalNames: self.taxNames = p4_renameForPhylip_originalNames else: if self.taxNames: gm.append("self.taxNames is set, and should be replaced, but") gm.append("p4_renameForPhylip_originalNames is None. ?!?") raise P4Error(gm)
[docs] def restoreDupeTaxa(self, dictFileName='p4DupeSeqRenameDict.py', asMultiNames=True): """Restore previously removed duplicate taxa from a dict file. The usual story would be like this: You read in your alignment and p4 tells you that you have duplicate sequences. So you use the Alignment method checkForDuplicateSequences() to remove them, which makes a dictionary file, by default 'p4DupeSeqRenameDict.py', to facilitate restoration of the names. You do your analysis on the reduced alignment, and get a tree. Then you use the dictionary file with this method to restore all the taxon names. If asMultiNames is turned on, the default, then the leaf nodes are not replicated, and the name is changed to be a long multi-name. If asMultiNames is turned off, then the restored taxa are made to be siblings, and the branch lengths are set to zero. """ gm = ['Tree.restoreDupeTaxa()'] if not os.path.isfile(dictFileName): gm.append("Can't find dict file '%s'" % dictFileName) raise P4Error(gm) loc = {} exec(open(dictFileName).read(), {}, loc) try: p4DupeSeqRenameDict = loc['p4DupeSeqRenameDict'] except KeyError: gm.append( "Can't get the dictionary named 'p4DupeSeqRenameDict' from the dict file.") # print p4DupeSeqRenameDict kk = p4DupeSeqRenameDict.keys() # print kk if asMultiNames: for oldName in kk: rNode = self.node(oldName) newNames = p4DupeSeqRenameDict[oldName] multiName = ', '.join(newNames) rNode.name = multiName else: for oldName in kk: rNode = self.node(oldName) newNames = p4DupeSeqRenameDict[oldName] lCh = Node() lCh.isLeaf = 1 lCh.name = newNames[0] lCh.br.len = 0.0 lCh.parent = rNode lCh.nodeNum = len(self.nodes) self.nodes.append(lCh) rNode.isLeaf = 0 rNode.name = None rNode.leftChild = lCh theLeftSib = rNode.leftChild for sibName in newNames[1:]: n = Node() n.parent = rNode theLeftSib.sibling = n n.isLeaf = 1 n.name = sibName n.br.len = 0.0 n.nodeNum = len(self.nodes) self.nodes.append(n) theLeftSib = n self.taxNames = [] self.preOrder = None self.postOrder = None self.setPreAndPostOrder() self._nTax = 0
[docs] def lineUpLeaves(self, rootToLeaf=1.0, overWriteBrLens=True): """Make the leaves line up, as in a cladogram. This makes the rootToLeaf distance the same for all leaves. If overWriteBrLens is set, then the newly calculated br.lens replace the original br.lens. If it is not set, then the new br.lens are placed in br.lenL, and does not over-write the original br.lens. """ levels = 0 for n1 in self.iterLeavesNoRoot(): nodeCount = 1 p = n1.parent while p != self.root: p = p.parent nodeCount += 1 if nodeCount > levels: levels = nodeCount # print "levels = %i" % levels for n1 in self.iterLeavesNoRoot(): n1.lul_pos = levels ch = n1 p = n1.parent while p: newPos = ch.lul_pos - 1 if not hasattr(p, 'lul_pos'): p.lul_pos = newPos else: if p.lul_pos <= newPos: pass else: p.lul_pos = newPos ch = p p = p.parent for n in self.iterNodesNoRoot(): p = n.parent if p == self.root: n.br.lenL = float(n.lul_pos) / float(levels) * rootToLeaf else: n.br.lenL = float(n.lul_pos - p.lul_pos) / \ float(levels) * rootToLeaf # Clean up for n in self.iterNodes(): if hasattr(n, 'lul_pos'): del(n.lul_pos) if overWriteBrLens: if n.br and hasattr(n.br, 'lenL'): n.br.len = n.br.lenL del(n.br.lenL)
[docs] def removeEverythingExceptCladeAtNode(self, specifier): """Like it says. Leaves a tree with a root-on-a-stick.""" theNode = self.node(specifier) if theNode == self.root: return self.reRoot(theNode.parent, checkBiRoot=False) toRemoves = [] for ch in self.root.iterChildren(): if ch != theNode: toRemoves.append(ch) for ch in toRemoves: self.removeNode( ch, alsoRemoveSingleChildParentNode=False, alsoRemoveBiRoot=False)
[docs] def dupeSubTree(self, dupeNodeSpecifier, up, doBrLens=True, doSupport=True): """Makes and returns a new Tree object, duping part of self. The dupeNodeSpecifier can be a node name, node number, or node object. Arg 'up' should be True or False. The returned subtree has a root-on-a-stick. BrLens are not duped -- brLens are default in the new subtree. So if the tree is like this:: +---------2:A +--------1 | +---------3:B | 0--------4:C | | +---------6:D +--------5:dupeNode | +--------8:E +---------7 +--------9:F Then the subtree from node 5 up is:: +---------2:D subTreeRoot:0--------1:dupeNode | +--------4:E +---------3 +--------5:F and the subtree from node 5 down is:: +--------2:A +---------1 dupeNode:0--------5 +--------3:B | +---------4:C """ dupeNode = self.node(dupeNodeSpecifier) if dupeNode == self.root and not up: print("The dupeNode is self.root, and you want a subtree below that?!?") sys.exit() from p4.tree import Tree st = Tree() if up: n = Node() n.br = None n.nodeNum = 0 n.name = 'subTreeRoot' n.isLeaf = 1 st.root = n st.nodes.append(n) if dupeNode.isLeaf: # its easy -- just dupe the leaf n = Node() if doBrLens: n.br.len = dupeNode.br.len if doSupport: n.br.support = dupeNode.br.support n.parent = st.root st.root.leftChild = n n.name = dupeNode.name n.isLeaf = 1 st.nodes.append(n) else: i = 1 # nodeNum counter nodeNumDict = {} for selfNode in dupeNode.iterPreOrder(): n = Node() if doBrLens: n.br.len = selfNode.br.len if doSupport and hasattr(selfNode.br, 'support'): n.br.support = selfNode.br.support n.nodeNum = i nodeNumDict[selfNode.nodeNum] = i n.isLeaf = selfNode.isLeaf n.name = selfNode.name i += 1 st.nodes.append(n) for selfNode in dupeNode.iterPreOrder(): if selfNode.leftChild: st.nodes[nodeNumDict[selfNode.nodeNum]].leftChild = \ st.nodes[nodeNumDict[selfNode.leftChild.nodeNum]] if selfNode == dupeNode: pass # skip parent and sibling else: # parents and siblings. There will always be a parent. st.nodes[nodeNumDict[selfNode.nodeNum]].parent = \ st.nodes[nodeNumDict[selfNode.parent.nodeNum]] if selfNode.sibling: st.nodes[nodeNumDict[selfNode.nodeNum]].sibling = \ st.nodes[nodeNumDict[selfNode.sibling.nodeNum]] st.root.leftChild = st.nodes[nodeNumDict[dupeNode.nodeNum]] st.root.leftChild.parent = st.root else: # down i = 0 # nodeNum counter nodeNumDict = {} mostRecentDown = None for selfNode in dupeNode.iterDown(): n = Node() if selfNode != self.root: if doBrLens: n.br.len = selfNode.br.len if not selfNode.isLeaf and doSupport: if hasattr(selfNode.br, 'support'): n.br.support = selfNode.br.support else: print("Warning: dupeSubTree() doSupport is turned on, but node %i has no support." % selfNode.nodeNum) n.nodeNum = i nodeNumDict[selfNode.nodeNum] = i n.isLeaf = selfNode.isLeaf n.name = selfNode.name i += 1 st.nodes.append(n) for selfNode in dupeNode.iterDown(): # parents and siblings. if selfNode.parent: st.nodes[nodeNumDict[selfNode.nodeNum]].parent = \ st.nodes[nodeNumDict[selfNode.parent.nodeNum]] else: # Its the root st.root = st.nodes[nodeNumDict[selfNode.nodeNum]] st.root.br = None if selfNode.sibling: st.nodes[nodeNumDict[selfNode.nodeNum]].sibling = \ st.nodes[nodeNumDict[selfNode.sibling.nodeNum]] if selfNode == dupeNode: st.nodes[nodeNumDict[dupeNode.nodeNum]].leftChild = None st.nodes[nodeNumDict[dupeNode.nodeNum]].isLeaf = 1 else: if selfNode.leftChild: st.nodes[nodeNumDict[selfNode.nodeNum]].leftChild = \ st.nodes[nodeNumDict[selfNode.leftChild.nodeNum]] st.reRoot(nodeNumDict[dupeNode.nodeNum], moveInternalName=False) st.root.isLeaf = 1 if not st.root.name: st.root.name = 'subTreeRoot' st.setPreAndPostOrder() return st
[docs] def addSubTree(self, selfNode, theSubTree, subTreeTaxNames=None): """Add a subtree to a tree. The nodes from theSubTree are added to self.nodes, and theSubTree is deleted. If subTreeTaxNames is provided, fine, but if not this method can find them. Providing them saves a bit of time, I assume. """ # subTreeRootNode:0-------1:oldSubTreeRootNodeLeftChild # +-------1:A # oldSelfNodeParent:0 # +-------2:selfNode # +--------1:A # oldSelfNodeParent:0 # | +--------2:selfNode # +--------3:subTreeRootNode # +--------4:oldSubTreeRootNodeLeftChild # ========================================= # +-------1:selfNode # oldSelfNodeParent:0 # +-------2:A # +--------1:selfNode # +--------3:subTreeRootNode # oldSelfNodeParent:0 +--------4:oldSubTreeRootNodeLeftChild # | # +--------2:A assert selfNode in self.nodes assert selfNode.parent # its a root on a stick assert theSubTree.root.leftChild and not theSubTree.root.leftChild.sibling if not subTreeTaxNames: subTreeTaxNames = [n.name for n in theSubTree.iterLeavesNoRoot()] oldSelfNodeParent = selfNode.parent subTreeRootNode = theSubTree.root theSubTree.root = None oldSubTreeRootNodeLeftChild = subTreeRootNode.leftChild subTreeRootNode.parent = oldSelfNodeParent subTreeRootNode.leftChild = selfNode selfNode.parent = subTreeRootNode if oldSelfNodeParent.leftChild == selfNode: oldSelfNodeParent.leftChild = subTreeRootNode else: oldCh = oldSelfNodeParent.leftChild while oldCh.sibling != selfNode: oldCh = oldCh.sibling oldCh.sibling = subTreeRootNode if selfNode.sibling: subTreeRootNode.sibling = selfNode.sibling selfNode.sibling = oldSubTreeRootNodeLeftChild subTreeRootNode.isLeaf = 0 nSelfNodes = len(self.nodes) self.nodes += theSubTree.nodes theSubTree.nodes = [] selfNode.br.len = 0.1 subTreeRootNode.br = NodeBranch() subTreeRootNode.br.len = 0.1 self.taxNames += theSubTree.taxNames for i in range(nSelfNodes, len(self.nodes)): n = self.nodes[i] n.nodeNum = i # print # for n in self.nodes: # print n.nodeNum # print "self.taxNames is %s" % self.taxNames # print "subTreeTaxNames %s" % subTreeTaxNames if self.taxNames: self._taxNames += subTreeTaxNames if self._nTax: self._nTax += len(subTreeTaxNames) self.preAndPostOrderAreValid = False self.preOrder = None self.postOrder = None self.setPreAndPostOrder() del(theSubTree)
[docs] def addLeaf(self, attachmentNode, taxName): """Add a leaf to a tree The leaf is added to the branch leading to the attachmentNode. The attachmentNode should be a Node, not a node number or name. A new node is made on that branch, so actually two nodes are added to the tree. The new leaf node is returned. """ assert attachmentNode in self.nodes aNode = attachmentNode # self.draw() # print "attachmentNode is %i" % aNode.nodeNum #bNode = self.addNodeBetweenNodes(aNode, aNode.parent) aNodeP = attachmentNode.parent bNode = Node() bNode.nodeNum = len(self.nodes) self.nodes.append(bNode) bNode.parent = aNodeP bNode.leftChild = aNode aNode.parent = bNode if aNodeP.leftChild == aNode: aNodeP.leftChild = bNode else: oldCh = aNodeP.leftChild while oldCh.sibling != aNode: oldCh = oldCh.sibling oldCh.sibling = bNode if aNode.sibling: bNode.sibling = aNode.sibling aNode.sibling = None aNode.br.len = 0.1 bNode.br.len = 0.1 # self.draw() n = Node() n.nodeNum = len(self.nodes) n.name = taxName n.isLeaf = 1 oldSibling = attachmentNode.sibling n.parent = bNode aNode.sibling = n n.sibling = oldSibling self.nodes.append(n) if self.taxNames: # taxnames can be shared among trees, so check whether it has already been added if n.name not in self.taxNames: self.taxNames.append(n.name) self.preAndPostOrderAreValid = False self.preOrder = None self.postOrder = None self.getPreAndPostOrderAboveRoot() # self.dump(node=True) # self.draw() if self._nTax: self._nTax += 1 return n
[docs] def addSibLeaf(self, attachmentNode, taxName): """Add a leaf to a tree as a sibling, by specifying its parent. The leaf is added so that its parent is the specified node (ie attachmentNode), adding the node as a rightmost child to that parent. The attachmentNode should not be a leaf -- it must have children nodes, to which the new leaf can be added as a sibling. The new node is returned. """ assert attachmentNode in self.nodes assert not attachmentNode.isLeaf aNode = attachmentNode # self.draw() # print "attachmentNode is %i" % aNode.nodeNum n = Node() n.nodeNum = len(self.nodes) n.name = taxName n.isLeaf = 1 n.br.len = 0.1 rtMostCh = aNode.rightmostChild() rtMostCh.sibling = n n.sibling = None n.parent = aNode self.nodes.append(n) if self.taxNames: # taxnames can be shared among trees, so check whether it has already been added if n.name not in self.taxNames: self.taxNames.append(n.name) self.preAndPostOrderAreValid = False self.preOrder = None self.postOrder = None self.getPreAndPostOrderAboveRoot() # self.dump(node=True) # self.draw() if self._nTax: self._nTax += 1 return n
[docs] def subTreeIsFullyBifurcating(self, theNode, up=True): """Is theNode and everything above it (or below it) bifurcating? Arg *up* says whether its above or below. """ # don't use getNChildren() -- too slow! assert theNode != self.root if up: # Includes theNode, which we want for n in theNode.iterInternals(): # if n.getNChildren() != 2: # return False if n.leftChild and n.leftChild.sibling: if n.leftChild.sibling.sibling: return False else: return False return True else: # Includes theNode, which we do not want for n in theNode.iterDown(): if n != theNode: if not n.isLeaf: if n == self.root: # if n.getNChildren() != 3: # return False if n.leftChild and n.leftChild.sibling and n.leftChild.sibling.sibling: if n.leftChild.sibling.sibling.sibling: return False else: return False else: # if n.getNChildren() != 2: # return False if n.leftChild and n.leftChild.sibling: if n.leftChild.sibling.sibling: return False else: return False return True
[docs] def nni(self, upperNodeSpec=None): """Simple nearest-neighbor interchange. You specify an 'upper' node, via an upperNodeSpec, which as usual can be a node name, node number, or node object. If you don't specify something, a random node will be chosen for you. (This latter option might be a little slow if you are doing many of them, as it uses iterInternalsNoRoot(), but mostly it should be fast enough). The upper node has a parent -- the 'lower' node. One subtree from the upper node and one subtree from the lower node are exchanged. Both subtrees are chosen randomly. This works on biRooted trees also, preserving the biRoot. Neighbours are not allowed to be on both sides the the bifurcating root, and so exchanges are only allowed on one side of the tree or the other. """ gm = ["Tree.nni()"] if upperNodeSpec: # This makes sure that upperNode is part of self. upperNode = self.node(upperNodeSpec) else: # This allows candidates adjacent to the root, including the biRoot #candidates = [n for n in self.iterInternalsNoRoot()] # disallow candidates adjacent to the biRoot candidates = [] disallowed = [] selfIsBiRooted = self.isBiRoot() if selfIsBiRooted: disallowed = [self.root.leftChild, self.root.leftChild.sibling] for n in self.iterInternalsNoRoot(): if selfIsBiRooted: if n in disallowed: pass else: candidates.append(n) else: candidates.append(n) if not candidates: self.dump() self.draw() gm.append("No internal nodes?") raise P4Error(gm) upperNode = random.choice(candidates) #print(f"upperNode is {upperNode.nodeNum}") # Want the upperNode to have at least 2 children upperChildren = [n for n in upperNode.iterChildren()] if len(upperChildren) < 2: gm.append("upperNode needs to have at least 2 children.") raise P4Error(gm) if upperNode.parent: lowerNode = upperNode.parent else: gm.append("upperNode needs to have a parent node.") raise P4Error(gm) lowerChildren = [n for n in lowerNode.iterChildren() if n != upperNode] if 0: if lowerNode.parent: if len(lowerChildren) < 1: gm.append("The lower node has a parent.") gm.append( "It needs at least one more child besides the upperNode.") raise P4Error(gm) else: if len(lowerChildren) < 2: gm.append("The lower node does not have a parent.") gm.append( "It needs at least 2 children besides the upperNode.") raise P4Error(gm) if len(lowerChildren) < 1: gm.append( "The lower node needs at least one more child besides the upperNode.") raise P4Error(gm) upperSubTreeNode = random.choice(upperChildren) lowerSubTreeNode = random.choice(lowerChildren) upperSubTree = self.pruneSubTreeWithoutParent( upperSubTreeNode, allowSingleChildNode=True) lowerSubTree = self.pruneSubTreeWithoutParent( lowerSubTreeNode, allowSingleChildNode=True) self.reconnectSubTreeWithoutParent(upperSubTree, lowerNode) self.reconnectSubTreeWithoutParent(lowerSubTree, upperNode) self.setPreAndPostOrder()
[docs] def nni2(self, upperSubTreeNode, lowerSubTreeNode): """Simple nearest-neighbor interchange, with specified nodes to interchange. You specify an 'upper' node, via upperSubTreeNode, which as usual can be a node name, node number, or node object. You also specify a 'lower' node, via lowerSubTreeNode. The grand-parent of the upperSubTreeNode must exist and must be the parent of the lowerSubTreeNode. This works on biRooted trees also, preserving the biRoot. """ gm = ["Tree.nni2()"] # This makes sure that the two nodes are part of self. upperSubTreeNode = self.node(upperSubTreeNode) lowerSubTreeNode = self.node(lowerSubTreeNode) assert upperSubTreeNode.parent upperNode = upperSubTreeNode.parent assert upperSubTreeNode.parent.parent lowerNode = upperSubTreeNode.parent.parent assert lowerSubTreeNode.parent assert upperSubTreeNode.parent.parent == lowerSubTreeNode.parent upperSubTree = self.pruneSubTreeWithoutParent( upperSubTreeNode, allowSingleChildNode=True) lowerSubTree = self.pruneSubTreeWithoutParent( lowerSubTreeNode, allowSingleChildNode=True) self.reconnectSubTreeWithoutParent(upperSubTree, lowerNode) self.reconnectSubTreeWithoutParent(lowerSubTree, upperNode) self.setPreAndPostOrder()
[docs] def checkThatAllSelfNodesAreInTheTree(self, verbose=False, andRemoveThem=False): """Check that all self.nodes are actually part of the tree. Arg *andRemoveThem* will remove those nodes, renumber the nodes, and reset pre- and postOrder, and return None If *andRemoveThem* is not set (the default is not set) then this method returns the list of nodes that are in self.nodes but not in the tree. """ selfNodesSet = set(self.nodes) treeSet = set([n for n in self.iterNodes()]) inSelfNodesButNotInTree = list(selfNodesSet.difference(treeSet)) if verbose: if inSelfNodesButNotInTree: print("These nodes are in self.nodes, but not part of the tree.") for n in inSelfNodesButNotInTree: print(n.nodeNum) else: print("All nodes in self.nodes are also in the tree.") if inSelfNodesButNotInTree and andRemoveThem: for n in inSelfNodesButNotInTree: self.nodes.remove(n) for i in range(len(self.nodes)): self.nodes[i].nodeNum = i self.preOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) self.postOrder = numpy.array( [var.NO_ORDER] * len(self.nodes), numpy.int32) if len(self.nodes) > 1: self.setPreAndPostOrder() return return list(inSelfNodesButNotInTree)
[docs] def spr(self, pruneNode=None, above=True, graftNode=None): """Subtree pruning and reconnection. See also the Tree.randomSpr() method. It uses this method to do a random spr move. This only works on fully bifurcating trees. Doing spr moves would tend to break up polytomies anyway; pruning subtrees from a polytomy would require creation of new nodes. The subtree to be pruned might be pointing up or pointing down from a specified node. If the subtree is pointing up, the subtree to be pruned is specified by the appropriate child of the root of the subtree; the subtree would have a root-on-a-stick (Is monofurcating a proper word?) with the subtree root's single child being the specified node. If the subtree is pointing down, then the tree is re-rooted to the specified node to allow pruning of the subtree, now above the specified node, with the specified node as the root, including the subtree with the pre-re-rooting parent of the specified node. I'll draw that out. Lets say we want to prune the subtree below node 2 in this tree. That would include nodes 0, 1, 2, and 7. :: +--------1:A | | +---------3:B |--------2 0 | +--------5:C | +---------4 | +--------6:D | +--------7:E The way it is done in this method is to re-root at node 2, which is the specified node. Then the subtree including the pre-re-rooting parent of the specified node, ie node 0, is pruned. :: +--------3:B | | +--------5:C 2--------4 | +--------6:D | | +--------1:A +--------0 +--------7:E """ gm = ["Tree.spr()"] # This is only for fully bifurcating trees. if not self.isFullyBifurcating(): gm.append("This method is only for fully bifurcating trees.") raise P4Error(gm) pnNode = self.node(pruneNode) grNode = self.node(graftNode) # self.draw() self.deleteCStuff() if not above: preReRootingParent = pnNode.parent self.reRoot(pnNode) pnNode = preReRootingParent # Check for silliness assert pnNode != self.root assert grNode != self.root assert pnNode != grNode subTreeNodes = [n for n in pnNode.iterPreOrder()] pnNodeParnt = pnNode.parent subTreeNodes.append(pnNodeParnt) # print [n.nodeNum for n in subTreeNodes] if grNode in subTreeNodes: if above: gm.append("grNode %i is part of the pruned subtree from %i-%i up. No workee!" % ( grNode.nodeNum, pnNodeParnt.nodeNum, pnNode.nodeNum)) else: gm.append("grNode %i is part of the pruned subtree below %i-%i. No workee!" % ( grNode.nodeNum, pnNodeParnt.nodeNum, pnNode.nodeNum)) raise P4Error(gm) # Prune it from the tree. if pnNodeParnt == self.root: if grNode.parent == self.root: # as well, gm.append( "prune node and graft node both have root as parent -- ie same origin and destination.") raise P4Error(gm) # Check if removal of the subtree will result in only 2 taxa singles = 0 for ch in self.root.iterChildren(): if ch != pnNode: if not ch.leftChild: singles += 1 if singles == 2: gm.append( "Removing subtree at %i will leave only 2 taxa." % pnNode.nodeNum) raise P4Error(gm) newRoot = None for ch in self.root.iterChildren(): if ch != pnNode: if ch.leftChild: newRoot = ch break # print "rerooting to node %i" % newRoot.nodeNum self.reRoot(newRoot) # self.draw() # pnNodeParnt is not the root, and so we can be sure pnNodeParnt has a parent. # the pnNode is the left, middle, or right child of the parent if pnNodeParnt.leftChild == pnNode: # its the left child # remove the pnNodeParnt along with the subtree # (pnNodeParnt.parent.leftChild, (pnNode, pnNode.sibling)pnNodeParnt, pnNodeParnt.sibling)pnNodeParnt.parent; # +----------1:pnNodeParnt.parent.leftChild # | # | +-----------3:pnNode # pnNodeParnt.parent:0----------2:pnNodeParnt # | +-----------4:pnNode.sibling # | # +----------5:pnNodeParnt.sibling pnNode.sibling.parent = pnNodeParnt.parent if pnNodeParnt.parent.leftChild == pnNodeParnt: pnNodeParnt.parent.leftChild = pnNode.sibling elif pnNodeParnt.parent.leftChild.sibling == pnNodeParnt: pnNodeParnt.parent.leftChild.sibling = pnNode.sibling # if pnNodeParnt.parent is the root, it can have 3 children, and # maybe this ... else: pnNodeParnt.parent.leftChild.sibling.sibling = pnNode.sibling pnNode.sibling.sibling = pnNodeParnt.sibling #pnNode.sibling = None #pnNodeParnt.parnt = None else: # its a right child # (pnNodeParnt.parent.leftChild, (pnNodeLeftSib, pnNode)pnNodeParnt, pnNodeParnt.sibling)pnNodeParnt.parent; # +-----------1:pnNodeParnt.parent.leftChild # | # | +-----------3:pnNodeLeftSib # pnNodeParnt.parent:0-----------2:pnNodeParnt # | +-----------4:pnNode # | # +-----------5:pnNodeParnt.sibling pnNodeLeftSib = pnNodeParnt.leftChild pnNodeLeftSib.parent = pnNodeParnt.parent pnNodeLeftSib.sibling = pnNodeParnt.sibling if pnNodeParnt.parent.leftChild == pnNodeParnt: pnNodeParnt.parent.leftChild = pnNodeLeftSib elif pnNodeParnt.parent.leftChild.sibling == pnNodeParnt: pnNodeParnt.parent.leftChild.sibling = pnNodeLeftSib # if pnNodeParnt.parent is the root, it can have 3 children, and # maybe this ... else: pnNodeParnt.parent.leftChild.sibling.sibling = pnNodeLeftSib # To look at the pruned tree, before grafting ... if 0: print("removal of subtree at %i-%i gives .." % (pnNodeParnt.nodeNum, pnNode.nodeNum)) self._nTax = 0 self.preAndPostOrderAreValid = 0 # This won't work unless preAndPostOrderAreValid set to 0 self.draw() # Now graft it back on ... # (grNodeParnt.leftChild, grNode, grNode.sibling)grNodeParnt; # +-------1:grNodeParnt.leftChild # | # grNodeParnt:0-------2:grNode # | # +-------3:grNode.sibling # (grNode, grNode.sibling)grNodeParnt; # +-------1:grNode # grNodeParnt:0 # +-------2:grNode.sibling # (grNodeParnt.leftChild, grNodeParnt.leftChild.sibling, grNode)grNodeParnt; # +-------1:grNodeParnt.leftChild # | # grNodeParnt:0-------2:grNodeParnt.leftChild.sibling # | # +-------3:grNode grNodeParnt = grNode.parent pnNodeParnt.parent = grNodeParnt grNode.parent = pnNodeParnt pnNodeParnt.sibling = grNode.sibling grNode.sibling = pnNode pnNode.sibling = None grNode.sibling = pnNode pnNodeParnt.leftChild = grNode if grNodeParnt.leftChild == grNode: grNodeParnt.leftChild = pnNodeParnt elif grNodeParnt.leftChild.sibling == grNode: grNodeParnt.leftChild.sibling = pnNodeParnt else: grNodeParnt.leftChild.sibling.sibling = pnNodeParnt # To look at the tree after grafting ... if 0: print("grafting the subtree at grNode %i gives ..." % grNode.nodeNum) self._nTax = 0 self.preAndPostOrderAreValid = 0 # This won't work unless preAndPostOrderAreValid set to 0 self.draw() self.preAndPostOrderAreValid = 0 self.setPreAndPostOrder()
[docs] def randomSpr(self): """Do a random spr move. """ myAbove = random.choice([True, False]) tNodes = [n for n in self.iterNodesNoRoot()] while 1: try: pNode = random.choice(tNodes) except IndexError: # we have run out of choices, all were unsuitable. return tNodes.remove(pNode) if pNode == self.root: continue if myAbove: # iterDown() will return pNode, the root, and whatever else. # If whatever else is only 2 nodes, it won't work. nodesDown = [n2 for n2 in pNode.iterDown()] if len(nodesDown) <= 4: continue else: # iterPostOrder() ends with the pNode. We need at least the # pNode and 3 others, so total 3 is too few. nodesUp = [n2 for n2 in pNode.iterPostOrder()] # self.draw() # print "--", pNode.nodeNum, [n2.nodeNum for n2 in nodesUp] if len(nodesUp) <= 3: continue break if myAbove: #pNode = self.node(pNNum) possibles = [n2.nodeNum for n2 in pNode.iterDown() if n2 is not pNode and n2 is not pNode.parent and n2.parent is not pNode.parent] else: #pNode = self.node(pNNum) possibles = [n2.nodeNum for n2 in pNode.iterPreOrder() if n2.parent is not pNode and pNode is not n2] if 0: self.draw() print("above=%s, pNode %i, " % (myAbove, pNode.nodeNum), subtreeNodeNums) print(possibles) sys.exit() safety = 0 giveUp = 20 while 1: safety += 1 if safety >= giveUp: break gNNum = random.choice(possibles) if gNNum == pNode.nodeNum: continue if gNNum == self.root.nodeNum: continue # print "===", "pNode=%i" % pNode.nodeNum, "gNNum=%i" % gNNum, # subtreeNodeNums if pNode.parent == self.root and self.node(gNNum).parent == self.root: continue if not myAbove and self.node(gNNum).parent == pNode: continue break if safety >= giveUp: print("pNode is %i, above=%s" % (pNode.nodeNum, myAbove)) self.draw() raise P4Error # print("randomSpr() pruneNum %i, above=%s, graftNum %i" % (pNode.nodeNum,myAbove, gNNum)) self.spr(pruneNode=pNode, above=myAbove, graftNode=gNNum)
[docs] def inputTreesToSuperTreeDistances(self, inputTrees, doSd=True, doScqdist=True): """Return the topology distance between input trees and pruned self. Either or both of two distances, RF ('sd') and quartet ('scqdist') distances, are returned. See also the Trees method of the same name that does a bunch of them. """ for inTree in inputTrees: if not inTree.taxNames: inTree.taxNames = [n.name for n in inTree.iterLeavesNoRoot()] # inTree.makeSplitKeys() totalSd = 0 totalScqdist = 0 for inTree in inputTrees: sDupe = self.dupe() toRemove = [] for n in sDupe.iterLeavesNoRoot(): if n.name not in inTree.taxNames: toRemove.append(n) for n in toRemove: sDupe.removeNode(n) sDupe.taxNames = inTree.taxNames if doSd: rfDist = sDupe.topologyDistance( inTree, metric='sd', resetSplitKeySet=True) totalSd += rfDist if doScqdist: qd = sDupe.topologyDistance(inTree, metric='scqdist') totalScqdist += qd if doSd and doScqdist: return totalSd, totalScqdist elif doSd: return totalSd elif doScqdist: return totalScqdist
[docs] def resolvePolytomyAtNode(self, theNode, resolution=2): """Resolve the polytomy at theNode Resolve randomly, by joining up children with new nodes. After resolution, the theNode should have arg resolution children. """ if not theNode.leftChild: return children = [] p = theNode.leftChild while p: children.append(p) p = p.sibling while len(children) > resolution: rNode = random.choice(children) children.remove(rNode) # print(f" about to prune subtree at node {rNode.nodeNum}") rSubtree = self.pruneSubTreeWithoutParent(rNode) otherNode = random.choice(children) nBetween = self.addNodeBetweenNodes(theNode, otherNode) self.reconnectSubTreeWithoutParent(rSubtree, nBetween) self.preAndPostOrderAreValid = 0 self.setPreAndPostOrder() # self.draw() children = [] p = theNode.leftChild while p: children.append(p) p = p.sibling
[docs] def resolve(self, biRoot=False): """Randomly resolve all polytomies It does it in postOrder, calling Tree.resolvePolytomyAtNode() on internal nodes. It will work on a partially resolved tree, or on a comb tree. If biRoot is set to False, the default, then the final resolution will leave a triRoot'ed root. If biRoot is set to True, then the final resolution will leave a biRoot'ed tree. """ self.setPreAndPostOrder() for n in self.iterInternalsNoRootPostOrder(): self.resolvePolytomyAtNode(n, resolution=2) if biRoot: myRes = 2 else: myRes = 3 self.resolvePolytomyAtNode(self.root, resolution=myRes)
# resolvePolytomyAtNode() does self.setPreAndPostOrder() ################################################# write #################################################
[docs] def patristicDistanceMatrix(self): """Matrix of distances along tree path. This method sums the branch lengths between each pair of taxa, and puts the result in a DistanceMatrix object, which is returned. Self.taxNames is required. """ gm = ['Tree.patristicDistanceMatrix()'] if not self.taxNames: gm.append("No taxNames.") raise P4Error(gm) # The tree will be rearranged, so make a copy to play with, so # self is undisturbed. t = self.dupe() d = DistanceMatrix() d.names = self.taxNames d.dim = len(d.names) # print d.names d.matrix = [] for i in range(d.dim): d.matrix.append([0.0] * d.dim) for i in range(d.dim): n1 = t.node(d.names[i]) t.reRoot(n1, moveInternalName=False) for j in range(i + 1, d.dim): n2 = t.node(d.names[j]) # sum up distances between n1 and n2 sum = n2.br.len n2 = n2.parent while n2 != n1: sum = sum + n2.br.len n2 = n2.parent # print "Dist from %s to %s is %f" % (d.names[i], d.names[j], # sum) d.matrix[i][j] = sum d.matrix[j][i] = sum return d
[docs] def tPickle(self, fName=None): """Pickle self to a file with a 'p4_tPickle' suffix. If there is an attached Data object, it is not pickled. If there is an attached model object, it is pickled. Pointers to c-structs are not pickled. If fName is supplied, the file name becomes fName.p4_tPickle, unless fName already ends with .p4_tPickle. If fName is not supplied, self.name is used in fName's place. If neither is supplied, the pid is used as fName. If a file with the chosen name already exists, it is silently over-written! p4 can read a p4_tPickle file from the command line or using the read() function, as usual. (This would not be a good option for long-term storage, because if you upgrade p4 and the p4 Classes change a lot then it may become impossible to unpickle it. If that happens, you can use the old version of p4 to unpickle.) """ suffix = '.p4_tPickle' if not self.name and not fName: fName = '%s' % os.getpid() if fName: if fName.endswith(suffix): fN = fName else: fN = fName + suffix elif self.name: if self.name.endswith(suffix): fN = self.name else: fN = self.name + suffix f = open(fN, 'wb') pickle.dump(self.dupe(), f, pickle.HIGHEST_PROTOCOL) # pickle.dump(self, f, 1) # Don't do this -- has pointers that would # not have been malloc'ed! And data! f.close()
[docs] def writeNexus(self, fName=None, append=0, writeTaxaBlockIfTaxNamesIsSet=1, message=None): """Write the tree out in Nexus format, in a trees block. If fName is None, the default, it is written to sys.stdout. #NEXUS is written unless we are appending-- set append=1. If you want to write with a translation, use a Trees object. """ gm = ['Tree.writeNexus()'] if fName == None or fName == sys.stdout: f = sys.stdout if not append: f.write('#NEXUS\n\n') else: if append: if os.path.isfile(fName): try: f = open(fName, 'a') except IOError: gm.append("Can't open %s for appending." % fName) raise P4Error(gm) else: if 0: print("Tree.writeNexus()") print(" 'append' is requested,") print(" but '%s' is not a regular file (doesn't exist?)." \ % fName) print(" Writing to a new file instead.") try: f = open(fName, 'w') f.write('#NEXUS\n\n') except IOError: gm.append("Can't open %s for writing." % fName) raise P4Error(gm) else: try: f = open(fName, 'w') f.write('#NEXUS\n\n') except IOError: gm.append("Can't open %s for writing." % fName) raise P4Error(gm) if writeTaxaBlockIfTaxNamesIsSet and self.taxNames: f.write('begin taxa;\n') f.write(' dimensions ntax=%s;\n' % self.nTax) f.write(' taxlabels') for i in self.taxNames: f.write(' %s' % p4.func.nexusFixNameIfQuotesAreNeeded(i)) f.write(';\nend;\n\n') f.write('begin trees;\n') if message: f.write(' [%s\n ]\n' % message) if self.logLike: f.write(' [logLike for tree %s is %f]\n' % (self.name, self.logLike)) f.write(' tree %s = [&U] ' % p4.func.nexusFixNameIfQuotesAreNeeded(self.name)) if self.recipWeight: if self.recipWeight == 1: f.write('[&W 1] ') else: f.write('[&W 1/%i] ' % self.recipWeight) if hasattr(self, 'weight'): f.write('[&W %f] ' % self.weight) self.writeNewick(f) f.write('end;\n\n') if f != sys.stdout: f.close()
[docs] def write(self): """This writes out the Newick tree description to sys.stdout.""" self.writeNewick(sys.stdout)
[docs] def writePhylip(self, fName=None, withTranslation=0, translationHash=None, doMcmcCommandComments=0): """Write the tree in Phylip or Newick format. (This method is just a dupe of writeNewick(). Without the 'toString' or 'append' args.) fName may also be an open file object. """ self.writeNewick( fName, withTranslation, translationHash, doMcmcCommandComments)
[docs] def writeNewick(self, fName=None, withTranslation=0, translationHash=None, doMcmcCommandComments=0, toString=False, append=False, spaceAfterComma=True): """Write the tree in Newick, aka Phylip, format. This is done in a Nexus-oriented way. If taxNames have spaces or odd characters, they are single-quoted. There is no restriction on the length of the taxon names. A translationHash can be used. fName may also be an open file object. If 'toString' is turned on, then 'fName' should be None, and a Newick representation of the tree is returned as a string. The method 'writePhylip()' is the same as this, with fewer arguments. """ gm = ['Tree.writeNewick()'] sList = [] if withTranslation and not translationHash: gm.append("No translationHash.") raise P4Error(gm) if fName and toString: gm.append( "You cannot write to a file and string at the same time.") raise P4Error(gm) if doMcmcCommandComments: if not self.model: gm.append("No model attached to tree.") gm.append("Set doMcmcCommandComments=0") raise P4Error(gm) # print 'self.preAndPostOrderAreValid = %s' % # self.preAndPostOrderAreValid if not self.preAndPostOrderAreValid: self.setPreAndPostOrder() # print "self.preOrder = %s" % self.preOrder # Don't count un-used nodes. nNodes = len([n for n in self.iterNodes()]) # print "nNodes = %i" % nNodes if nNodes == 1: # print "Single node. isLeaf=%s, name=%s" % (self.root.isLeaf, # self.root.name) if self.root.isLeaf: if withTranslation: sList.append('%s' % translationHash[self.root.name]) elif self.root.name: sList.append( '%s' % p4.func.nexusFixNameIfQuotesAreNeeded(self.root.name)) else: sList.append('()') else: # Will this ever happen? gm.append( "Something is wrong. There is only one node, and it is not terminal.") raise P4Error(gm) elif nNodes > 1: writeBrLens = 0 for n in self.iterNodesNoRoot(): if n.br.len != 0.1: writeBrLens = 1 break stack = [] for n in self.iterPreOrder(): stack.append(n) if n.leftChild: sList.append('(') continue while len(stack): n1 = stack.pop() # print "stacklen=%i, n1 name=%s" % (len(stack), n1.name) if n1.isLeaf: if n1 == self.root: sList.append(')') if withTranslation: sList.append('%s' % (translationHash[n1.name])) else: if n1.name: sList.append( '%s' % p4.func.nexusFixNameIfQuotesAreNeeded(n1.name)) else: if n1 != self.root: gm.append("Terminal node with no name?") raise P4Error(gm) else: sList.append(')') if n1.name: sList.append( '%s' % p4.func.nexusFixNameIfQuotesAreNeeded(n1.name)) if writeBrLens: if n1 != self.root: sList.append(':%g' % n1.br.len) if doMcmcCommandComments: sList.append(self._getMcmcCommandComment(n1)) if n1.sibling: if spaceAfterComma: sList.append(', ') else: sList.append(',') break sList.append(';') if toString: return "".join(sList) # no newline elif fName == None: print("".join(sList)) # with default newline from print sList.append('\n') # now all will have a newline if isinstance(fName, str): if append: fName2 = open(fName, 'a') else: fName2 = open(fName, 'w') fName2.write(''.join(sList)) fName2.close() elif hasattr(fName, 'write'): fName.write(''.join(sList)) # Somebody else opened the fName, so somebody else can close it. else: gm.append("I don't understand (%s) passed to me to write to." % fName) raise P4Error(gm)
def _getMcmcCommandComment(self, theNode): sList = [' [&'] for pNum in range(self.model.nParts): mp = self.model.parts[pNum] if mp.nComps > 1: doWrite = True if mp.ndch2: if mp.ndch2_writeComps: doWrite = True else: doWrite = False if doWrite: sList.append(' c%i.%i' % (pNum, theNode.parts[pNum].compNum)) if theNode != self.root: if mp.nRMatrices > 1: doWrite = True if mp.ndrh2: if mp.ndrh2_writeRMatrices: doWrite = True else: doWrite = False if doWrite: sList.append(' r%i.%i' % (pNum, theNode.br.parts[pNum].rMatrixNum)) if mp.nGdasrvs > 1: sList.append(' g%i.%i' % (pNum, theNode.br.parts[pNum].gdasrvNum)) sList.append(']') if len(sList) == 2: # ie [&] only, eg ndrh2 but only 1 comp return "" return ''.join(sList)
[docs] def draw(self, showInternalNodeNames=1, addToBrLen=0.2, width=None, showNodeNums=1, partNum=0, model=None): """Draw the tree to the screen. This method makes a text drawing of the tree and writes it to sys.stdout. Arg addToBrLen adds, by default 0.2, to each branch length, to make the tree more legible. If you want the branch lengths more realistic, you can set it to zero, or better, use vector graphics for drawing the trees. Setting arg model aids in drawing trees with tree-hetero models. If the model characteristic (usually composition or rMatrix) differs over the tree, this method can draw it for you. See also :meth:`Tree.Tree.textDrawList`, which returns the drawing as a list of strings. See the method :meth:`Tree.Tree.setTextDrawSymbol`, which facilitates drawing different branches with different symbols. """ s = self.textDrawList(showInternalNodeNames=showInternalNodeNames, addToBrLen=addToBrLen, width=width, autoIncreaseWidth=True, showNodeNums=showNodeNums, partNum=partNum, model=model) print('\n'.join(s))
[docs] def textDrawList(self, showInternalNodeNames=1, addToBrLen=0.2, width=None, autoIncreaseWidth=True, showNodeNums=1, partNum=0, model=False): if len(self.nodes) == 0: return [''] elif len(self.nodes) == 1: if showNodeNums: return ['%i:%s' % (self.nodes[0].nodeNum, self.nodes[0].name)] else: return ['%s' % self.nodes[0].name] gm = ['Tree.textDrawList()'] if not self.preAndPostOrderAreValid: self.setPreAndPostOrder() p = TreePicture(self) p.fName = None p.width = width p.xScale = None p.yScale = 1 p.pointsPerLetter = 1 p.addToBrLen = addToBrLen p.textShowNodeNums = showNodeNums p.showInternalNodeNames = showInternalNodeNames if showNodeNums: p.nameOffset = 0 else: p.nameOffset = 1 p.xOrigin = 0.0 p.yOrigin = 0.0 p.textSize = 1 p.labelTextSize = 1 # If the width is not specified, make a guess if width == None: tLen = 0 # Longest number of horizontal sections to draw longestNameLen = 0 for n in self.iterNodes(): if n.isLeaf and n != self.root: if n.name: # This assumes short internal node name lengths if len(n.name) > longestNameLen: longestNameLen = len(n.name) thisLen = 0 n1 = n # print "x n1 is node %i" % n1.nodeNum while n1 != self.root: n1 = n1.parent # print "y n1 is node %i" % n1.nodeNum thisLen += 1 if thisLen > tLen: tLen = thisLen # print "tLen =", tLen # print "longestNameLen =", longestNameLen rootNameLen = 0 if self.root.name: rootNameLen = len(self.root.name) + 1 p.width = (10 * tLen) + rootNameLen + longestNameLen if p.width > 100: p.width = 100 # print "p.width =", p.width # sys.exit() # Make sure the names fit. if not autoIncreaseWidth: for n in self.iterNodes(): if n.isLeaf and n != self.root: if n.name and len(n.name) > p.width: gm.append( "There are long names, and the given width is not enough.") raise P4Error(gm) if model: try: partNum = int(partNum) except: gm.append("partNum arg should be an integer.") raise P4Error(gm) if not self.model: gm.append("If model arg is set, then self.model must exist.") raise P4Error(gm) if partNum < 0 or partNum >= self.model.nParts: gm.append( "Zero-based partNum %i is out of range of %s parts." % (partNum, self.model.nParts)) raise P4Error(gm) p.partNum = partNum p.doModel = 1 doComps = 1 doRMatrices = 1 if self.model.parts[partNum].nComps < 2: doComps = 0 if self.model.parts[partNum].nRMatrices < 2: doRMatrices = 0 if not doComps and not doRMatrices: p._setPos(autoIncreaseWidth) s = p.textString(returnAsList=True) s.append( "Both the composition of the model and the rate matrix are homogeneous in part %i.\n" % partNum) return s # First do compositions s = [] if doComps: if 0: if self.model.nParts > 1: print("Compositions for part %i" % partNum) else: print("\nCompositions\n------------") p.textDrawModelThing = var.TEXTDRAW_COMP p._setPos(autoIncreaseWidth) s = p.textString(returnAsList=True) # print s # Then do RMatrices if doRMatrices: if 0: if self.model.nParts > 1: print("RMatrices for part %i" % partNum) else: print("\nRMatrices\n---------") p.textDrawModelThing = var.TEXTDRAW_RMATRIX p._setPos(autoIncreaseWidth) if s: s += p.textString(returnAsList=True) else: s = p.textString(returnAsList=True) if self.model.parts[partNum].nComps == 1: s.append( "The composition of the model is homogeneous in part %i\n" % partNum) elif self.model.parts[partNum].nRMatrices == 1: s.append( "The rate matrix is homogeneous in part %i\n" % partNum) # Don't bother with GDASRV, yet return s else: p._setPos(autoIncreaseWidth) s = p.textString(returnAsList=True) return s
# outFileName=None, width=500, heightFactor=0.85, pointsPerLetter=6.0, # textSize=11, labelSize=9, putInternalNodeNamesOnBranches=0)
[docs] def eps(self, fName=None, width=500, putInternalNodeNamesOnBranches=0): """Make a basic eps drawing of self. The 'width' is in postscript points. By default, internal node names label the node, where the node name goes on the right of the node. You can make the node name label the branch by setting 'putInternalNodeNamesOnBranches'. """ gm = ['Tree.eps()'] if not self.preAndPostOrderAreValid: self.setPreAndPostOrder() p = TreePicture(self) p.addToBrLen = 0.0 p.width = width p.yScale = 17.0 p.xScale = None p.pointsPerLetter = 6.0 p.textSize = 11 p.labelTextSize = 8 p.putInternalNodeNamesOnBranches = putInternalNodeNamesOnBranches p.xOrigin = 5.0 p._setPos() s = p.vectorString() if not fName: if self.name: fName = '%s.eps' % self.name else: fName = '%i.eps' % os.getpid() # if not fName.endswith('.eps'): # fName = '%s.eps' % fName f = open(fName, 'w') f.write(s) f.close()
################################################# model ################################################# @property def data(self): """(property) The data object""" return self._data @data.setter def data(self, theData): """Sets self.data""" # Two cases. Either # 1. Self already has a data. # - deleteCStuff(), and replace the old data with the new data. # 2. Self has no data. # a. Has a model. So just check for compatibility. # b. Has no model, so has never seen a data before. complaintHead = 'Tree.data (property setter)' if self.name: complaintHead += ", tree '%s'" % self.name if self.fName: complaintHead += ", file %s" % self.fName gm = [complaintHead] if isinstance(theData, Data) or theData == None: pass else: gm.append("Set data only to a Data object, or None, ok?") raise P4Error(gm) if self.data or self.model: # Normally there won't be anything to delete, but you never know... self.deleteCStuff() if not theData: self._data = None return self._data = theData if self.model: # We have seen a data object before (otherwise we would not # have been able to set the model). Check for compatibility. # print("{gm[0]} self.model exits.") if not self.taxNames: gm.append( "Self has model, but no taxNames. Programming error.") raise P4Error(gm) # Check for same number of taxa treeNTax = len(self.taxNames) dataNTax = len(theData.taxNames) if treeNTax != dataNTax: gm.append("The number of taxa in the tree (%s)" % treeNTax) gm.append("is not the same as in the data (%s)" % dataNTax) raise P4Error(gm) # Check for mis-matched taxNames isBad = 0 for tn in self.taxNames: if tn not in theData.taxNames: isBad = 1 break for tn in theData.taxNames: if tn not in self.taxNames: isBad = 1 break if isBad: gm.append("TaxName mismatch between the tree and the data.") gm.append("Here they are, sorted to show mis-matches.") gm.append(" %25s %25s" % ('data', 'tree')) self.taxNames.sort() theData.taxNames.sort() for i in range(len(self.taxNames)): if theData.taxNames[i] == self.taxNames[i]: gm.append(" %25s %25s" % (theData.taxNames[i], self.taxNames[i])) else: gm.append(" %25s %25s ***" % (theData.taxNames[i], self.taxNames[i])) raise P4Error(gm) # Same number of parts if len(theData.parts) != self.model.nParts: gm.append("nParts mis-match. len(theData.parts)=%i, model.nParts=%i" % ( len(theData.parts), self.model.nParts)) raise P4Error(gm) # Check dims and symbols in the parts for pNum in range(self.model.nParts): if theData.parts[pNum].dim != self.model.parts[pNum].dim: gm.append("Parts dim mis-match.") raise P4Error(gm) if theData.parts[pNum].symbols != self.model.parts[pNum].symbols: gm.append("Parts symbols mis-match.") raise P4Error(gm) # Set seqNum for n in self.iterLeavesNoRoot(): if n.seqNum != self.taxNames.index(n.name): gm.append("seqNums do not match up with taxNames.") raise P4Error(gm) else: # no model self.setNewModel()
[docs] def setNewModel(self): theData = self.data assert self.data, "set the data first" # assert not self.model, "this method is to be used when there is no model" if self.model: self.model = None # When you do this method, self gets a suitable # model. We have here no model, so we may have never seen a # data before. Or we might have just lost the model. # Check for same number of taxa treeNTax = 0 treeTaxNames = [] for n in self.iterNodes(): if n.isLeaf: treeNTax += 1 treeTaxNames.append(n.name) dataNTax = len(theData.taxNames) if treeNTax != dataNTax: gm.append("The number of taxa in the tree (%s)" % treeNTax) gm.append("is not the same as in the data (%s)" % dataNTax) raise P4Error(gm) # Check for mis-matched taxNames isBad = 0 for tn in treeTaxNames: if tn not in theData.taxNames: isBad = 1 break for tn in theData.taxNames: if tn not in treeTaxNames: isBad = 1 break if isBad: gm.append("TaxName mismatch between the tree and the data.") gm.append("Here they are, sorted to show mis-matches.") gm.append(" %25s %25s" % ('data', 'tree')) treeTaxNames.sort() theData.taxNames.sort() for i in range(len(treeTaxNames)): if theData.taxNames[i] == treeTaxNames[i]: gm.append(" %25s %25s" % (theData.taxNames[i], treeTaxNames[i])) else: gm.append("*** %25s %25s" % (theData.taxNames[i], treeTaxNames[i])) raise P4Error(gm) # attach self.taxNames = theData.taxNames self._data = theData # print "_setData. len(theData.parts) = %s" % len(theData.parts) # calls self.deleteCStuff() self.model = Model(len(theData.parts)) for n in self.iterNodes(): if n.parts: n.parts = [] for i in range(self.model.nParts): n.parts.append(NodePart()) for n in self.iterNodes(): if n != self.root: n.br.parts = [] for i in range(self.model.nParts): n.br.parts.append(NodeBranchPart()) # Set modelPart dims and symbols for pNum in range(self.model.nParts): self.model.parts[pNum].dim = theData.parts[pNum].dim self.model.parts[pNum].symbols = theData.parts[pNum].symbols # There is intentionally no default pInvar, forcing the user to be # explicit. for p in self.model.parts: p.pInvar = None # Set seqNum for n in self.iterNodes(): if n.isLeaf: n.seqNum = self.taxNames.index(n.name)
@property def model(self): """(property) The model object""" return self._model @model.setter def model(self, theModel): gm = ['Tree.model (property setter)'] # print gm[0] # print " Got '%s'" % theModel if isinstance(theModel, Model) or theModel == None: pass else: gm.append("Attempt to set Tree.model to '%s'. " % theModel) gm.append( "Don't set the model to anything other than 'None' or a Model, ok? ") gm.append("(And generally the user only sets it to None.) ") raise P4Error(gm) if self.model or self.data: self.deleteCStuff() # print 'Tree._setModel() finished deleteCStuff()' self._model = theModel @model.deleter def model(self): gm = ['Tree.model'] gm.append("Caught an attempt to delete self.model, but") gm.append("self.model is a property, so you shouldn't delete it.") gm.append("But you can set it to None if you like.") raise P4Error(gm) def _checkModelThing(self, partNum, symbol, complaintHead): gm = [complaintHead] if not self.data: gm.append("No data. Set the data first.") raise P4Error(gm) if not self.model: # When you set the self.data, a model object of suitable dimensions # is made and attached to self. If we have got here, it is # because the model has subsequently been lost. So just # re-instate it. self.setNewModel() if partNum < 0 or partNum >= self.model.nParts: gm.append("Zero-based partNum (%s) is out of range (of %s parts)" % (partNum, self.model.nParts)) raise P4Error(gm) if symbol: if not isinstance(symbol, str) or len(symbol) != 1: gm.append("Symbols must be 1-length strings.") raise P4Error(gm) if symbol == '?': gm.append("Got assigned text drawing symbol '?'.") gm.append("Don't use it-- it is reserved for missing model components") raise P4Error(gm)
[docs] def newComp(self, partNum=0, free=0, spec='empirical', val=None, symbol=None): """Make, attach, and return a new Comp object. The arg *spec* should be a string, one of:: 'equal' no val 'empirical' no val 'specified' val=[aList] 'wag', etc no val (ie one of the empirical protein models, including cpREV, d78, jtt, mtREV24, mtmam, wag, etc) If spec='specified', then you specify dim or dim-1 values in a list as the 'val' arg. This method returns a Comp object, which you can ignore if it is a tree-homogeneous model. However, if it is a tree-hetero model then you may want to get that Comp object so that you can place it on the tree explicitly with setModelComponentOnNode(), like this:: c0 = newComp(partNum=0, free=1, spec='empirical') c1 = newComp(partNum=0, free=1, spec='empirical') myTree.setModelComponentOnNode(c0, node=myTree.root, clade=1) myTree.setModelComponentOnNode(c1, node=5, clade=1) myTree.setModelComponentOnNode(c1, node=18, clade=0) Alternatively, you can simply let p4 place them randomly:: newComp(partNum=0, free=1, spec='empirical') newComp(partNum=0, free=1, spec='empirical') myTree.setModelComponentsOnNodesRandomly() Calculation of probability matrices for likelihood calcs etc are wrong when there are any comp values that are zero, so that is not allowed. Any zeros are converted to var.PIVEC_MIN, which is 1e-13 this week. Hopefully close enough to zero for you. """ gm = ['Tree.newComp()'] self._checkModelThing(partNum, symbol, gm[0]) if self.model.cModel: self.deleteCStuff() mt = Comp() mt.partNum = partNum mt.free = free # spec if spec not in var.compSpecs: gm.append("The spec should be one of %s" % var.compSpecs) raise P4Error(gm) mt.spec = spec mt.num = len(self.model.parts[partNum].comps) if symbol: mt.symbol = symbol else: try: mt.symbol = var.modelSymbols[mt.num] except IndexError: mt.symbol = '-' self.model.parts[partNum].comps.append(mt) # assign val dim = self.model.parts[partNum].dim if spec == 'equal': mt.val = numpy.ones(dim, dtype=numpy.double) / dim elif spec == 'empirical': assert mt.val is None elif spec == 'specified': # print(f"Got val {val}, type {type(val)}") # if val == None or val == []: # gm.append("Specified comp, but no val.") # raise P4Error(gm) try: val = list(val) except TypeError: gm.append("Comp is 'specified', but bad 'val' arg.") gm.append("The 'val' arg should be a list or tuple.") raise P4Error(gm) if len(val) == dim or len(val) == dim - 1: pass else: gm.append("Bad length for val arg (%i). Should be dim or dim-1 long." % len(val)) gm.append("(Dim for this part is %i)" % dim) raise P4Error(gm) # I allow val's of dim or dim-1. if len(val) == dim - 1: lastVal = 1.0 - sum(val) if lastVal > 0.0: val = val + [1.0 - sum(val)] else: gm.append("Bad comp vals %s" % val) gm.append("sum to 1.0 or more.") raise P4Error(gm) else: # len = dim theSum = sum(val) theDiff = math.fabs(theSum - 1.0) # How big to make the delta? With reasonably good, normalized # protein comps (where all the values had just been divided by # the total, so it should have summed to 1.0 at that point) I # kept getting 1.1e-16. So make it 5.e-16. No, too small. # With protein I am getting diffs of 8.88178e-16, so make the # cutoff 9e-16 if theDiff > 9.e-16: # 1e-17 was too small for protein gm.append("Bad comp vals %s" % val) gm.append("does not sum to 1.0") gm.append("The sum = %f" % theSum) gm.append("abs(1.0 - theSum) = %g" % theDiff) raise P4Error(gm) # Are any specified values less than PIVEC_MIN? needsNormalizing = 0 for i in range(len(val)): thisVal = val[i] if thisVal < var.PIVEC_MIN: print(gm[0]) print(" Specifying a comp of zero for a character is not allowed.") print(" The minimum is %g" % var.PIVEC_MIN) myVal = (1.5 + random.random()) * var.PIVEC_MIN print(" Re-setting to %g" % myVal) val[i] = myVal needsNormalizing = 1 if needsNormalizing: theSum = sum(val) for i in range(len(val)): val[i] /= theSum if math.fabs(sum(val) - 1.0) > 5.e-16: gm.append("Bad comp vals %s" % val) gm.append("does not sum to 1.0") raise P4Error(gm) # print "sum(val) - 1.0 = %f (%g)" % (sum(val) - 1.0, sum(val) - # 1.0) mt.val = val elif spec in var.rMatrixProteinSpecs: mt.val = p4.func.getProteinEmpiricalModelComp(spec) return mt
[docs] def newRMatrix(self, partNum=0, free=0, spec='ones', val=None, symbol=None): """Make, attach, and return a new RMatrix instance. spec should be one of: - 'ones' - for JC, poisson, F81 - '2p' - for k2p and hky - 'specified' - 'cpREV' - 'd78' - 'jtt' - 'mtREV24' - 'mtmam' - 'wag' - 'rtRev' - 'tmjtt94' - 'tmlg99' - 'lg' - 'blosum62' - 'hivb' - 'mtart' - 'mtzoa' - 'gcpREV' - 'stmtREV' - 'vt' - 'pmb' See var.rMatrixProteinSpecs You do not set the 'val' arg unless the spec is 'specified' or '2p'. If spec='2p', then you set val to kappa. If the spec is 'specified', you specify all the numerical values in a list given as the 'val' arg. The length of that list will be (((dim * dim) - dim) / 2), so for DNA, where dim=4, you would specify a list containing 6 numbers. """ # not implemented: # 'blosum62a' # 'blosum62b' # 'phat70' complaintHead = '\nTree.newRMatrix()' gm = [complaintHead] self._checkModelThing(partNum, symbol, complaintHead) if self.model.cModel: self.deleteCStuff() mt = RMatrix() mt.partNum = partNum #mt.dim = self.data.parts[partNum].dim mt.free = free if spec not in var.rMatrixSpecs: gm.append("Got unknown rMatrix spec '%s'." % spec) gm.append("Should be one of: %s" % var.rMatrixSpecs) raise P4Error(gm) mt.spec = spec mt.num = len(self.model.parts[partNum].rMatrices) if symbol: mt.symbol = symbol else: try: mt.symbol = var.modelSymbols[mt.num] except IndexError: mt.symbol = '-' self.model.parts[partNum].rMatrices.append(mt) # assign val dim = self.model.parts[partNum].dim if var.rMatrixNormalizeTo1: goodLen = int((((dim * dim) - dim) / 2)) else: goodLen = int((((dim * dim) - dim) / 2) - 1) v = None if spec == 'specified': if val: # should check that values are all floats if len(val) == goodLen: v = numpy.array(val, dtype=numpy.double) if var.rMatrixNormalizeTo1: v /= v.sum() elif var.rMatrixNormalizeTo1 and len(val) == goodLen - 1: gm.append("var.rMatrixNormalizeTo1 is set, val length should be %i, got %i" % ( goodLen, len(val))) raise P4Error(gm) else: gm.append("Bad length for arg val. Length %i, should be %i" % (len(val), goodLen)) raise P4Error(gm) else: gm.append("spec is 'specified', but there are no specified rMatrix values.") gm.append("Specify rMatrix values by eg val=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]") raise P4Error(gm) elif spec == 'ones': v = numpy.array([1.0] * goodLen, dtype=numpy.double) if var.rMatrixNormalizeTo1: v /= v.sum() elif spec == '2p': try: v = float(val) except (ValueError, TypeError): gm.append("Kappa ('val' arg) should be a float. Setting to 2.0") v = 2.0 if v < var.KAPPA_MIN: gm.append("Kappa is too small. Setting to %f" % var.KAPPA_MIN) v = var.KAPPA_MIN elif v > var.KAPPA_MAX: gm.append("Kappa is too big. Setting to %f" % var.KAPPA_MAX) v = var.KAPPA_MAX v = numpy.array([v], dtype=numpy.double) elif spec in var.rMatrixProteinSpecs: if self.data.parts[partNum].dataType != 'protein': gm.append("A protein matrix has been specified, but the dataType for part %i is %s." % ( partNum, self.data.parts[partNum].dataType)) raise P4Error(gm) if free: gm.append('The rMatrix should not be free if it is an empirical protein matrix.') raise P4Error(gm) else: gm.append(f"I don't know spec '{spec}'") gm.append(f"Programming error") raise P4Error(gm) mt.val = v # type numpy.ndarray, or None for specified protein return mt
[docs] def newGdasrv(self, partNum=0, free=0, val=None, symbol=None): gm = ['Tree.newGdasrv()'] if not self.model: gm.append("Set the data first. Eg myTree.data = Data()") raise P4Error(gm) if self.model.cModel: self.deleteCStuff() # check if there is an nGammaCat > 1: if self.model.parts[partNum].nGammaCat == 1: gm.append("For this part (%s), the number of nGammaCat has been set to 1." % partNum) gm.append("So gdasrv won't work.") gm.append("You can set the nGammaCat with yourTree.setNGammaCat(partNum=x, nGammaCat=y)") raise P4Error(gm) # check val if val == None: gm.append("Please specify a val, a positive float.") raise P4Error(gm) try: v = float(val) except: gm.append("Arg val must be a float. Got '%s'" % val) raise P4Error(gm) # This week, we have in defines.h # define GAMMA_SHAPE_MIN 0.000001 # define GAMMA_SHAPE_MAX 300.0 if v <= 0.000001 or v >= 300.0: gm.append("Arg val must be between 0.000001 and 300. Got %f" % v) raise P4Error(gm) self._checkModelThing(partNum, symbol, gm[0]) mt = Gdasrv() mt.nGammaCat = self.model.parts[partNum].nGammaCat mt.partNum = partNum mt.free = free # no spec or dim mt.num = len(self.model.parts[partNum].gdasrvs) if symbol: mt.symbol = symbol else: mt.symbol = var.modelSymbols[mt.num] self.model.parts[partNum].gdasrvs.append(mt) mt.freqs = numpy.zeros(mt.nGammaCat, dtype=numpy.double) mt.rates = numpy.zeros(mt.nGammaCat, dtype=numpy.double) mt._val[0] = v mt.calcRates() return mt
[docs] def setPInvar(self, partNum=0, free=0, val=0.0): complaintHead = '\nTree.setPInvar()' gm = [complaintHead] # check val try: v = float(val) except: gm.append("Arg val must be a float. Got '%s'" % val) raise P4Error(gm) if v < 0.0 or v >= 1.0: gm.append( "Arg val must be zero or more, and less than 1. Got %f" % v) raise P4Error(gm) self._checkModelThing(partNum, None, complaintHead) if self.model.cModel: self.deleteCStuff() mt = PInvar() mt.partNum = partNum mt.free = free mt.val = v self.model.parts[partNum].pInvar = mt
[docs] def setRelRate(self, partNum=0, val=0.0): complaintHead = '\nTree.setRelRate()' gm = [complaintHead] # check val try: v = float(val) except: gm.append("Arg val must be a float. Got '%s'" % val) raise P4Error(gm) if v <= 0.0 or v >= 1000.0: gm.append( "Arg val must be more than zero, and less than 1000 (arbitrarily). Got %f" % v) raise P4Error(gm) self._checkModelThing(partNum, None, complaintHead) if self.model.cModel: self.deleteCStuff() self.model.parts[partNum].relRate = v
[docs] def setModelComponentOnNode(self, theModelComponent, node=None, clade=1): complaintHead = '\nTree.setModelComponentOnNode()' gm = [complaintHead] if theModelComponent and \ (isinstance(theModelComponent, Comp) or isinstance(theModelComponent, RMatrix) or isinstance(theModelComponent, Gdasrv)): pass else: gm.append("Expecting a model component instance of some sort.") gm.append("Ie a comp, rMatrix, or gdasrv, instance.") gm.append("Got theModelComponent = %s" % theModelComponent) raise P4Error(gm) if self.model.cModel: self.deleteCStuff() partNum = theModelComponent.partNum if node == None: theNode = self.root else: theNode = self.node(node) isBad = 0 if isinstance(theModelComponent, Comp): if theModelComponent != self.model.parts[partNum].comps[theModelComponent.num]: isBad = 1 elif isinstance(theModelComponent, RMatrix): if theModelComponent != self.model.parts[partNum].rMatrices[theModelComponent.num]: isBad = 1 elif isinstance(theModelComponent, Gdasrv): if theModelComponent != self.model.parts[partNum].gdasrvs[theModelComponent.num]: isBad = 1 else: # This will never happen-- we checked above. Overkill. gm.append("I don't recognise theModelComponent.") raise P4Error(gm) if isBad: gm.append("The model component can only be set on the tree that made it.") raise P4Error(gm) # For the root, we set comps and nothing else. For other nodes we # set anything. if theNode == self.root: if isinstance(theModelComponent, Comp): theNode.parts[partNum].compNum = theModelComponent.num else: if isinstance(theModelComponent, Comp): #print(f"node {theNode.nodeNum}, partNum {partNum}, comp num {theModelComponent.num}") theNode.parts[partNum].compNum = theModelComponent.num elif isinstance(theModelComponent, RMatrix): theNode.br.parts[partNum].rMatrixNum = theModelComponent.num elif isinstance(theModelComponent, Gdasrv): theNode.br.parts[partNum].gdasrvNum = theModelComponent.num if clade: aboves = self.getNodeNumsAbove(theNode, leavesOnly=0) for i in aboves: if isinstance(theModelComponent, Comp): self.nodes[i].parts[partNum].compNum = theModelComponent.num elif isinstance(theModelComponent, RMatrix): self.nodes[i].br.parts[ partNum].rMatrixNum = theModelComponent.num elif isinstance(theModelComponent, Gdasrv): self.nodes[i].br.parts[ partNum].gdasrvNum = theModelComponent.num
[docs] def setModelThingsRandomly(self, forceRepresentation=2): """This method has been renamed to setModelComponentsOnNodesRandomly""" gm = ["Tree.setModelThingsRandomly() has been renamed Tree.setModelComponentsOnNodesRandomly()"] raise P4Error(gm)
[docs] def setModelComponentsOnNodesRandomly(self, forceRepresentation=2): """Place model components (semi-)randomly on the tree. For example, if there are 2 compositions in model part partNum, this method will decorate each node of the tree with zeros and ones, randomly. The actual component set is node.parts[partNum].compNum. If the model is homogeneous, it will just put zeros in all the nodes. We want to have each model component on the tree somewhere, and so it is not really randomly set. If the model component numbers were assigned randomly on the tree, it may occur that some model component numbers by chance would not be represented. This is not allowed, and you can set forceRepresentation to some positive integer, 1 or more. That number will be the lower limit allowed on the number of nodes that get assigned the model component number. For example, if forceRepresentation is set to 2, then each model component must get assigned to at least 2 nodes.""" gm = ['Tree.setModelComponentsOnNodesRandomly()'] if not self.model or not self.model.nParts: gm.append("No model parts?") raise P4Error(gm) if self.model.cModel: self.deleteCStuff() # self.model.dump() if not isinstance(forceRepresentation, int) or forceRepresentation < 1: gm.append("Arg 'forceRepresentation' should be 1 or more.") gm.append("Got forceRepresentation = %s" % forceRepresentation) raise P4Error(gm) for i in self.preOrder: if i == var.NO_ORDER: gm.append("This method does not work if any nodes are not used in the tree.") raise P4Error(gm) for pNum in range(self.model.nParts): mp = self.model.parts[pNum] # First do comps if mp.nComps == 1: for n in self.nodes: n.parts[pNum].compNum = 0 elif mp.nComps > 1: if mp.ndch2: pass else: nNodes = len(self.nodes) if (mp.nComps * forceRepresentation) > nNodes: gm.append("Part %i" % pNum) gm.append( "There are not enough nodes (%i) to put %i" % (nNodes, mp.nComps)) gm.append( "comps on at least forceRepresentation (%i) nodes." % forceRepresentation) raise P4Error(gm) nList = self.nodes[:] random.shuffle(nList) # get the forceRepresentation out of the way first for mtNum in range(mp.nComps): for fr in range(forceRepresentation): n = nList.pop() n.parts[pNum].compNum = mtNum # Now do the rest for n in nList: n.parts[pNum].compNum = random.randrange(mp.nComps) else: gm.append("No comps in part %i" % pNum) raise P4Error(gm) # Second do rMatrices if mp.nRMatrices == 1: for n in self.nodes: if n != self.root: n.br.parts[pNum].rMatrixNum = 0 elif mp.nRMatrices > 1: if mp.ndrh2: pass else: nNodes = len(self.nodes) - 1 if (mp.nRMatrices * forceRepresentation) > nNodes: gm.append("Part %i" % pNum) gm.append( "There are not enough nodes (%i) to put %i" % (nNodes, mp.nRMatrices)) gm.append( "rMatrices on at least forceRepresentation (%i) nodes." % forceRepresentation) raise P4Error(gm) nList = self.nodes[:] nList.remove(self.root) random.shuffle(nList) # get the forceRepresentation out of the way first for mtNum in range(mp.nRMatrices): for fr in range(forceRepresentation): n = nList.pop() n.br.parts[pNum].rMatrixNum = mtNum # Now do the rest for n in nList: n.br.parts[pNum].rMatrixNum = random.randrange( mp.nRMatrices) else: gm.append("No rMatrices in part %i" % pNum) raise P4Error(gm) # Third do gdasrvs if mp.nGammaCat > 1: if mp.nGdasrvs == 1: for n in self.nodes: if n != self.root: n.br.parts[pNum].gdasrvNum = 0 elif mp.nGdasrvs > 1: nNodes = len(self.nodes) - 1 if (mp.nGdasrvs * forceRepresentation) > nNodes: gm.append("Part %i" % pNum) gm.append( "There are not enough nodes (%i) to put %i" % (nNodes, mp.nGdasrvs)) gm.append( "gdasrvs on at least forceRepresentation (%i) nodes." % forceRepresentation) raise P4Error(gm) nList = self.nodes[:] nList.remove(self.root) random.shuffle(nList) # get the forceRepresentation out of the way first for mtNum in range(mp.nGdasrvs): for fr in range(forceRepresentation): n = nList.pop() n.br.parts[pNum].gdasrvNum = mtNum # Now do the rest for n in nList: n.br.parts[pNum].gdasrvNum = random.randrange( mp.nGdasrvs) else: gm.append( "No gdasrvs in part %i and yet nGammaCat > 1" % pNum) raise P4Error(gm)
# self.dump(model=True)
[docs] def setModelComponentsNNodes(self): """Set nNodes for all model components""" gm = ['Tree.setModelComponentsNNodes()'] if not self.model or not self.model.nParts: gm.append("No model parts?") raise P4Error(gm) for pNum in range(self.model.nParts): mp = self.model.parts[pNum] if not mp.nComps: gm.append("No comps in model part %i." % pNum) raise P4Error(gm) elif not mp.nRMatrices: gm.append("No rMatrices in model part %i." % pNum) raise P4Error(gm) for pNum in range(self.model.nParts): mp = self.model.parts[pNum] # First do comps if mp.nComps == 1: pass elif mp.nComps > 1: for mtNum in range(mp.nComps): mp.comps[mtNum].nNodes = 0 for n in self.iterNodes(): mp.comps[n.parts[pNum].compNum].nNodes += 1 # Second do rMatrices if mp.nRMatrices == 1: pass elif mp.nRMatrices > 1: for mtNum in range(mp.nRMatrices): mp.rMatrices[mtNum].nNodes = 0 for n in self.iterNodesNoRoot(): mp.rMatrices[n.br.parts[pNum].rMatrixNum].nNodes += 1 # Third do gdasrvs if mp.nGammaCat > 1: if mp.nGdasrvs == 1: pass elif mp.nGdasrvs > 1: for mtNum in range(mp.nGdasrvs): mp.gdasrvs[mtNum].nNodes = 0 for n in self.iterNodesNoRoot(): mp.gdasrvs[n.br.parts[pNum].gdasrvNum].nNodes += 1 else: gm.append("No gdasrvs in part %i" % pNum) raise P4Error(gm)
[docs] def summarizeModelComponentsNNodes(self): """Summarize nNodes for all model components if isHet""" gm = ['Tree.summarizeModelComponentsNNodes()'] if not self.model or not self.model.nParts: gm.append("No model parts?") raise P4Error(gm) if not self.model.isHet: gm.append("This method is for hetero models") raise P4Error(gm) for pNum in range(self.model.nParts): mp = self.model.parts[pNum] if not mp.nComps: gm.append("No comps in model part %i." % pNum) raise P4Error(gm) elif not mp.nRMatrices: gm.append("No rMatrices in model part %i." % pNum) raise P4Error(gm) for pNum in range(self.model.nParts): print("\n%6s %s:" % ("Part", pNum)) mp = self.model.parts[pNum] # First do comps if mp.nComps == 1: pass elif mp.ndch2: print("%16s" % "ndch2 is on") elif mp.nComps > 1: for mtNum in range(mp.nComps): # print " comp %i nNodes=%i" % (mtNum, # mp.comps[mtNum].nNodes) print("%16s %i %s = %i" % ("composition", mtNum, "nNodes", mp.comps[mtNum].nNodes)) # Second do rMatrices if mp.nRMatrices == 1: pass elif mp.ndrh2: print("%16s" % "ndrh2 is on") elif mp.nRMatrices > 1: for mtNum in range(mp.nRMatrices): print("%16s %i %s = %i" % ("rate matrix", mtNum, "nNodes", mp.rMatrices[mtNum].nNodes)) # Third do gdasrvs if mp.nGammaCat > 1: if mp.nGdasrvs == 1: pass elif mp.nGdasrvs > 1: for mtNum in range(mp.nGdasrvs): print(" gdasrv %i nNodes =%i" % (mtNum, mp.gdasrvs[mtNum].nNodes)) else: gm.append("No gdasrvs in part %i" % pNum) raise P4Error(gm)
[docs] def setTextDrawSymbol(self, theSymbol='-', node=None, clade=1): gm = ['\nTree.setTextDrawString()'] if not theSymbol or not isinstance(theSymbol, str) or len(theSymbol) != 1: gm.append("theSymbol should be a single character string.") raise P4Error(gm) if not node: theNode = self.root else: theNode = self.node(node) if theNode == self.root: pass else: theNode.br.textDrawSymbol = theSymbol if clade: aboves = self.getNodeNumsAbove(theNode, leavesOnly=0) for i in aboves: self.nodes[i].br.textDrawSymbol = theSymbol
[docs] def setNGammaCat(self, partNum=0, nGammaCat=1): gm = ['\nTree.setNGammaCat()'] if not self.data or not self.model: gm.append("No data?") raise P4Error(gm) if self.model.cModel: self.deleteCStuff() if partNum < 0 or partNum >= self.model.nParts: gm.append("PartNum %s is out of range of %s parts." % (partNum, self.model.nParts)) raise P4Error(gm) try: x = int(nGammaCat) except ValueError: gm.append("'%s' does not appear to be an integer." % i) raise P4Error(gm) if x < 1: gm.append("nGammaCat should not be less than 1.") raise P4Error(gm) elif x > 16: gm.append("nGammaCat '%s' exceeds the arbitrary limit of 16." % x) raise P4Error(gm) self.model.parts[partNum].nGammaCat = nGammaCat
[docs] def modelSanityCheck(self, resetEmpiricalComps=True): """Check that the tree, data, and model specs are good to go. Complain and exit if there is anything wrong that might prevent a likelihood evaluation from being done. We are assuming that a data object exists and is attached, and that model stuff has been set. Check that each part has at least 1 each from comps, rMatrices, and gdasrvs (if nGammaCat is > 1). Check that each node has a comp, rMatrix, and gdasr. Check that all comps, rMatrices, gdasrvs are used on a node somewhere. Here relRate, ie the relative rate of each data partition, is adjusted based on the size of the data partitions. newRelRate_p = oldRelRate_p * (Sum_p[oldRelRate_i * partLen_i] / Sum[partLen_i]) That ensures that Sum(relRate_i * partLen_i) = totalDataLength, ie that the weighted mean of the rates is 1.0. This method also tallies up the number of free prams in the whole model, and sets self.model.nFreePrams. """ complaintHead = '\nTree.modelSanityCheck()' gm = [complaintHead] # print("\nTree.modelSanityCheck() here. self.model.nParts=%s" % self.model.nParts) # print("\nTree.modelSanityCheck() here. resetEmpiricalComps=%s" % resetEmpiricalComps) isBad = 0 complaints = [] if not self.data: complaints.append(' No data.') isBad = 1 if not self.model: complaints.append(' No model.') isBad = 1 # Set isHet. for pNum in range(self.model.nParts): mp = self.model.parts[pNum] mp.isHet = 0 if mp.nComps > 1 or mp.nRMatrices > 1: mp.isHet = 1 if mp.nGammaCat > 1 and mp.nGdasrvs > 1: mp.isHet = 1 # This week ndch2 does not play well with other hetero models like ndch, # so insist that all partitions are or are not ndch2. firstPartIsNdch2 = self.model.parts[0].ndch2 for pNum in range(self.model.nParts): mp = self.model.parts[pNum] if mp.ndch2 != firstPartIsNdch2: complaints.append(" Can't mix ndch2 with non-ndch2 models.") isBad = 1 # Check that all parts have all the required stuff. Make a list # of errors. If there is something missing or wrong, don't die # right away, but add the problem to the list, and write it all # out at the end. It gives the user a chance to fix more than one # error at a time. for pNum in range(self.model.nParts): complaints.append(' Part %i' % pNum) partIsBad = 0 mp = self.model.parts[pNum] # Check if essential things have been set if not mp.nComps: complaints.append(' No comps in part %s' % pNum) partIsBad = 1 if not mp.nRMatrices: complaints.append(' No rMatrices in part %s' % pNum) partIsBad = 1 if mp.nGammaCat > 1: if not mp.nGdasrvs: complaints.append(' No gdasrvs in part %s' % pNum) partIsBad = 1 if mp.nGammaCat == 1: if mp.nGdasrvs: complaints.append( ' There should be no gdasrvs in part %s, with nGammaCat=1' % pNum) partIsBad = 1 if not mp.pInvar: complaints.append(' No pInvar in part %s' % pNum) partIsBad = 1 if mp.ndch2: if mp.nComps != len(list(self.iterNodes())): complaints.append('Part %i, ndch2 needs a comp for each node' % pNum) complaints.append(f" {len(list(self.iterNodes()))} nodes in the tree, {mp.nComps} comps") partIsBad = 1 if partIsBad: gm.append(" (Indices are zero-based.)") gm += complaints raise P4Error(gm) # Check if comp values have been set. for mt in mp.comps: if mt.spec != 'empirical' or not resetEmpiricalComps: if mt.val is None: complaints.append( ' No composition val in part %s' % pNum) partIsBad = 1 if len(mt.val) != mp.dim: complaints.append(' Composition val is wrong length (%i), but dim is %i' % ( len(mt.val), mp.dim)) partIsBad = 1 # We don't want multiple rMatrices or free rMatrices if mp.dim is 2 if mp.dim == 2: if mp.nRMatrices > 1: complaints.append( ' Part %s is dim 2, but we have more than one rMatrix' % pNum) partIsBad = 1 mt = mp.rMatrices[0] # hopefully only one if mt.free: complaints.append( ' Part %s is dim 2, but rMatrix 0 is free' % pNum) partIsBad = 1 mp.nCat = mp.nGammaCat # If the model part isHet, we need to check that all nodes # have something assigned, and that all model components are # used. If the model part is not het, we can skip that, # but we need to check that all the # node.parts[pNum].compNum are 0, and all the # node.br.parts[pNum].rMatrixNum and # node.br.parts[pNum].gdasrvNum are set to 0. if not mp.isHet: # print "model part %i is not het" % pNum for n in self.iterNodes(): # print("pNum = %i, n.nodeNum=%i, len n.parts = %i" % (pNum, n.nodeNum, len(n.parts))) n.parts[pNum].compNum = 0 if n != self.root: n.br.parts[pNum].rMatrixNum = 0 if mp.nGammaCat > 1: n.br.parts[pNum].gdasrvNum = 0 else: # isHet # If there is only one comp, rMatrix, or gdasrv, then # simply set it. if mp.nComps == 1: for n in self.iterNodes(): n.parts[pNum].compNum = 0 if mp.nRMatrices == 1: for n in self.iterNodes(): if n != self.root: n.br.parts[pNum].rMatrixNum = 0 if mp.nGammaCat > 1 and mp.nGdasrvs == 1: for n in self.iterNodes(): if n != self.root: n.br.parts[pNum].gdasrvNum = 0 # print "model part %i is het" % pNum # New ad hoc attribute 'isUsed', to keep track of whether # any node uses it. for mt in mp.comps: mt.isUsed = 0 for mt in mp.rMatrices: mt.isUsed = 0 for mt in mp.gdasrvs: mt.isUsed = 0 # Does every node have all required things? for n in self.iterNodes(): mtNum = n.parts[pNum].compNum if mtNum >= 0 and mtNum < mp.nComps: mt = mp.comps[mtNum] mt.isUsed = 1 else: complaints.append(' Part %s, node %s has no comp.' % (pNum, n.nodeNum)) partIsBad = 1 if n != self.root: mtNum = n.br.parts[pNum].rMatrixNum if mtNum >= 0 and mtNum < mp.nRMatrices: mt = mp.rMatrices[n.br.parts[pNum].rMatrixNum] mt.isUsed = 1 else: complaints.append(' Part %s, node %s has no rMatrix.' % (pNum, n.nodeNum)) partIsBad = 1 if mp.nGammaCat > 1: mtNum = n.br.parts[pNum].gdasrvNum if mtNum >= 0 and mtNum < mp.nGdasrvs: mt = mp.gdasrvs[n.br.parts[pNum].gdasrvNum] mt.isUsed = 1 else: complaints.append(' Part %s, node %s has no gdasrv. nGammaCat=%s' % ( pNum, n.nodeNum, mp.nGammaCat)) partIsBad = 1 if mp.nGammaCat == 1: if n.br.parts[pNum].gdasrvNum != -1: complaints.append(' Part %s, node %s has a gdasrv, but nGammaCat is 1.' % ( pNum, n.nodeNum)) partIsBad = 1 # Is every model component used? for mt in mp.comps: if not mt.isUsed: complaints.append(' Part %s, comp %s is not used.' % (pNum, mt.num)) partIsBad = 1 for mt in mp.rMatrices: if not mt.isUsed: complaints.append( ' Part %s, rMatrix %s is not used.' % (pNum, mt.num)) partIsBad = 1 for mt in mp.gdasrvs: if not mt.isUsed: complaints.append( ' Part %s, gdasrv %s is not used.' % (pNum, mt.num)) partIsBad = 1 # Clean up ad hoc attr 'isUsed' for mt in mp.comps: del(mt.isUsed) for mt in mp.rMatrices: del(mt.isUsed) for mt in mp.gdasrvs: del(mt.isUsed) if partIsBad: isBad = 1 else: complaints.append(' ok') # ################################## if resetEmpiricalComps: self.setEmpiricalComps() # self.model.isHet if any part isHet self.model.isHet = 0 for pNum in range(self.model.nParts): if self.model.parts[pNum].isHet: self.model.isHet = 1 break # relativeRates self.model.doRelRates = 0 if self.model.nParts > 1: for p in self.model.parts: if p.relRate != 1.0: # This week, the default relRate is 1.0 self.model.doRelRates = 1 break if self.model.relRatesAreFree: self.model.doRelRates = 1 if self.model.doRelRates: totDataLen = 0 for p in self.data.parts: totDataLen += p.nChar fact = 0.0 for i in range(self.model.nParts): fact += (self.model.parts[i].relRate * self.data.parts[i].nChar) fact = float(totDataLen) / fact for p in self.model.parts: p.relRate *= fact if 0: print("RelativeRates (adjusted for length)") for i in range(self.model.nParts): p = self.model.parts[i] print(" part %s, nChar %5s, relRate %s" % (p.num, self.data.parts[i].nChar, p.relRate)) if 1: total = 0.0 for i in range(self.model.nParts): total += (self.model.parts[i].relRate * (float(self.data.parts[i].nChar) / float(totDataLen))) if abs(total - 1.0) > 1.0e-12: gm.append( 'Error in relativeRate calculation (total=%s).' % total) raise P4Error(gm) # print "modelSanityCheck. relRatesAreFree=%s, doRelRates=%s" % # (self.model.relRatesAreFree, self.model.doRelRates) # model.nFreePrams self.model.nFreePrams = 0 for mp in self.model.parts: for mt in mp.comps: if mt.free: self.model.nFreePrams += mp.dim - 1 for mt in mp.rMatrices: if mt.free: if mt.spec == '2p': self.model.nFreePrams += 1 else: self.model.nFreePrams += ( ((mp.dim * mp.dim) - mp.dim) / 2) - 1 for mt in mp.gdasrvs: if mt.free: self.model.nFreePrams += 1 if mp.pInvar.free: self.model.nFreePrams += 1 # print "Tree.modelSanityCheck(). Counted %i free params." % # self.model.nFreePrams if self.model.doRelRates and self.model.relRatesAreFree: self.model.nFreePrams += self.model.nParts - 1 if isBad: gm.append("(Indices are zero-based.)") gm += complaints raise P4Error(gm)
[docs] def setEmpiricalComps(self): """Set any empirical model comps to the comp of the data. This is done by self.modelSanityCheck(), but sometimes you may want to do it at other times. For example, do this after exchanging Data objects, or after simulating. In those cases there does not seem to be a reasonable way to do it automatically.""" complaintHead = '\nTree.setEmpiricalComps()' gm = [complaintHead] if not self.model: gm.append("This tree has no model.") raise P4Error(gm) if not self.data: gm.append("This tree has no data.") raise P4Error(gm) for mp in self.model.parts: for c in mp.comps: if c.spec == 'empirical': # print "got empirical comp, comp %s in part %s. (nComps=%i, isHet=%s)" % ( # c.num, mp.num, mp.nComps, mp.isHet) if not mp.isHet: seqNums = None elif mp.nComps == 1: seqNums = None else: seqNums = [] # for n in self.nodes: # print "node %2i seqNum=%3i n.parts[%i].compNum=%3i" % ( # n.nodeNum, n.seqNum, mp.num, n.parts[mp.num].compNum) for n in self.iterNodes(): # Is the comp used by the node? if n.parts[mp.num].compNum == c.num: # print "comp %s is used by node %s" % (c.num, # n.nodeNum) if n.isLeaf: nodeNums = [n.nodeNum] else: nodeNums = self.getNodeNumsAbove( n, leavesOnly=1) # gm.append("nodeNums for %s = %s" % # (n.nodeNum, nodeNums) for i in nodeNums: seqNum = self.nodes[i].seqNum if seqNum not in seqNums: seqNums.append(seqNum) # print "setEmpiricalComps() got seqNums = %s" % # seqNums if not seqNums: gm.append( "Something is wrong here. part %i, comp %i." % (mp.num, c.num)) gm.append( "This comp object has no sequences from which to get the empirical comp.") gm.append( "Maybe you need to yourTree.setModelModelComponentOnNode() or ") gm.append("yourTree.setModelComponentsOnNodesRandomly()") #gm.append( # "Or maybe its an extra comp in an RJ MCMC? -- If so, fix") #gm.append("the comp val to eg 'equal'.") raise P4Error(gm) # dim long, not dim - 1 c.val = self.data.parts[mp.num].composition(seqNums) # print " seqNums=%s, c.val=%s" % (seqNums, c.val) needsNormalizing = 0 for i in range(len(c.val)): if c.val[i] < var.PIVEC_MIN: c.val[i] = var.PIVEC_MIN + (0.2 * var.PIVEC_MIN) + (var.PIVEC_MIN * random.random()) needsNormalizing = 1 theSum = numpy.sum(c.val) # print "setEmpiricalComps(). Got theSum = %i" % theSum # We may have asked for the comp of an empty sequence, # in which case val is all zeros. Check for that. if math.fabs(1.0 - theSum) > 0.1: gm.append( "Something is very wrong here. Empirical comp vals should sum to 1.0") gm.append("The sum of the comp vals for part %s, comp %s, is %s" % ( mp.num, c.num, theSum)) gm.append( "Probably the sequences from which the composition was taken were blank.") raise P4Error(gm) if needsNormalizing or abs(theSum - 1.0) > 1e-16: c.val /= theSum
#################################################### model fit ####################################################
[docs] def simsForModelFitTests(self, reps=10, seed=None): """Do simulations for model fit tests. The model fit tests are the Goldman-Cox test, and the tree- and model-based composition fit test. Both of those tests require simulations, optimization of the tree and model parameters on the simulated data, and extraction of statistics for use in the null distribution. So might as well do them together. The Goldman-Cox test is not possible if there are any gaps or ambiguities, and in that case Goldman-Cox simulation stats are not collected. Doing the simulations is therefore the time-consuming part, and so this method facilitates doing that job in sections. If you do that, set the random number seed to different numbers. If the seed is not set, the process id is used. (So obviously you should explicitly set the seed if you are doing several runs in the same process.) Perhaps you may want to do the simulations on different machines in a cluster. The stats are saved to files. The output files have the seed number attached to the end, so that different runs of this method will have different output file names. Hopefully. When your model uses empirical comps, simulation uses the empirical comp of the original data for simulation (good), then the optimization part uses the empirical comp of the newly-simulated data (also good, I think). In that case, if it is tree-homogeneous, the X^2_m statistic would be identical to the X^2 statistic. You would follow this method with the modelFitTests() method, which uses all the stats files to make null distributions to assess significance of the same stats from self.""" #gm = ['Tree.simsForModelFitTests()'] # Make a new data object in which to do the sims, so we do not over-write self # print "a self.data = %s" % self.data # self.data.dump() savedData = self.data self.data = None # This triggers self.deleteCStuff() self.data = savedData.dupe() # We need 2 trees, one for sims, and one for evaluations. We can # use self for sims. Make a copy for evaluations. evalTree = self.dupe() evalTree.data = self.data # make sure all the memory works ... self.calcLogLike(verbose=0) evalTree.calcLogLike(verbose=0) # We can't do the Goldman-Cox test if there are any gaps or # ambiguities. doGoldmanCox = True for a in self.data.alignments: if a.hasGapsOrAmbiguities(): doGoldmanCox = False break # print "sims doGoldmanCox = %s" % doGoldmanCox # Collect info about the observed data statsHashList = [] # one for each data part for pNum in range(self.data.nParts): h = {} statsHashList.append(h) h['individualNSites'] = [] h['observedIndividualCounts'] = [] for j in range(self.data.nTax): h['individualNSites'].append( pf.partSequenceSitesCount(self.data.parts[pNum].cPart, j)) # no gaps or qmarks h['observedIndividualCounts'].append( self.data.parts[pNum].composition([j])) # (In the line above, its temporarily composition, not counts) # print "got seq %i comp = %s' % (j, h['observedIndividualCounts"][-1]) # At the moment, h['observedIndividualCounts'] has composition, # not counts. So multiply by h['individualNSites'] for i in range(self.data.nTax): for j in range(self.data.parts[pNum].dim): h['observedIndividualCounts'][i][ j] *= h['individualNSites'][i] # We will want to skip any sequences composed of all gaps skipTaxNums = [] for pNum in range(self.data.nParts): stn = [] for tNum in range(self.data.nTax): if not statsHashList[pNum]['individualNSites'][tNum]: stn.append(tNum) skipTaxNums.append(stn) # print "skipTaxNums = %s" % skipTaxNums if seed == None: seed = os.getpid() pf.reseedCRandomizer(int(seed)) # Open up some output files in which to put the sim data outfileBaseName = 'sims' # Could be an argument, user-assignable. if doGoldmanCox: f2Name = outfileBaseName + '_GoldmanStats_%s' % seed f2 = open(f2Name, 'w') f2.write('# part\tunconstr L\t log like \tGoldman-Cox stat\n') f3Name = outfileBaseName + '_CompStats_%s' % seed f3 = open(f3Name, 'w') # When sims are done when the comp is empirical (whether or not # free) we need to re-set the comps based on the newly-simulated # data. So first find out if any comps are empirical. hasEmpiricalComps = 0 for mp in self.model.parts: for c in mp.comps: if c.spec == 'empirical': hasEmpiricalComps = 1 break # print "hasEmpiricalComps=%s" % hasEmpiricalComps # Do the sims for i in range(reps): self.simulate() if hasEmpiricalComps: # Set empirical comps based on newly-simulated data evalTree.setEmpiricalComps() evalTree.optLogLike(verbose=0) # The time-consuming part if doGoldmanCox: if self.data.nParts > 1: self.data.calcUnconstrainedLogLikelihood2() diff = self.data.unconstrainedLogLikelihood - \ evalTree.logLike f2.write( '-1\t%f\t%f\t%f\n' % (self.data.unconstrainedLogLikelihood, evalTree.logLike, diff)) for pNum in range(self.data.nParts): unc = pf.getUnconstrainedLogLike( self.data.parts[pNum].cPart) # 0 for getSiteLikes like = pf.p4_partLogLike( evalTree.cTree, self.data.parts[pNum].cPart, pNum, 0) diff = unc - like f2.write('%i\t%f\t%f\t%f\n' % (pNum, unc, like, diff)) for pNum in range(self.data.nParts): h = statsHashList[pNum] # pf.p4_expectedCompositionCounts returns a tuple of tuples # representing the counts of the nodes in proper alignment # order. ret = pf.p4_expectedCompositionCounts(evalTree.cTree, pNum) h['expectedIndividualCounts'] = list(ret) # alignment order # print h['expectedIndividualCounts'] h['overallSimStat'] = 0.0 h['individualSimStats'] = [0.0] * self.data.nTax for seqNum in range(self.data.nTax): if seqNum in skipTaxNums[pNum]: pass else: obsCounts = list( pf.singleSequenceBaseCounts(self.data.parts[pNum].cPart, seqNum)) # obsCounts is the counts observed in the simulation. # It assumes that there are no gaps. If there are # gaps, adjust the counts. if h['individualNSites'][seqNum] != self.data.parts[pNum].nChar: factor = float( h['individualNSites'][seqNum]) / self.data.parts[pNum].nChar # print "factor = %s" % factor for j in range(self.data.parts[pNum].dim): obsCounts[j] = float(obsCounts[j]) * factor # print obsCounts for j in range(self.data.parts[pNum].dim): # Avoid dividing by Zero. if h['expectedIndividualCounts'][seqNum][j]: dif = obsCounts[ j] - h['expectedIndividualCounts'][seqNum][j] h['individualSimStats'][ seqNum] += ((dif * dif) / h['expectedIndividualCounts'][seqNum][j]) h['overallSimStat'] += h['individualSimStats'][seqNum] f3.write('%i\t' % pNum) for seqNum in range(self.data.nTax): f3.write('%f\t' % h['individualSimStats'][seqNum]) f3.write('%f\n' % h['overallSimStat']) # print h['overallSimStat'] if doGoldmanCox: f2.close() f3.close() # Replace the saved data # Since we are replacing an exisiting data, this triggers # self.deleteCStuff() self.data = savedData
[docs] def modelFitTests(self, fName='model_fit_tests_out', writeRawStats=0): """Do model fit tests on the data. The two tests are the Goldman-Cox test, and the tree- and model- based composition fit test. Both require simulations with optimizations in order to get a null distribution, and those simulations need to be done before this method. The simulations should be done with the simsForModelFitTests() method. Self should have a data and a model attached, and be optimized. The Goldman-Cox test (Goldman 1993. Statistical tests of models of DNA substitution. J Mol Evol 36: 182-198.) is a test for overall fit of the model to the data. It does not work if the data have gaps or ambiguities. The tree- and model-based composition test asks the question: 'Does the composition implied by the model fit the data?' If the model is homogeneous and empirical comp is used, then this is the same as the chi-square test except that the null distribution comes from simulations, not from the chi-square distribution. In that case only the question is, additionally, 'Are the data homogeneous in composition?', ie the same question asked by the chi-square test. However, the data might be heterogeneous, and the model might be heterogeneous over the tree; the tree- and model-based composition fit test can ask whether the heterogeneous model fits the heterogeneous data. The composition is tested in each data partition, separately. The test is done both overall, ie for all the sequences together, and for individual sequences. If you just want a compo homogeneity test with empirical homogeneous comp, try the compoTestUsingSimulations() method-- its way faster, because there are not optimizations in the sims part. Output is verbose, to a file.""" gm = ['Tree.modelFitTests()'] self.calcLogLike(verbose=0) # Usually True. Set to False for debugging, experimentation, getting # individual stats, etc doOut = True # We can't do the Goldman-Cox test if there are any gaps or # ambiguities. doGoldmanCox = True for a in self.data.alignments: if a.hasGapsOrAmbiguities(): doGoldmanCox = False break # print "test doGoldmanCox = %s" % doGoldmanCox rawFName = '%s_raw.py' % fName #flob = sys.stderr #fRaw = sys.stderr if doOut: flob = open(fName, 'w') else: flob = None if writeRawStats: fRaw = open(rawFName, 'w') else: fRaw = None ####################### # Goldman-Cox stats ####################### # For a two-part data analysis, the first few lines of the # sims_GoldmanStats_* file will be like the following. Its in # groups of 3-- the first one for all parts together (part number # -1), and the next lines for separate parts. # part unconstr L log like Goldman-Cox stat # -1 -921.888705 -1085.696919 163.808215 # 0 -357.089057 -430.941958 73.852901 # 1 -564.799648 -654.754962 89.955314 # -1 -952.063037 -1130.195799 178.132761 # 0 -362.164119 -439.709824 77.545705 # ... and so on. # For a one-part analysis, it will be the same except that one sim # gets only one line, starting with zero. if doGoldmanCox: goldmanOverallSimStats = [] if self.data.nParts > 1: goldmanIndividualSimStats = [] for partNum in range(self.data.nParts): goldmanIndividualSimStats.append([]) import glob goldmanFNames = glob.glob('sims_GoldmanStats_*') # print "nParts=%s" % self.data.nParts # print goldmanFNames for fName1 in goldmanFNames: f2 = open(fName1) aLine = f2.readline() if not aLine: gm.append("Empty file %s" % fName1) raise P4Error(gm) if aLine[0] != '#': gm.append( "Expecting a '#' as the first character in file %s" % fName1) raise P4Error(gm) aLine = f2.readline() # print "a got line %s" % aLine, while aLine: if self.data.nParts > 1: splitLine = aLine.split() if len(splitLine) != 4: gm.append( "Bad line in Goldman stats file %s" % fName1) gm.append("'%s'" % aLine) raise P4Error(gm) if int(splitLine[0]) != -1: gm.append( "Bad line in Goldman stats file %s" % fName1) gm.append("First item should be -1") gm.append("'%s'" % aLine) raise P4Error(gm) # print splitLine[-1] goldmanOverallSimStats.append(float(splitLine[-1])) aLine = f2.readline() # print "b got line %s" % aLine, if not aLine: gm.append("Premature end to file %s" % fName1) raise P4Error(gm) for partNum in range(self.data.nParts): splitLine = aLine.split() # print "partNum %i, splitLine=%s" % (partNum, # splitLine) if len(splitLine) != 4: gm.append( "Bad line in Goldman stats file %s" % fName1) gm.append("'%s'" % aLine) raise P4Error(gm) try: splitLine[0] = int(splitLine[0]) except ValueError: gm.append( "Bad line in Goldman stats file %s" % fName1) gm.append( "First item should be the partNum %i" % partNum) gm.append("'%s'" % aLine) raise P4Error(gm) if splitLine[0] != partNum: gm.append( "Bad line in Goldman stats file %s" % fName1) gm.append( "First item should be the partNum %i" % partNum) gm.append("'%s'" % aLine) raise P4Error(gm) # for taxNum in range(self.data.nTax): # print splitLine[taxNum + 1] # print splitLine[-1] if self.data.nParts == 1: goldmanOverallSimStats.append(float(splitLine[-1])) else: goldmanIndividualSimStats[ partNum].append(float(splitLine[-1])) aLine = f2.readline() # print "c got line %s" % aLine, f2.close() # print "goldmanOverallSimStats =", goldmanOverallSimStats # print "goldmanIndividualSimStats =", goldmanIndividualSimStats # sys.exit() if doOut: flob.write('Model fit tests\n===============\n\n') flob.write( 'The data that we are testing have %i taxa,\n' % self.data.nTax) if len(self.data.alignments) == 1: flob.write('1 alignment, ') else: flob.write('%i alignments, ' % len(self.data.alignments)) if self.data.nParts == 1: flob.write('and 1 data partition.\n') else: flob.write('and %i data partitions.\n' % self.data.nParts) flob.write('The lengths of those partitions are as follows:\n') flob.write(' partNum nChar \n') for i in range(self.data.nParts): flob.write(' %3i %5i\n' % (i, self.data.parts[i].nChar)) self.data.calcUnconstrainedLogLikelihood2() if doOut: flob.write("\nThe unconstrained likelihood is %f\n" % self.data.unconstrainedLogLikelihood) flob.write( '(This is the partition-by-partition unconstrained log likelihood, \n') flob.write( 'ie the sum of the unconstrained log likes from each partition separately, \n') flob.write( 'and so will not be the same as that given by PAUP, if the data are partitioned.)\n') flob.write('\n\nGoldman-Cox test for overall model fit\n') flob.write('======================================\n') flob.write( 'The log likelihood for these data for this tree is %f\n' % self.logLike) flob.write('The unconstrained log likelihood for these data is %f\n' % self.data.unconstrainedLogLikelihood) originalGoldmanCoxStat = self.data.unconstrainedLogLikelihood - \ self.logLike if doOut: flob.write( 'The Goldman-Cox statistic for the original data is the difference, %f\n' % originalGoldmanCoxStat) if self.data.nParts > 1: flob.write( '(The unconstrained log likelihood for these data is calculated partition by partition.)\n') flob.write('\n') if self.data.nParts > 1: originalGoldmanCoxStatsByPart = [] if doOut: flob.write('Stats by partition.\n') flob.write( 'part\t unconstrLogL\t log like \tGoldman-Cox stat\n') flob.write( '----\t ----------\t -------- \t----------------\n') for partNum in range(self.data.nParts): unc = pf.getUnconstrainedLogLike( self.data.parts[partNum].cPart) like = pf.p4_partLogLike( self.cTree, self.data.parts[partNum].cPart, partNum, 0) diff = unc - like if doOut: flob.write(' %i\t%f\t%f\t %f\n' % (partNum, unc, like, diff)) originalGoldmanCoxStatsByPart.append(diff) # Do the overall stat nSims = len(goldmanOverallSimStats) if doOut: flob.write('\nThere were %i simulations.\n\n' % nSims) if writeRawStats: fRaw.write('# Goldman-Cox null distributions.\n') if self.data.nParts > 1: fRaw.write( '# Simulation stats for overall data, ie for all data partitions combined.\n') else: fRaw.write('# Simulation stats.\n') fRaw.write('goldman_cox_overall = %s\n' % goldmanOverallSimStats) if self.data.nParts > 1: for partNum in range(self.data.nParts): fRaw.write( '# Simulation stats for data partition %i\n' % partNum) fRaw.write('goldman_cox_part%i = %s\n' % (partNum, goldmanIndividualSimStats[partNum])) prob = p4.func.tailAreaProbability( originalGoldmanCoxStat, goldmanOverallSimStats, verbose=0)[2] if doOut: flob.write('\n Overall Goldman-Cox test: ') if prob <= 0.05: flob.write('%13s' % "Doesn't fit.") else: flob.write('%13s' % 'Fits.') flob.write(' P = %5.3f\n' % prob) if self.data.nParts > 1: if doOut: flob.write(' Tests for individual data partitions:\n') for partNum in range(self.data.nParts): prob = p4.func.tailAreaProbability(originalGoldmanCoxStatsByPart[partNum], goldmanIndividualSimStats[partNum], verbose=0)[2] if doOut: flob.write( ' Part %-2i: ' % partNum) if prob <= 0.05: flob.write('%13s' % 'Doesn\'t fit.') else: flob.write('%13s' % 'Fits.') flob.write(' P = %5.3f\n' % prob) ######################### # COMPOSITION ######################### statsHashList = [] for pNum in range(self.data.nParts): h = {} statsHashList.append(h) h['individualNSites'] = [] h['observedIndividualCounts'] = [] for j in range(self.data.nTax): # print pf.partSequenceSitesCount(self.data.parts[pNum].cPart, # j) h['individualNSites'].append( pf.partSequenceSitesCount(self.data.parts[pNum].cPart, j)) # no gaps or qmarks # print self.data.parts[pNum].composition([j]) h['observedIndividualCounts'].append( self.data.parts[pNum].composition([j])) # The line above is temporarily composition, not counts # pf.expectedCompositionCounts returns a tuple of tuples # representing the counts of the nodes in proper alignment order. h['expectedIndividualCounts'] = list( pf.p4_expectedCompositionCounts(self.cTree, pNum)) # alignment order # At the moment, h['observedIndividualCounts'] has composition, # not counts. So multiply by h['individualNSites'] for i in range(self.data.nTax): for j in range(self.data.parts[pNum].dim): h['observedIndividualCounts'][i][ j] *= h['individualNSites'][i] # We will want to skip any sequences composed of all gaps skipTaxNums = [] for pNum in range(self.data.nParts): stn = [] for tNum in range(self.data.nTax): if not statsHashList[pNum]['individualNSites'][tNum]: stn.append(tNum) skipTaxNums.append(stn) # print "skipTaxNums = %s" % skipTaxNums # Do the boring old compo chi square test. if doOut: flob.write(longMessage1) # explanation ... for pNum in range(self.data.nParts): h = statsHashList[pNum] # Can't use p4.func.xSquared(), because there might be column # zeros. # print "observedIndividualCounts = %s' % # h['observedIndividualCounts"] nRows = len(h['observedIndividualCounts']) nCols = len(h['observedIndividualCounts'][0]) # I could have just used nSites, above theSumOfRows = p4.func._sumOfRows(h['observedIndividualCounts']) theSumOfCols = p4.func._sumOfColumns(h['observedIndividualCounts']) # print theSumOfCols isOk = 1 columnZeros = [] # for j in range(len(theSumOfRows)): # if theSumOfRows[j] == 0.0: # gm.append("Zero in a row sum. Programming error.") # raise P4Error(gm) for j in range(len(theSumOfCols)): if theSumOfCols[j] <= 0.0: columnZeros.append(j) theExpected = p4.func._expected(theSumOfRows, theSumOfCols) # print "theExpected = %s" % theExpected # print "columnZeros = %s" % columnZeros xSq = 0.0 for rowNum in range(nRows): if rowNum in skipTaxNums[pNum]: pass else: xSq_row = 0.0 for colNum in range(nCols): if colNum in columnZeros: pass else: theDiff = h['observedIndividualCounts'][rowNum][ colNum] - theExpected[rowNum][colNum] xSq_row += (theDiff * theDiff) / \ theExpected[rowNum][colNum] xSq += xSq_row dof = (nCols - len(columnZeros) - 1) * \ (nRows - len(skipTaxNums[pNum]) - 1) prob = p4.func.chiSquaredProb(xSq, dof) if doOut: flob.write( ' Part %i: Chi-square = %f, (dof=%i) P = %f\n' % (pNum, xSq, dof, prob)) for pNum in range(self.data.nParts): h = statsHashList[pNum] h['overallStat'] = 0.0 h['individualStats'] = [0.0] * self.data.nTax for i in range(self.data.nTax): if i in skipTaxNums[pNum]: pass # h['individualStats'] stays at zeros else: for j in range(self.data.parts[pNum].dim): # Avoid dividing by Zero. if h['expectedIndividualCounts'][i][j]: dif = h['observedIndividualCounts'][i][ j] - h['expectedIndividualCounts'][i][j] h['individualStats'][ i] += ((dif * dif) / h['expectedIndividualCounts'][i][j]) h['overallStat'] += h['individualStats'][i] h['overallSimStats'] = [] h['individualSimStats'] = [] for i in range(self.data.nTax): h['individualSimStats'].append([]) if 0: print("h['individualNSites'] = %s" % h['individualNSites']) print("h['observedIndividualCounts'] = %s" % h['observedIndividualCounts']) print("h['expectedIndividualCounts'] = %s" % h['expectedIndividualCounts']) print("h['overallStat'] = %s" % h['overallStat']) print("h['individualStats'] = %s" % h['individualStats']) raise P4Error(gm) import glob compoFNames = glob.glob('sims_CompStats_*') # print compoFNames for fName1 in compoFNames: f2 = open(fName1) aLine = f2.readline() if not aLine: gm.append("Empty file %s" % fName1) raise P4Error(gm) # print "a got line %s" % aLine, while aLine: for partNum in range(self.data.nParts): h = statsHashList[partNum] splitLine = aLine.split() if len(splitLine) != (self.data.nTax + 2): gm.append( "Bad line in composition stats file %s" % fName1) gm.append("'%s'" % aLine) raise P4Error(gm) if int(splitLine[0]) != partNum: gm.append( "Bad line in composition stats file %s" % fName1) gm.append( "First item should be the partNum %i" % partNum) gm.append("'%s'" % aLine) raise P4Error(gm) # for taxNum in range(self.data.nTax): # print splitLine[taxNum + 1] # print splitLine[-1] h['overallSimStats'].append(float(splitLine[-1])) for i in range(self.data.nTax): h['individualSimStats'][i].append( float(splitLine[i + 1])) #raise P4Error(gm) aLine = f2.readline() if not aLine: break # print "b got line %s" % aLine, f2.close() nSims = len(statsHashList[0]['overallSimStats']) if doOut: # Explain tree- and model-based compo fit stat, X^2_m flob.write(longMessage2) flob.write(' %i simulation reps were used.\n\n' % nSims) spacer1 = ' ' * 10 for partNum in range(self.data.nParts): h = statsHashList[partNum] if doOut: flob.write('Part %-2i:\n-------\n\n' % partNum) flob.write('Statistics from the original data\n') flob.write('%s%30s: %f\n' % (spacer1, 'Overall observed stat', h['overallStat'])) flob.write('%s%30s:\n' % (spacer1, 'Stats for individual taxa')) for taxNum in range(self.data.nTax): if taxNum not in skipTaxNums[partNum]: flob.write('%s%30s: %f\n' % ( spacer1, self.data.taxNames[taxNum], h['individualStats'][taxNum])) else: flob.write('%s%30s: skipped\n' % (spacer1, self.data.taxNames[taxNum])) flob.write( '\nAssessment of fit from null distribution from %i simulations\n' % nSims) flob.write('%s%30s: ' % (spacer1, 'Overall')) prob = p4.func.tailAreaProbability( h['overallStat'], h['overallSimStats'], verbose=0)[2] if doOut: if prob <= 0.05: flob.write('%13s' % 'Doesn\'t fit.') else: flob.write('%13s' % 'Fits.') flob.write(' P = %5.3f\n' % prob) ############# theRet = prob ############# for taxNum in range(self.data.nTax): if doOut: flob.write('%s%30s: ' % (spacer1, self.data.taxNames[taxNum])) if taxNum in skipTaxNums[partNum]: if doOut: flob.write('%13s\n' % 'skipped.') else: prob = p4.func.tailAreaProbability(h['individualStats'][taxNum], h['individualSimStats'][taxNum], verbose=0)[2] if doOut: if prob <= 0.05: flob.write('%13s' % "Doesn't fit.") else: flob.write('%13s' % 'Fits.') flob.write(' P = %5.3f\n' % prob) if writeRawStats: fRaw.write('#\n# Tree and model based composition fit test\n') fRaw.write('# =========================================\n') fRaw.write( '# Simulation statistics, ie the null distributions\n\n') fRaw.write('# Part %i:\n' % partNum) fRaw.write('part%i_overall_compo_null = %s\n' % (partNum, h['overallSimStats'])) for taxNum in range(self.data.nTax): fRaw.write('part%i_%s_compo_null = %s\n' % (partNum, _fixFileName( self.data.taxNames[taxNum]), h['individualSimStats'][taxNum])) # Yes, it is possible to close sys.stdout if flob and flob != sys.stdout: flob.close() if fRaw and fRaw != sys.stdout: fRaw.close() return theRet
[docs] def compoTestUsingSimulations(self, nSims=100, doIndividualSequences=0, doChiSquare=0, verbose=1): """Compositional homogeneity test using a null distribution from simulations. This does a compositional homogeneity test on each data partition. The statistic used here is X^2, obtained via Data.compoChiSquaredTest(). The null distribution of the stat is made using simulations, so of course you need to provide a tree with a model, with optimized branch lengths and model parameters. This is a comp homogeneity test, so the model should be tree-homogeneous. The analysis usually tests all sequences in the data partition together (like paup), but you can also 'doIndividualSequences' (like puzzle). Beware that the latter is a multiple simultaneous stats test, with associated Type I Error. For purposes of comparison, this test can also do compo tests in the style of PAUP and puzzle, using chi-square to assess significance. Do this by turning 'doChiSquare' on. The compo test in PAUP tests all sequences together, while the compo test in puzzle tests all sequences separately. There are advantages and disadvantages to the latter-- doing all sequences separately allows you to identify the worst offenders, but suffers due to the problems of multiple simultaneous stats tests. There are slight differences between the computation of the Chi-square in PAUP and puzzle and the p4 version. The compo test in PAUP (basefreq) does the chi-squared test, but if sequences are blank it still counts them in the degrees of freedom; p4 does not count blank sequences in the degrees of freedom. Puzzle simply uses the row sums, ie the contributions of each sequence to the total X-squared, and assesses significance with chi-squared using the number of symbols minus 1 as the degrees of freedom. Ie for DNA dof=3, for protein dof=19. Puzzle correctly gets the composition from sequences with gaps, but does not do the right thing for sequences with ambiguities like r, y, and so on. P4 does calculate the composition correctly when there are such ambiguities. So p4 will give you the same numbers as paup and puzzle for the chi-squared part as long as you don't have blank sequences or ambiguities like r and y. This uses the Data.compoChiSquaredTest() method to get the stats. See the doc string for that method, where it describes how zero column sums (ie some character is absent) can be dealt with. Here, when that method is invoked, 'skipColumnZeros' is turned on, so that the analysis is robust against data with zero or low values for some characters. For example:: # First, do a homog opt, and pickle the optimized tree. # Here I use a bionj tree, but you could use whatever. read('d.nex') a = var.alignments[0] dm = a.pDistances() t = dm.bionj() d = Data() t.data = d t.newComp(free=1, spec='empirical') t.newRMatrix(free=1, spec='ones') t.setNGammaCat(nGammaCat=4) t.newGdasrv(free=1, val=0.5) t.setPInvar(free=0, val=0.0) t.optLogLike() t.name = 'homogOpt' t.tPickle() # Then, do the test ... read('homogOpt.p4_tPickle') t = var.trees[0] read('d.nex') d = Data() t.data = d t.compoTestUsingSimulations() # Output would be something like ... # Composition homogeneity test using simulations. # P-values are shown. # Part Num 0 # Part Name all # -------------------- -------- # All Sequences 0.0000 # Or using more sims for more precision, and also doing the # Chi-square test for contrast ... t.compoTestUsingSimulations(nSims=1000, doChiSquare=True) # Output might be something like ... # Composition homogeneity test using simulations. # P-values are shown. # (P-values from Chi-Square are shown in parens.) # Part Num 0 # Part Name all # -------------------- -------- # All Sequences 0.0140 # (Chi-Squared Prob) (0.9933) # It returns the P-value for the sims for the first data # partition. It is often the case, as above, that this test will show significance while the Chi-square test does not. """ gm = ['Tree.compoTestUsingSimulations()'] # print "inComp = %s" % self.model.parts[0].comps[0].val if not self.data: gm.append("No data. Set the data first.") raise P4Error(gm) if not self.model: gm.append("No model. You need to set the model first.") raise P4Error(gm) self.modelSanityCheck() if self.model.isHet: gm.append("The model for this tree is tree-heterogeneous.") gm.append("This test is not valid for tree-hetero models.") raise P4Error(gm) # Make a new data object in which to do the sims, so we do not over-write self # print "a self.data = %s" % self.data # self.data.dump() savedData = self.data self.data = None # This triggers self.deleteCStuff() self.data = savedData.dupe() # print "b self.data = %s" % self.data # self.data.dump() #raise P4Error(gm) # Check for missing sequences in any of the parts. Missing seq # nums go in skips, a list of lists. skips = [] for pNum in range(self.data.nParts): skips.append([]) for pNum in range(self.data.nParts): for tNum in range(self.data.nTax): nSites = pf.partSequenceSitesCount( self.data.parts[pNum].cPart, tNum) # no gaps, no missings if not nSites: skips[pNum].append(tNum) # Get the original stats from self.data. # compoChiSquaredTest(self, verbose=1, skipColumnZeros=0, useConstantSites=1, skipTaxNums=None, getRows=0) original = self.data.compoChiSquaredTest(verbose=0, skipColumnZeros=1, skipTaxNums=skips, getRows=doIndividualSequences) # print "original =", original # Make some empty lists in which to put our stats full = [] if doIndividualSequences: rows = [] for pNum in range(self.data.nParts): full.append([]) if doIndividualSequences: onePartRows = [] for i in range(self.data.nTax): onePartRows.append([]) rows.append(onePartRows) # Do the sims for i in range(nSims): # if i < 5: # print "%i simComp = %s" % (i, self.model.parts[0].comps[0].val) self.simulate() ret = self.data.compoChiSquaredTest(skipColumnZeros=1, skipTaxNums=skips, getRows=doIndividualSequences, verbose=0) # print "%i ret=%s" % (i, ret) for pNum in range(self.data.nParts): full[pNum].append(ret[pNum][0]) if doIndividualSequences: for tNum in range(self.data.nTax): if tNum not in skips[pNum]: rows[pNum][tNum].append(ret[pNum][3][tNum]) # Find the longest part name length, and heading width, so the output # looks nice. partWid = 8 for p in self.data.parts: if len(p.name) > partWid: partWid = len(p.name) partWid += 2 headWid = 20 for tN in self.data.taxNames: if len(tN) > headWid: headWid = len(tN) headWid += 2 headSig = '%' + "%i" % (headWid - 2) + 's ' # Get the all-sequences tail area probs partTaps = [] for pNum in range(self.data.nParts): partTaps.append( p4.func.tailAreaProbability(original[pNum][0], full[pNum], verbose=0)[2]) #print("partTaps is ", partTaps) # Intro if verbose: print("Composition homogeneity test using simulations.") print("P-values are shown.") if doChiSquare: print("(P-values from Chi-Square are shown in parens.)") print() # Print the Part Nums and Part Names if verbose: print(headSig % 'Part Num', end=' ') for pNum in range(self.data.nParts): print(('%i' % pNum).center(partWid), end=' ') print() print(headSig % 'Part Name', end=' ') for pNum in range(self.data.nParts): print(self.data.parts[pNum].name.center(partWid), end=' ') print() print(headSig % ('-' * (headWid - 2)), end=' ') for pNum in range(self.data.nParts): print(('-' * (partWid - 2)).center(partWid), end=' ') print() # Print the all-sequences results if verbose: print(headSig % 'All Sequences', end=' ') for pNum in range(self.data.nParts): print(('%6.4f' % partTaps[pNum]).center(partWid), end=' ') print() if doChiSquare: print(headSig % '(Chi-Squared Prob)', end=' ') for pNum in range(self.data.nParts): print(('(%6.4f)' % original[pNum][2]).center(partWid), end=' ') print() if doIndividualSequences and verbose: print() # print "Individual sequences" # print "--------------------" for tNum in range(self.data.nTax): print(headSig % self.data.taxNames[tNum], end=' ') for pNum in range(self.data.nParts): if tNum not in skips[pNum]: ret = p4.func.tailAreaProbability( original[pNum][3][tNum], rows[pNum][tNum], verbose=0)[2] print(('%6.4f' % ret).center(partWid), end=' ') else: print(('-' * 4).center(partWid), end=' ') print() if doChiSquare: print(headSig % ' ', end=' ') for pNum in range(self.data.nParts): # degrees of freedom dof = self.data.parts[pNum].dim - 1 if tNum not in skips[pNum]: ret = p4.func.chiSquaredProb( original[pNum][3][tNum], dof) print(('(%6.4f)' % ret).center(partWid), end=' ') else: print(('-' * 4).center(partWid), end=' ') print() # Replace the saved data # Since we are replacing an exisiting data, this triggers # self.deleteCStuff() self.data = savedData #print(partTaps, "\n" * 2) # one for each data part return partTaps[0] # only the first part
[docs] def bigXSquaredSubM(self, verbose=False): """Calculate the X^2_m stat This can handle gaps and ambiguities. Column zeros in the observed is not a problem with this stat, as we are dividing by the expected composition, and that comes from the model, which does not allow compositions with values of zero. """ if not self.cTree: self._commonCStuff(resetEmpiricalComps=True) l = [] for pNum in range(self.data.nParts): if verbose: print("Part %i" % pNum) print("======") obs = [] nSites = [] # no gaps or ? for taxNum in range(self.nTax): thisNSites = pf.partSequenceSitesCount( self.data.parts[pNum].cPart, taxNum) comp = self.data.parts[pNum].composition([taxNum]) for symbNum in range(self.data.parts[pNum].dim): comp[symbNum] *= thisNSites nSites.append(thisNSites) obs.append(comp) if verbose: print("\n Observed") print(" " * 10, end=' ') for symbNum in range(self.data.parts[pNum].dim): print("%8s" % self.data.parts[pNum].symbols[symbNum], end=' ') print() for taxNum in range(self.nTax): print("%10s" % self.taxNames[taxNum], end=' ') for symbNum in range(self.data.parts[pNum].dim): print("%8.4f" % obs[taxNum][symbNum], end=' ') print(" n=%i" % nSites[taxNum]) # pf.p4_expectedCompositionCounts returns a tuple of tuples # representing the counts of the nodes in proper alignment order. exp = list(pf.p4_expectedCompositionCounts(self.cTree, pNum)) if verbose: print("\n Expected") print(" " * 10, end=' ') for symbNum in range(self.data.parts[pNum].dim): print("%8s" % self.data.parts[pNum].symbols[symbNum], end=' ') print() for taxNum in range(self.nTax): print("%10s" % self.taxNames[taxNum], end=' ') for symbNum in range(self.data.parts[pNum].dim): print("%8.4f" % exp[taxNum][symbNum], end=' ') print(" n=%i" % nSites[taxNum]) # do the summation theSum = 0.0 for taxNum in range(self.nTax): for symbNum in range(self.data.parts[pNum].dim): x = obs[taxNum][symbNum] - exp[taxNum][symbNum] theSum += (x * x) / exp[taxNum][symbNum] l.append(theSum) if verbose: print("The bigXSquaredSubM stat for this part is %.5f" % theSum) return l
[docs] def compStatFromCharFreqs(self, verbose=False): """Calculate a statistic from observed and model character frequencies. Call it c_m, little c sub m. It is calculated from observed character frequencies and character frequencies expected from the (possibly tree-heterogeneous) model. It would be the sum of abs(obs-exp)/exp """ if not self.cTree: self._commonCStuff(resetEmpiricalComps=True) # pf.p4_expectedComposition returns a tuple of tuples of tuples # representing the counts of the nodes in proper alignment order. exp = pf.p4_expectedComposition(self.cTree) l = [] for pNum in range(self.data.nParts): if verbose: print("Part %i" % pNum) print("======") obs = [] for taxNum in range(self.nTax): comp = self.data.parts[pNum].composition([taxNum]) obs.append(comp) if verbose: print("\n Observed") print(" " * 10, end=' ') for symbNum in range(self.data.parts[pNum].dim): print("%8s" % self.data.parts[pNum].symbols[symbNum], end=' ') print() for taxNum in range(self.nTax): print("%10s" % self.taxNames[taxNum], end=' ') for symbNum in range(self.data.parts[pNum].dim): print("%8.4f" % obs[taxNum][symbNum], end=' ') print() if verbose: print("\n Expected") print(" " * 10, end=' ') for symbNum in range(self.data.parts[pNum].dim): print("%8s" % self.data.parts[pNum].symbols[symbNum], end=' ') print() for taxNum in range(self.nTax): print("%10s" % self.taxNames[taxNum], end=' ') for symbNum in range(self.data.parts[pNum].dim): print("%8.4f" % exp[pNum][taxNum][symbNum], end=' ') print() # do the summation theSum = 0.0 for taxNum in range(self.nTax): for symbNum in range(self.data.parts[pNum].dim): theSum += math.fabs(obs[taxNum][symbNum] - exp[pNum] [taxNum][symbNum]) / exp[pNum][taxNum][symbNum] l.append(theSum) if verbose: print("The c_m stat for this part is %.5f" % theSum) return l
[docs] def getEuclideanDistanceFromSelfDataToExpectedComposition(self): """Calculate the c_E stat between self.data and model expected composition. The expected composition comes from the current tree (self) and model. There is an expected composition of each sequence in each part, and is obtained via pf.p4_expectedComposition(cTree). In non-stationary evolution, the expected composition of sequences approach the model composition asymptotically as the branch increases. I am calling the Euclidean distance from the actual sequence composition to the expected composition c_E. Returns: A list of lists --- the c_E for each sequence, for each part. Order of the sequences is as in the Data. """ if not self.cTree: self.calcLogLike(verbose=False) expected = pf.p4_expectedComposition(self.cTree) #print expected allParts = [] for pNum in range(self.model.nParts): thePart = self.data.parts[pNum] statsForPart = [] for seqNum in range(self.data.nTax): expectedForPartSeq = expected[pNum][seqNum] compForPartSeq = thePart.composition([seqNum]) c_E = 0.0 for cNum in range(thePart.dim): dif = expectedForPartSeq[cNum] - compForPartSeq[cNum] c_E += (dif * dif) c_E = math.sqrt(c_E) statsForPart.append(c_E) allParts.append(statsForPart) return allParts
######################################################## opt and sim ######################################################## def __del__(self, freeTree=pf.p4_freeTree, freeNode=pf.p4_freeNode, mysys=sys): #mysys.stdout.write('Tree.__del__() here.\n') # mysys.stdout.flush() # Refers to nodes, which causes grief. if hasattr(self, "splitKeyHash"): del(self.splitKeyHash) self._data = None # self._model = None # model is needed for freeNode() # If this is not here, then nodes tend to hang around forever ... if 1: for n in self.nodes: n.wipe() for n in self.nodes: if n.cNode: #mysys.stdout.write(' Tree.__del__(), freeing node %i\n' % n.nodeNum) # mysys.stdout.flush() freeNode(n.cNode) n.cNode = None for n in self.nodes: del(n) self.root = None self.nodes = None if self.cTree: if self.doDataPart: dp_freeTree(self.cTree) else: freeTree(self.cTree) self.cTree = None #mysys.stdout.write('Tree.__del__() finished.\n') # mysys.stdout.flush()
[docs] def deleteCStuff(self): """Deletes c-pointers from nodes, self, and model, but not the data.""" # print 'Tree.deleteCStuff() here.' for n in self.nodes: if n.cNode: # print ' about to free node %i, cNode %s' % (n.nodeNum, # n.cNode) pf.p4_freeNode(n.cNode) n.cNode = 0 if self.cTree: # print ' about to free cTree' pf.p4_freeTree(self.cTree) self.cTree = 0 # I need to delay deleting the cModel until after deleting the # self.cStuff, because free-ing self.cStuff (eg nodes) # requires the cModel. if self.model and self.model.cModel: # print ' about to free cModel' pf.p4_freeModel(self.model.cModel) self.model.cModel = 0
[docs] def _allocCStuff(self, resetEmpiricalComps=True): """Allocate c-memory for self and its nodes.""" gm = ['Tree._allocCStuff()'] # Make sure the nodeNums go from zero to N-1 for i,n in enumerate(self.iterNodes()): # allow NO_ORDER nodes at the end if self.nodes[i].nodeNum != i: gm.append( "Programming error: Problem with node number %i." % i) gm.append("Nodes should be numbered consecutively from zero.") raise P4Error(gm) self.modelSanityCheck(resetEmpiricalComps=resetEmpiricalComps) if not self.data.cData: self.data._setCStuff() if not self.model.cModel: self.model.allocCStuff() if var.doDataPart: # print 'about to dp_newTree' self.cTree = pf.dp_newTree(len(self.nodes), self.preOrder, self.postOrder, self.data.cData, self.model.cModel) self.doDataPart = 1 if not self.cTree: gm.append("Unable to allocate a cTree") raise P4Error(gm) for n in self.iterNodes(): n.doDataPart = 1 # print 'about to dp_newNode (%i)' % n.nodeNum cNode = pf.dp_newNode( n.nodeNum, self.cTree, n.seqNum, n.isLeaf) if not cNode: gm.append("Unable to allocate a cNode.") raise P4Error(gm) n.cNode = cNode else: nLeaves = 0 for n in self.iterNodes(): if n.isLeaf: nLeaves += 1 self.partLikes = numpy.zeros(self.model.nParts, dtype=numpy.double) self.cTree = pf.p4_newTree(len(list(self.iterNodes())), nLeaves, self.preOrder, self.postOrder, var._newtAndBrentPowellOptPassLimit, self.partLikes, self.data.cData, self.model.cModel) if not self.cTree: gm.append("Unable to allocate a cTree") raise P4Error(gm) for i in range(len(self.nodes)): n = self.nodes[i] if n.nodeNum == var.NO_ORDER: continue # seems a little mixed up. if i in self.preOrder: inTree = 1 else: inTree = 0 # We include the inTree as a flag for whether the node # is in the tree or not. If the inTree flag is 0, # then the node is not actually part of the tree, and so # clNeedsUpdating is turned off. n.cNode = pf.p4_newNode( n.nodeNum, self.cTree, n.seqNum, n.isLeaf, inTree) if not n.cNode: gm.append("Unable to allocate a cNode") raise P4Error(gm)
[docs] def setCStuff(self): """Transfer info about self to c-language stuff. Transfer relationships among nodes, the root position, branch lengths, model usage info (ie what model attributes apply to what nodes), and pre- and post-order.""" #gm = ['Tree.setCStuff()'] # Set node relations, br.len, root, node modelNums, preOrder?, # postOrder # Set relations- parent, leftChild, sibling. Here's the code for # pf.p4_setRelative(int theCNode, int relation, int relNum) # parent- relation = 0, leftChild- relation = 1, sibling- relation # = 2 for n in self.iterNodes(): if n.parent: pf.p4_setNodeRelation(n.cNode, 0, n.parent.nodeNum) else: pf.p4_setNodeRelation(n.cNode, 0, -1) # "-1" gives NULL if n.leftChild: pf.p4_setNodeRelation(n.cNode, 1, n.leftChild.nodeNum) else: pf.p4_setNodeRelation(n.cNode, 1, -1) if n.sibling: pf.p4_setNodeRelation(n.cNode, 2, n.sibling.nodeNum) else: pf.p4_setNodeRelation(n.cNode, 2, -1) # Root pf.p4_setTreeRoot(self.cTree, self.root.cNode) # br.lens for n in self.iterNodesNoRoot(): #pf.p4_setBrLen(n.cNode, n.br.len, n.br.lenChanged) pf.p4_setBrLen(n.cNode, n.br.len) # Model usage info if self.model.isHet: for pNum in range(self.model.nParts): if self.model.parts[pNum].isHet: # print "setCStuff(). about to setCompNum" for n in self.iterNodes(): pf.p4_setCompNum(n.cNode, pNum, n.parts[pNum].compNum) if n != self.root: pf.p4_setRMatrixNum( n.cNode, pNum, n.br.parts[pNum].rMatrixNum) pf.p4_setGdasrvNum( n.cNode, pNum, n.br.parts[pNum].gdasrvNum) # pre- and postOrder if not self.preAndPostOrderAreValid: self.setPreAndPostOrder()
[docs] def _commonCStuff(self, resetEmpiricalComps=True): """Allocate and set c-stuff, and setPrams.""" if not self.data: if self.name: gm = ["Tree %s (_commonCStuff)" % self.name] else: gm = ["Tree (_commonCStuff)"] gm.append( "This tree has no data attached. Before doing an optimization, likelihood") gm.append( "calculation, or simulation, you need to do something like this:") gm.append(" theTree.data = theData") raise P4Error(gm) #print("self.cTree = %s" % self.cTree) if not self.cTree: # This calls self.modelSanityCheck(), which calls # self.setEmpiricalComps() self._allocCStuff(resetEmpiricalComps=resetEmpiricalComps) #print("About to self.model.setCStuff()") self.model.setCStuff() #print("About to self.setCStuff()") self.setCStuff() #print("about to p4_setPrams()...") pf.p4_setPrams(self.cTree, -1) # "-1" means do all parts
#print("finished _commonCStuff()")
[docs] def calcLogLike(self, verbose=1, resetEmpiricalComps=True): """Calculate the likelihood of the tree, without optimization.""" self._commonCStuff(resetEmpiricalComps=resetEmpiricalComps) # print("about to p4_treeLogLike()...") # second arg is getSiteLikes self.logLike = pf.p4_treeLogLike(self.cTree, 0) if verbose: print("Tree.calcLogLike(). %f" % self.logLike)
[docs] def optLogLike(self, verbose=1, method="BOBYQA", optBrLens=True): """Calculate the likelihood of the tree, with optimization. There are different optimization methods-- choose one. I've made 'BOBYQA' the default, as it is very fast and seems to be working. It is from the nlopt library. Other opt methods include --- newtAndBrentPowell -- fairly fast, and works well. It was the default. Perhaps use this in combination with BOBYQA, eg t.optLogLike(method="BOBYQA") t.optLogLike(method="newtAndBrentPowell") The 'allBrentPowell' optimizer was the default several years ago, as it seems to be the most robust, although it is slow. It might be good for checking important calculations. 'newtAndBOBYQA' --- fast and seems to work well. As suggested above, for difficult optimizations it may help to repeat the call to optLogLike(), perhaps with a different method. Arg optBrLens (default True), can be turned off. This week, this only works with method="BOBYQA". """ gm = ["Tree.optLogLike()"] if verbose: theStartTime = time.time() if 0: for n in self.iterNodesNoRoot(): if n.br.len < var.BRLEN_MIN: gm.append("All branch lengths should be greater than or equal to var.BRLEN_MIN,") gm.append(f" which at the moment is {var.BRLEN_MIN}") gm.append(f"Got a branch length of {n.br.len:.8f} {n.br.len:g}") gm.append("Either make the branch length bigger, or lower var.BRLEN_MIN.") gm.append("You could, for example, t.stripBrLens() which makes all br lens default 0.1") raise P4Error(gm) if not optBrLens: if method != "BOBYQA": gm.append("Turning arg optBrLens off only works with BOBYQA") raise P4Error(gm) self._commonCStuff() if method == "newtAndBrentPowell": pf.p4_newtSetup(self.cTree) pf.p4_newtAndBrentPowellOpt(self.cTree) elif method == "allBrentPowell": pf.p4_allBrentPowellOptimize(self.cTree) elif method == "newtAndBOBYQA": pf.p4_newtSetup(self.cTree) pf.p4_newtAndBOBYQAOpt(self.cTree) elif method == "BOBYQA": if optBrLens: pf.p4_allBOBYQAOptimize(self.cTree, 1) else: pf.p4_allBOBYQAOptimize(self.cTree, 0) else: gm.append('method should be one of "newtAndBrentPowell", "allBrentPowell", "newtAndBOBYQA", or "BOBYQA"') raise P4Error(gm) # Do a final like calc. (second arg is getSiteLikes) self.logLike = pf.p4_treeLogLike(self.cTree, 0) # get the brLens brLens = pf.p4_getBrLens(self.cTree) for n in self.iterNodesNoRoot(): n.br.len = brLens[n.nodeNum] # get the other free prams prams = pf.p4_getFreePrams(self.cTree) self.model.restoreFreePrams(prams) if verbose: print("optLogLike = %f" % self.logLike) theEndTime = time.time() print("cpu time %s seconds." % (theEndTime - theStartTime))
[docs] def optTest(self): self._commonCStuff() theStartTime = time.time() doXfer = 0 for i in range(1): if doXfer: self.model.setCStuff() self.setCStuff() pf.p4_setPrams(self.cTree, -1) self.logLike = pf.p4_treeLogLike(self.cTree, 0) if doXfer: # get the brLens brLens = pf.p4_getBrLens(self.cTree) for i in range(len(self.nodes)): n = self.nodes[i] if n != self.root: n.br.len = brLens[i] # get the other free prams prams = pf.p4_getFreePrams(self.cTree) self.model.restoreFreePrams(prams) print("time %s seconds." % (time.time() - theStartTime))
[docs] def simulate(self, calculatePatterns=True, resetSequences=True, resetNexusSetsConstantMask=True, refTree=None): """Simulate into the attached data. The tree self needs to have a data and model attached. Generation of random numbers uses the GSL random number generator. The state is held in var.gsl_rng, which is None by default. If you do a simulation using this method, it will use ``var.gsl_rng`` if it exists, or make it if it does not exist yet. When it makes it, it seeds the state based on the current time. That should give you lots of variation in the simulations. If on the other hand you want to make simulations that are the same you can reseed the randomizer with the same seed whenever you do it, like this:: if not var.gsl_rng: var.gsl_rng = pf.gsl_rng_get() # unusually, set the seed with each simulation mySeed = 23 # your chosen int seed pf.gsl_rng_set(var.gsl_rng, mySeed) The usual way to simulate does not use reference data. An unusual way to simulate comes from (inspired by?) PhyloBayes, where the simulation is conditional on the original data. It uses conditional likelihoods of that reference data at the root. To turn that on, set refTree to the tree+model+data that you would like to use. Calculate a likelihood with that refTree before using it, so that conditional likelihoods are set. The tree and model for refTree should be identical to the tree and model for self. Args: calculatePatterns (bool): True by default. Whether to "compress" the newly simulated data to facilitate a faster likelihood calculation. resetSequences (bool): True by default. whether to bring the simulated sequences in C back into Python resetNexusSetsConstantMask (bool): True by default. When simulations are made, the constant mask in any associated nexus sets will get out of sync. Setting this to True makes a new mask and sets it. refTree (Tree): None by default. If supplied, a tree+model+data which has had its likelihood calculated, where the tree+model is identical to self. """ if refTree: from p4.tree import Tree assert isinstance(refTree, Tree) assert refTree.model assert refTree.data if not refTree.cTree: refTree.calcLogLike(verbose=False) assert refTree.model.cModel assert refTree.data.cData if not var.gsl_rng: var.gsl_rng = pf.gsl_rng_get() pf.gsl_rng_set(var.gsl_rng, int(time.time())) self._commonCStuff() if refTree: assert refTree.data.cData != self.data.cData assert refTree.data.nParts == self.data.nParts assert refTree.data.nTax == self.data.nTax for i in range(self.data.nTax): assert refTree.data.taxNames[i] == self.data.taxNames[i] assert len(refTree.data.alignments) == len(self.data.alignments) assert refTree.logLike, "Do a likelihood calculation with the refTree before using it here." # could have some more checks ... # If there is a NexusSets object attached to any of the alignments # in the Data, the constant sites mask at least will become out of sync, but we can't just # delete the whole nexusSets object, as they define what the parts are. # for a in self.data.alignments: # # if a.nexusSets: # a.nexusSets = None # Probably better to do something like this # a.nexusSets.constant.mask = self.constantMask() # at the end. # print "About to pf.p4_simulate(self.cTree)" if refTree: pf.p4_simulate(self.cTree, refTree.cTree, var.gsl_rng) else: pf.p4_simulate(self.cTree, 0, var.gsl_rng) if calculatePatterns: for p in self.data.parts: pf.makePatterns(p.cPart) pf.setGlobalInvarSitesVec(p.cPart) if resetSequences: self.data.resetSequencesFromParts() if resetNexusSetsConstantMask: for a in self.data.alignments: if a.nexusSets: a.nexusSets.constant.mask = a.constantMask() else: if resetNexusSetsConstantMask: gm = ['Tree.simulate().'] gm.append( "resetSequences is not set, but resetNexusSetsConstantMask is set,") gm.append("which is probably not going to work as you want.") raise P4Error(gm)
[docs] def ancestralStateDraw(self): """Make a draw from the inferred root character state distribution This method works on a tree with an attached model and data. Conditional on the tree, branch lengths, model, and data, this method infers the ancestral character states of the root node. However, that inference is probabilistic, a distribution, and this method takes a single draw. It returns a string. """ gm = ['Tree.ancestralStateDraw().'] self._commonCStuff() self.logLike = pf.p4_treeLogLike(self.cTree, 0) draw = numpy.empty(4, dtype=numpy.int32) ancSts = [] for pNum in range(self.data.nParts): dp = self.data.parts[pNum] ancStsPart = [] for seqPos in range(dp.nChar): pf.p4_drawAncState(self.cTree, pNum, seqPos, draw) if draw[1] >= 0: # gamma cat if it is a variable site, else -1 assert draw[2] == 0 # not invar assert draw[0] >= 0 # char num ancStsPart.append(dp.symbols[draw[0]]) elif draw[2]: # isInvar, zero if not assert draw[0] == -1 assert draw[1] == -1 assert draw[3] >= 0 # invar char num ancStsPart.append(dp.symbols[draw[3]]) else: gm.append("Problem with returned draw. Got %s" % draw) raise P4Error(gm) assert len(ancStsPart) == dp.nChar ancSts.append(''.join(ancStsPart)) return ''.join(ancSts)
[docs] def getSiteLikes(self): """Likelihoods, not log likes. Placed in self.siteLikes, a list.""" self._commonCStuff() # second arg is getSiteLikes self.logLike = pf.p4_treeLogLike(self.cTree, 1) self.siteLikes = [] for p in self.data.parts: self.siteLikes += pf.getSiteLikes(p.cPart)
# def getSiteRates(self): # """Get posterior mean site rate, and gamma category. # This says two things -- # 1. The posterior mean site rate, calculated like PAML # 2. Which GDASRV category contributes most to the likelihood. # The posterior mean site rate calculation requires that there be # only one gdasrv over the tree, which will usually be the case. # For placement in categories, if its a tie score, then it is placed # in the first one. # The list of site rates, and the list of categories, both with one # value for each site, are put into separate numpy arrays, returned # as a list, ie [siteRatesArray, categoriesArray] # There is one of these lists for each data partition, and the results as a # whole are returned as a list. So if you only have one data # partition, then you get a 1-item list, and that single item is a list with 2 # numpy arrays. Ie [[siteRatesArray, categoriesArray]] # If nGammaCat for a partition is 1, it will give that partition an # array of ones for the site rates and zeros for the categories. # """ # self._commonCStuff() # # second arg is getSiteLikes # self.logLike = pf.p4_treeLogLike(self.cTree, 0) # #self.winningGammaCats = [] # # for p in self.data.parts: # # self.winningGammaCats += pf.getWinningGammaCats(p.cPart) # results = [] # for partNum in range(len(self.data.parts)): # if len(self.model.parts[partNum].gdasrvs) > 1: # gm = ['Tree.getSiteRates()'] # gm.append("Part %i has %i gdasrvs. Maximum 1 allowed." % ( # partNum, len(self.model.parts[partNum].gdasrvs))) # raise P4Error(gm) # for partNum in range(len(self.data.parts)): # p = self.data.parts[partNum] # if self.model.parts[partNum].nGammaCat == 1: # siteRates = numpy.ones(p.nChar, dtype=numpy.double) # gammaCats = numpy.zeros(p.nChar, numpy.int32) # elif self.model.parts[partNum].nGammaCat > 1: # siteRates = numpy.zeros(p.nChar, dtype=numpy.double) # gammaCats = numpy.zeros(p.nChar, numpy.int32) # work = numpy.zeros( # self.model.parts[partNum].nGammaCat, dtype=numpy.double) # for charNum in range(p.nChar): # gammaCats[charNum] = -1 # #pf.getWinningGammaCats(self.cTree, p.cPart, i, gammaCats, work) # pf.getSiteRates( # self.cTree, p.cPart, partNum, siteRates, gammaCats, work) # # print siteRates # # print gammaCats # # print work # if 0: # counts = numpy.zeros( # self.model.parts[partNum].nGammaCat, numpy.int32) # for charNum in range(p.nChar): # counts[winningGammaCats[charNum]] += 1 # print(counts) # else: # raise P4Error("This should not happen.") # results.append([siteRates, gammaCats]) # return results