import sys
import string
import io
import math
import copy
import os
import p4.func
import time
import glob
from p4.var import var
from p4.p4exceptions import P4Error
from p4.node import Node, NodePart, NodeBranch, NodeBranchPart
from p4.nexustoken import nextTok, safeNextTok
from p4.distancematrix import DistanceMatrix
import numpy
import p4.pf as pf
from p4.model import Model
from p4.data import Data
from p4.alignment import Part
from p4.treepicture import TreePicture
import random
import pickle
def _fixFileName(fName):
if fName.count('.') or fName.count(' '):
fName = list(fName)
for i in range(len(fName)):
theChar = fName[i]
if theChar == '.' or theChar == ' ':
fName[i] = '_'
fName = ''.join(fName)
return fName
longMessage1 = """
(Boring old) Chi-square test for compositional homogeneity
==========================================================
The statistic is Sum[(Obs - Exp)^2 / Exp],
where Exp comes from the data.
The statistic is X^2 (X squared), in the sense used by
Sokal & Rohlf in 'Biometry', to distinguish it from
'chi square', which is a distribution, not a statistic.
This is (mostly) the same as the test in PAUP, using the
basefreq command. There are small differences having
to do with calculation of degrees of freedom.
Significance is assessed by Chi-square.
"""
longMessage2 = """
Tree- and model-based composition fit test
==========================================
The statistic is Sum[(Obs - Exp)^2 / Exp],
like the statistic used in the Chi-squared test,
except that Exp comes from the model, not from the data.
I call the statistic X^2_m (X squared sub m) since it is like
the X^2 statistic used in the chi-square test above, but uses
the m subscript to say that the expected values come from the model.
Significance is assessed by simulations on the tree and model.
A critical point of 95% is used to decide if the data fits or not.
"""
# def __del__(self, freeTree=pf.p4_freeTree, freeNode=pf.p4_freeNode):
# def __del__(self, freeTree=pf.p4_freeTree, dp_freeTree = pf.dp_freeTree, mysys=sys):
# def __del__(self, freeTree=pf.p4_freeTree, dp_freeTree = pf.dp_freeTree):
[docs]
class RMatrix(object):
def __init__(self):
self.num = -1
self.partNum = None
self.free = None
self.spec = None
self.symbol = None
self.val = None
self.nNodes = 0
[docs]
class Gdasrv(object):
def __init__(self):
self.num = -1
self.partNum = None
self.free = None
self.symbol = None
# self.val=None
self._val = numpy.zeros(1, dtype=numpy.double)
self.freqs = None
self.rates = None
self.nGammaCat = None
self.c = None # a p4_gdasrvStruct, if it exists. It is used in calcRates, below.
self.nNodes = 0
def _getVal(self):
return self._val
def _setVal(self, theVal):
if theVal < 1.e-16:
gm = ["Gdasrv._setVal()"]
gm.append("Attempt to set Gdasrv.val (ie alpha) to %g" % theVal)
gm.append(
"However, we cannot calculate the discrete categories with a value so low.")
raise P4Error(gm)
self._val[0] = theVal
self.calcRates()
def _delVal(self):
gm = ["Don't/Can't delete this Gdasrv property."]
raise P4Error(gm)
val = property(_getVal, _setVal, _delVal)
[docs]
def calcRates(self):
# Use either the p4_gdasrvStruct, or just use the NumPy
# array vals (np = NumPy).
# print("self.c = %s" % self.c)
if self.c:
pf.gdasrvCalcRates(self.c)
else:
pf.gdasrvCalcRates_np(self.nGammaCat, self._val[0], self.freqs, self.rates)
# print('xxx self.rates = %s, val=%s' % (self.rates, self._val[0]))
[docs]
class Comp(object):
def __init__(self):
self.num = -1
self.partNum = None
self.free = None
self.spec = None
self.symbol = None
self._val = None
self.nNodes = 0
def _getVal(self):
return self._val
def _setVal(self, theVal):
if self._val is None:
self._val = numpy.array(theVal)
else:
#print("Resetting comp val.")
assert len(self._val) == len(theVal)
for i in range(len(theVal)):
self._val[i] = theVal[i]
def _delVal(self):
gm = ["Don't/Can't delete this Comp property."]
raise P4Error(gm)
val = property(_getVal, _setVal, _delVal)
[docs]
class PInvar(object):
def __init__(self):
self.num = -1
self.partNum = None
self.free = None
self.val = None
[docs]
class Tree(object):
"""A phylogenetic tree.
**Some instance variables**
* ``fName``, if the Tree was read from a file
* ``name``, the name of the tree.
* ``root``, the root node
* ``nodes``, a list of nodes.
* ``preOrder`` and ``postOrder``, lists of node numbers
* ``recipWeight``, the weight, if it exists, is usually 1/something, so the reciprocal looks nicer ...
* ``nexusSets``, if it exists, a NexusSets object.
**Properties**
* ``taxNames``, a list of names. Usually the order is important!
* ``data``, a :class:`Data.Data` object
* ``model``, a :class:`Model.Model` object
* ``nTax``, the number of taxa
* ``nInternalNodes``, the number of non-leaf nodes
**The node method**
You often will want to refer to a node in a tree. This can be
done via its name, or its nodeNum, or as an object, via the method
:meth:`~p4.tree.Tree.node`. For example, from a Tree ``t`` you can
get node number 3 via::
n = t.node(3)
and you can get the node that is the parent of the Mastodon via::
n = t.node('Mastodon').parent
For many methods that require specifying a node, the method argument is *nodeSpecifier*, eg::
t.reRoot(23)
``reRoots``'s the tree to node number 23.
**Describe, draw, and get information about the tree**
.. autosummary::
:nosignatures:
~Tree.dump
~Tree.draw
~Tree.textDrawList
~Tree.tv
~Tree.btv
~Tree.isFullyBifurcating
~Tree.taxSetIsASplit
~Tree.getAllLeafNames
~Tree.getChildrenNums
~Tree.getDegree
~Tree.getLen
~Tree.getNodeNumsAbove
~Tree.getPreAndPostOrderAbove
~Tree.getPreAndPostOrderAboveRoot
~Tree.getSeqNumsAbove
~Tree.subTreeIsFullyBifurcating
~Tree.summarizeModelComponentsNNodes
~Tree.verifyIdentityWith
**Write**
.. autosummary::
:nosignatures:
~Tree.write
~Tree.writeNewick
~Tree.writeNexus
~Tree.writePhylip
~Tree.tPickle
See also Trees methods :meth:`p4.trees.Trees.writeNexus` and
:meth:`p4.trees.Trees.writeNewick` for doing trees by the bunch.
**Iteration over the nodes**
Sometimes you don't want to just iterate over the self.nodes list,
because after some manipulations a node might be in self.nodes but
not actually in the tree; using these 'iter' methods takes care of
that, skipping such nodes.
.. autosummary::
:nosignatures:
~Tree.iterInternals
~Tree.iterInternalsNoRoot
~Tree.iterInternalsNoRootPostOrder
~Tree.iterInternalsNoRootPreOrder
~Tree.iterInternalsPostOrder
~Tree.iterLeavesNoRoot
~Tree.iterLeavesPostOrder
~Tree.iterLeavesPreOrder
~Tree.iterNodes
~Tree.iterNodesNoRoot
~Tree.iterPostOrder
~Tree.iterPreOrder
~Tree.nextNode
See also :class:`~p4.node.Node` methods that do similar things starting from a given node.
**Copy**
.. autosummary::
:nosignatures:
~Tree.dupe
~Tree.copyToTree
~Tree.dupeSubTree
**In combination with Data and Model**
.. autosummary::
:nosignatures:
~Tree.calcLogLike
~Tree.optLogLike
~Tree.simulate
~Tree.getSiteLikes
~Tree.ancestralStateDraw
~Tree.bigXSquaredSubM
~Tree.compStatFromCharFreqs
~Tree.compoTestUsingSimulations
~Tree.modelFitTests
~Tree.modelSanityCheck
~Tree.simsForModelFitTests
~Tree.getEuclideanDistanceFromSelfDataToExpectedComposition
**Setting a model**
.. autosummary::
:nosignatures:
~Tree.newComp
~Tree.newRMatrix
~Tree.newGdasrv
~Tree.setPInvar
~Tree.setRelRate
~Tree.setModelComponentOnNode
~Tree.setModelComponentsOnNodesRandomly
~Tree.setModelComponentsNNodes
~Tree.summarizeModelComponentsNNodes
~Tree.setNGammaCat
~Tree.setTextDrawSymbol
**Tree manipulation**
.. autosummary::
:nosignatures:
~Tree.addLeaf
~Tree.addNodeBetweenNodes
~Tree.addSibLeaf
~Tree.addSubTree
~Tree.allBiRootedTrees
~Tree.collapseNode
~Tree.collapseClade
~Tree.ladderize
~Tree.lineUpLeaves
~Tree.nni
~Tree.nni2
~Tree.pruneSubTreeWithoutParent
~Tree.pruneSubTreeWithParent
~Tree.randomSpr
~Tree.randomizeTopology
~Tree.reRoot
~Tree.reconnectSubTreeWithoutParent
~Tree.reconnectSubTreeWithParent
~Tree.removeEverythingExceptCladeAtNode
~Tree.removeNode
~Tree.removeAboveNode
~Tree.removeRoot
~Tree.renameForPhylip
~Tree.resolve
~Tree.resolvePolytomyAtNode
~Tree.restoreDupeTaxa
~Tree.restoreNamesFromRenameForPhylip
~Tree.rotateAround
~Tree.spr
~Tree.stripBrLens
**Misc**
.. autosummary::
:nosignatures:
~Tree.checkDupedTaxonNames
~Tree.checkSplitKeys
~Tree.checkTaxNames
~Tree.checkThatAllSelfNodesAreInTheTree
~Tree.inputTreesToSuperTreeDistances
~Tree.makeSplitKeys
~Tree.readBipartitionsFromPaupLogFile
~Tree.recalculateSplitKeysOfNodeFromChildren
~Tree.setNexusSets
~Tree.topologyDistance
~Tree.tvTopologyCompare
~Tree.patristicDistanceMatrix
"""
def __init__(self):
self.fName = None # The name of the file it came from
self.name = None
self.root = None
self.nodes = []
# nodeNums of nodes root -> tips A numpy array
self.preOrder = None
# nodeNums of nodes tips -> root A numpy array
self.postOrder = None
self.preAndPostOrderAreValid = 0
# Usually weight is 1/N, so the reciprocal looks nicer
self.recipWeight = None
# self.weight = None # Only for floating point weights,
# so not usually ...
# An ordered list. self.taxNames is a property
self._taxNames = []
# A Data object. self.data is a property
self._data = None
self.cTree = None # A pointer to a c-struct
self.logLike = None
self.partLikes = None
# A Model object. self.model is a property
self._model = None
# self.nTax, a property
self._nTax = 0
self._nInternalNodes = -1
# self.nInternalNodes, a property
self.doDataPart = 0
self.nexusSets = None
self.nodeForSplitKeyDict = None
#########################################################
# Properties: data, model
#########################################################
#########################################################
# Properties: taxNames, nTax, nInternalNodes
#########################################################
@property
def taxNames(self):
"""(property) taxNames"""
return self._taxNames
@taxNames.setter
def taxNames(self, theTaxNames):
gm = ['Tree.taxNames']
if not isinstance(theTaxNames, list):
gm.append("You can only set property 'taxNames' to a list.")
gm.append("Got attempt to set to '%s'" % theTaxNames)
raise P4Error(gm)
self._taxNames = theTaxNames
if theTaxNames:
self.checkTaxNames()
@taxNames.deleter
def taxNames(self):
gm = ['Tree.taxNames']
gm.append(" Caught an attempt to delete self.taxNames, but")
gm.append("self.taxNames is a property, so you can't delete it.")
gm.append("But you can set it to an empty list if you like.")
raise P4Error(gm)
def _setTaxNamesFromLeaves(self):
tax = []
for n in self.iterNodes():
if n.isLeaf and n.name:
tax.append(n.name)
# This next line should not be needed, as root leaves should be
# leaves.
# terminal root that has a taxName
elif n == self.root and n.name and n.getNChildren() < 2:
tax.append(n.name)
tax.sort()
self._taxNames = tax
if self._taxNames:
self.checkTaxNames()
def _getNTax(self):
# We can't rely on len(self.taxNames), cuz it might not exist.
# if hasattr(self, '_nTax') and self._nTax:
if self._nTax:
return self._nTax
else:
nTax = 0
if self.nodes:
for n in self.iterNodes():
if n.isLeaf:
nTax += 1
self._nTax = nTax
return nTax
nTax = property(_getNTax)
"""(property) nTax"""
def _getNInternalNodes(self):
if self._nInternalNodes >= 0:
return self._nInternalNodes
elif not self.nodes:
return 0
else:
self._nInternalNodes = len([n for n in self.iterInternalsNoRoot()])
if not self.root.isLeaf:
self._nInternalNodes += 1
return self._nInternalNodes
def _setNInternalNodes(self, theNInternalNodes):
gm = ['Tree._setNInternalNodes()']
gm.append("Caught an attempt to set self.nInternalNodes, but")
gm.append(
"self.nInternalNodes is a property, so you shouldn't do that.")
raise P4Error(gm)
def _delNInternalNodes(self):
self._nInternalNodes = -1
nInternalNodes = property(_getNInternalNodes, _setNInternalNodes, _delNInternalNodes)
"""(property) nInternalNodes"""
##################################################
##################################################
[docs]
def dupe(self):
"""Duplicates self, but with no c-pointers. And no data object.
If there is a model, it is duped.
Returns a copy of self.
"""
storedData = None
if self.data:
storedData = self.data
# We don't want to have to copy a big data object, now do we?
self.data = None
dupe = copy.deepcopy(self)
# print 'Tree.dupe() self.root=%s, dupe.root=%s' % (self.root,
# dupe.root)
# Delete cPointers
for n in dupe.nodes:
if n.cNode:
n.cNode = None
if dupe.cTree:
dupe.cTree = None
if dupe.model and dupe.model.cModel:
dupe.model.cModel = None
if storedData:
self.data = storedData
return dupe
[docs]
def parseNexus(self, flob, translationHash=None, doModelComments=0):
"""Start parsing nexus format newick tree description.
From just after the command word 'tree', to the first paren of
the Newick part of the tree.
Args:
flob: an open file or file-like object
translationHash (dict): associates short names or numbers with
long proper taxon names
doModelComments (bool): whether to parse p4-specific model
command comments in the tree description
Returns:
None
"""
gm = ['Tree.parseNexus()'] # re-defined below
if 0:
print('Tree.parseNexus() translationHash = %s' % translationHash)
print(' doModelComments = %s (nParts)' % doModelComments)
print(' flob type is %s' % type(flob))
tok = safeNextTok(flob, 'Tree.parseNexus()')
# print 'parseNexus() tok = %s' % tok
tok = p4.func.nexusUnquoteName(tok)
if tok == '*':
print(gm[0])
print(" Ignoring '*' in tree description")
tok = safeNextTok(flob, 'Tree.parseNexus()')
if not p4.func.nexusCheckName(tok):
gm.append("Bad tree name: '%s'" % tok)
raise P4Error(gm)
self.name = tok
# print "got name: '%s'" % tok
# print "%s" % tok
gm = ["Tree.parseNexus() '%s'" % self.name] # re-defining
tok = safeNextTok(flob, gm[0])
if tok != '=':
gm.append("Tree name must be followed by '='")
raise P4Error(gm)
# Generally this is the beginning of the newick tree
# description. But we have to look ahead to see if there is a
# weight comment.
savedPos = flob.tell()
while 1:
beforeSafeNextTokPosn = flob.tell()
#print('mnop var.nexus_getAllCommandComments = %s' % var.nexus_getAllCommandComments)
tok = safeNextTok(flob, gm[0]) # skips [&U] if there is one
#print("parseNexus: tok after '=' is '%s'. Before pos %i, after pos %i" % (
# tok, beforeSafeNextTokPosn, flob.tell()))
# This next bit will only happen if either var.nexus_getWeightCommandComments
# or var nexus_getAllCommandComments is set.
if tok[0] == '[':
self.getWeightCommandComment(tok)
elif tok == '(':
flob.seek(beforeSafeNextTokPosn)
#print("wxy var.nexus_getAllCommandComments is %s" % var.nexus_getAllCommandComments)
self.parseNewick(flob, translationHash, doModelComments)
# self._initFinish()
break
elif tok == ';':
gm.append("Got ';' before any tree description.")
raise P4Error(gm)
elif tok[0] in string.ascii_letters + string.digits + '_' + "'":
flob.seek(savedPos, 0)
self.parseNewick(flob, translationHash, doModelComments)
# self._initFinish()
break
else:
gm.append('Expecting a newick tree description.')
raise P4Error(gm)
self._initFinish()
# print 'finished Tree.parseNexus()'
# print 'got recipWeight = %s' % self.recipWeight
# def printStack(self, theStack): # only used for debugging parseNewick()
# print 'stack = ',
# for n in theStack:
# print "%i['%s'] " % (n.nodeNum, n.name),
# print ''
[docs]
def parseNewick(self, flob, translationHash, doModelComments=0):
"""Parse Newick tree descriptions.
This is stack-based, and does not use recursion.
"""
if 0:
print('parseNewick here. doModelComments=%s' % doModelComments)
print(" translationHash=%s, self.taxNames=%s" % (translationHash, self.taxNames))
print(" flob type is %s, pos is %i" % (type(flob), flob.tell()))
print("wyy var.nexus_getAllCommandComments is %s" % var.nexus_getAllCommandComments)
print(f" var.punctuation is {var.punctuation}")
if self.name:
gm = ["Tree.parseNewick(), tree '%s'" % self.name]
else:
gm = ['Tree.parseNewick()']
if hasattr(flob, 'name') and flob.name:
self.fName = flob.name
gm[0] += ", File %s" % self.fName
stack = []
isAfterParen = 1 # to start, even tho its not true
isAfterComma = 0
parenNestLevel = 0
lastPopped = None
tok = nextTok(flob) # skip over [&U] if there is one.
# print("xx Got tok %s" % tok)
if not tok:
return
isQuotedTok = False
if tok.startswith("'"):
isQuotedTok = True
# Should generally be the opening paren, except if its a single-node
# tree.
tok = p4.func.nexusUnquoteName(tok)
if doModelComments:
# Turn on var.nexus_getAllCommandComments in order
# to be able to read model info on nodes, eg [& c0.1]
# restore at end
# We need this after the safeNextTok() above, in case there is a [&U] to be ignored.
savedP4Nexus_getAllCommandComments = var.nexus_getAllCommandComments
var.nexus_getAllCommandComments = 1
while tok != ';':
# print("top of loop tok '%s', isQuotedTok=%s, tok[0] is '%s'" % (tok, isQuotedTok, tok[0]))
if tok == '(':
# print "Got '(': new node (%i)." % len(self.nodes)
if not (isAfterParen or isAfterComma):
gm.append(
'Got badly-placed paren, not after a paren or comma.')
raise P4Error(gm)
newNode = Node()
if doModelComments:
for pNum in range(doModelComments):
newNode.parts.append(NodePart())
newNode.br.parts.append(NodeBranchPart())
# self.printStack(stack)
if len(stack):
newNode.parent = stack[-1]
if newNode.parent.leftChild == None:
newNode.parent.leftChild = newNode
else:
newNode.parent.rightmostChild().sibling = newNode
else:
if len(self.nodes) == 0:
self.root = newNode
# Sometimes. Generally not true-- corrected at the end.
newNode.isLeaf = 1
else:
gm.append('Something is wrong. Stack is empty.')
gm.append('Extra paren?')
raise P4Error(gm)
newNode.nodeNum = len(self.nodes)
self.nodes.append(newNode)
stack.append(newNode)
isAfterParen = 1
parenNestLevel += 1
elif tok == ',':
if isAfterParen:
gm.append('Got comma after paren.')
raise P4Error(gm)
elif isAfterComma:
gm.append('Got comma after comma.')
raise P4Error(gm)
# self.printStack(stack)
try:
lastPopped = stack.pop()
except IndexError:
gm.append('Empty stack. Out of place comma?')
raise P4Error(gm)
isAfterComma = 1
if len(stack) == 0:
gm.append('Empty stack. Out of place comma?')
raise P4Error(gm)
elif tok == ')':
try:
lastPopped = stack.pop()
except IndexError:
gm.append('Empty stack. Out of place unparen?')
raise P4Error(gm)
isAfterParen = 0
isAfterComma = 0
parenNestLevel = parenNestLevel - 1
if parenNestLevel < 0:
gm.append('Unmatched unparen.')
raise P4Error(gm)
if len(stack) == 0 and len(self.nodes) > 1:
gm.append('Empty stack. Out of place unparen?')
raise P4Error(gm)
elif tok[0] in string.ascii_letters or tok[0] in string.digits or tok[0] in var.nexus_safeChars \
or isQuotedTok or tok[0] in ['_', '#', '\\', '/', '"', '(', ')']:
# A single-node tree, not ()aName, rather just aName.
# print(f"parseNewick() here ACE tok={tok}")
if len(self.nodes) == 0:
isAfterParen = 1
if not (isAfterParen or isAfterComma):
# Probably a name of an internal node.
# print(f"parseNewick() here ACG tok={tok} isAfterParen={isAfterParen} len(stack)={len(stack)}")
if len(stack):
# print(f"parseNewick() here ACI stack[-1].name={stack[-1].name}")
# if stack[-1].isLeaf and stack[-1].name != '(':
if stack[-1].name:
if not var.newick_allowSpacesInNames:
# a second name after a node name, eg (A foo, B) =>foo is bad
# or eg (A, B)foo bar => bar is bad
gm.append("Badly placed token '%s'." % tok)
gm.append("Appears to be a second node name, after '%s'" % stack[-1].name)
gm.append('Missing comma maybe? Or punctuation or spaces in an unquoted name?')
gm.append("To allow reading Newick (or Nexus) with spaces, ")
gm.append("turn var.newick_allowSpacesInNames on")
raise P4Error(gm)
else:
stack[-1].name += ' '
stack[-1].name += tok
else:
# Usually this...
# print("parseNewick() here ACK naming node %i as '%s'" % (stack[-1].nodeNum, tok))
# We allow bad names on internal nodes, ie we do
# not nexusCheckName(tok)
stack[-1].name = tok
# We may have just named the root node, and it may be a leaf, and perhaps should have a translation.
# But we do not know that yet. Checked at the end.
else: # len(stack) == 0
if lastPopped and lastPopped.name == None: # ()A
# print "naming lastPopped node %i with '%s'" %
# (lastPopped.nodeNum, tok)
lastPopped.isLeaf = 1
#lastPopped.label = tok
lastPopped.name = tok
else:
gm.append("Badly placed token '%s' in tree description." % tok)
raise P4Error(gm)
else:
# A new terminal node.
if tok[0] in string.ascii_letters or tok[0] in ['_']:
if translationHash and tok in translationHash:
# print 'got key %s, val is %s' % (tok,
# translationHash[tok])
tok = translationHash[tok]
elif tok[0] in string.digits:
if var.nexus_allowAllDigitNames:
if translationHash and tok in translationHash:
# print 'got key %s, val is %s' % (tok,
# translationHash[tok])
tok = translationHash[tok]
else:
try:
tok = int(tok)
if translationHash and repr(tok) in translationHash:
tok = translationHash[repr(tok)]
elif translationHash and repr(tok) not in translationHash:
gm.append("There is a 'translation' for this tree, but the")
gm.append("number '%i' in the tree description" % tok)
gm.append('is not included in that translate command.')
raise P4Error(gm)
elif self.taxNames:
try:
tok = self.taxNames[tok - 1]
except IndexError:
gm.append("Can't make sense out of token '%s' for a new terminal node." % tok)
gm.append('There is no translate command, and the taxNames does not')
gm.append('have a value for that number.')
raise P4Error(gm)
else:
gm.append("We have a taxon name '%s', composed only of numerals." % tok)
gm.append(" ")
gm.append('The Nexus format allows tree specifications with no')
gm.append('translate command to use integers to refer to taxa.')
gm.append('That is possible because in a proper Nexus file the')
gm.append('taxa are defined before the trees. P4, however, does')
gm.append('not require definition of taxa before the trees, and in')
gm.append('the present case no definition was made. Deal with it.')
raise P4Error(gm)
except ValueError:
if translationHash and repr(tok) in translationHash:
tok = translationHash[repr(tok)]
# else: # starts with a digit, but it is not an int.
# gm.append('Problem token %s' % tok)
# raise P4Error(gm)
#print("Got terminal node '%s'" % tok)
newNode = Node()
if doModelComments:
for pNum in range(doModelComments):
newNode.parts.append(NodePart())
newNode.br.parts.append(NodeBranchPart())
newNode.isLeaf = 1
if p4.func.nexusCheckName(tok):
newNode.name = tok
# print 'got newNode.name = %s' % tok
else:
gm.append("Bad name '%s'" % tok)
raise P4Error(gm)
if len(stack):
newNode.parent = stack[-1]
if newNode.parent.leftChild == None:
newNode.parent.leftChild = newNode
else:
newNode.parent.rightmostChild().sibling = newNode
newNode.nodeNum = len(self.nodes)
if len(self.nodes) == 0:
self.root = newNode
self.nodes.append(newNode)
stack.append(newNode)
isAfterParen = 0
isAfterComma = 0
elif tok == ':':
# Looking for a br.len number, which might be eg 0.234 or -1.23e-05
# It might be a multi-token operation. Accumulate tok's in
# theNum
theNum = nextTok(flob)
if not theNum:
gm.append('Tree description ended with a colon. Bad!')
raise P4Error(gm)
# print " Got token after colon: '%s'" % theNum
if theNum == '-' or theNum == '+':
tok = nextTok(flob)
# print " Got tok: '%s' after '%s'" % (tok, theNum)
if not tok:
gm.append('Trying to deal with a branch length.')
gm.append("It didn't work, tho.")
gm.append("Got this after colon: '%s'" % theNum)
gm.append('followed by nothing.')
raise P4Error(gm)
theNum += tok
try:
# If it is a simple number like 0.123 or -23, then we are
# finished.
# Won't work if it ends in 'e'
stack[-1].br.len = float(theNum)
#print(' Successfully got br.len %f' % stack[-1].br.len)
except ValueError:
# The first bit after the colon is hopefully something like
# +1.23e
if theNum[-1] not in ['e', 'E']:
gm.append(
'Trying to deal with a branch length after a colon, but I am totally confused.')
gm.append("Can't make sense out of '%s'" % theNum)
raise P4Error(gm, 'newick_badBranchLength')
try:
float(theNum[:-1])
except ValueError:
gm.append(
'Trying to deal with a branch length after a colon, but I am totally confused.')
gm.append("Can't make sense out of '%s'" % theNum)
raise P4Error(gm, 'newick_badBranchLength')
# Now we are sure that the first bit *is* something like +1.23e
# We do not allow spaces after the 'e', so we do not use nextTok().
# That introduces a bug, where comments inserted in the number don't get ignored. <<== unfixed bug!
# The first thing must be a '+' or a '-'.
c = flob.read(1)
if not c:
gm.append(
'Trying to deal with a branch length, possibly in scientific notation.')
gm.append(
"Got '%s' after the colon, but then nothing." % theNum)
raise P4Error(gm)
if c not in ['+', '-']:
gm.append(
'Trying to deal with a branch length, possibly in scientific notation.')
gm.append("Got '%s' after the colon." % theNum)
gm.append(
"Expecting a '+' or '-' after that (no spaces allowed).")
gm.append("Got '%s'." % c)
raise P4Error(gm)
# Accumulate characters in 'theExp'. We need at least one
# digit.
theExp = c
c = flob.read(1)
if not c:
gm.append('Trying to deal with a branch length, possibly in scientific notation.')
gm.append("Got '%s%s' after the colon, but then nothing." % (theNum, theExp))
raise P4Error(gm)
if c not in string.digits:
gm.append("Trying to deal with a branch length, possibly in scientific notation.")
gm.append("Got '%s%s' after the colon." % (theNum, theExp))
gm.append('Expecting one or more digits.')
gm.append("Got '%s'" % c)
raise P4Error(gm)
theExp += c
# So we got one good digit. Are there any more?
while 1:
positionBeforeRead = flob.tell()
c = flob.read(1)
if not c:
gm.append('Trying to deal with a branch length, possibly in scientific notation.')
gm.append("Got '%s%s' after the colon, but then nothing." % (theNum, theExp))
raise P4Error(gm)
# We got something. If its a digit, add it to
# theExp. If its anything else, back up one
# space and then break
if c in string.digits:
theExp += c
else:
flob.seek(positionBeforeRead)
break
#print(" At this point, theNum='%s' and theExp='%s'" % (theNum, theExp))
try:
# print " Trying to see if theExp '%s' can be
# converted to an int." % theExp
int(theExp)
try:
theBrLen = float(theNum + theExp)
# print ' Successfully got br.len %g (from %s%s)'
# % (theBrLen, theNum, theExp)
stack[-1].br.len = theBrLen
except ValueError:
gm.append('Trying to deal with a branch length, possibly in scientific notation.')
gm.append("It didn't work, tho.")
gm.append("Got these after colon: '%s' and '%s'" % (theNum, theExp))
gm.append('And they could not be converted to an exponential float.')
raise P4Error(gm)
except ValueError:
gm.append('Trying to deal with a branch length, possibly in scientific notation.')
gm.append("It didn't work, tho.")
gm.append("Got these after colon: '%s' and '%s'." % (theNum, theExp))
gm.append('And the latter does not appear to be an int.')
raise P4Error(gm)
elif tok[0] == '[':
# print("openSquareBracket. Got tok '%s'" % tok)
# if doModelComments is set, it should be set to nParts.
if doModelComments:
# eg [& c0.1 r0.0]
n = stack[-1]
# print('got comment %s, node %i' % (tok, n.nodeNum))
cFlob = io.StringIO(tok)
cFlob.seek(2)
tok2 = safeNextTok(cFlob)
while 1:
if tok2 == ']':
break
elif tok2[0] in ['c', 'r', 'g']:
ending = tok2[1:]
splitEnding = ending.split('.')
try:
firstNum = int(splitEnding[0])
secondNum = int(splitEnding[1])
except ValueError:
gm.append('Bad command comment %s' % tok)
raise P4Error(gm)
if tok2[0] == 'c':
n.parts[firstNum].compNum = secondNum
if tok2[0] == 'r':
n.br.parts[firstNum].rMatrixNum = secondNum
if tok2[0] == 'g':
n.br.parts[firstNum].gdasrvNum = secondNum
else:
gm.append('Bad command comment %s' % tok)
raise P4Error(gm)
tok2 = safeNextTok(cFlob)
elif 0:
# Ugly hack for RAxML trees with bootstrap
# supports in square brackets after the br len, on
# internal nodes. First modify the eg [100] to be
# [&100], set var.nexus_getAllCommandComments =
# True, and turn on this elif section.
myNode = stack[-1]
assert not myNode.isLeaf
assert not myNode.name
mySupportString = tok[2:-1]
# print mySupportString
myNode.name = mySupportString
elif var.nexus_readBeastTreeCommandComments:
n = stack[-1]
i = 2
while i < (len(tok) - 1):
j = i
inBraces = False
while 1:
j += 1
# print tok[j]
if tok[j] == ']':
break
if tok[j] == '{':
inBraces = True
if inBraces:
if tok[j] == '}':
inBraces = False
else:
if tok[j] == ',':
break
substring = tok[i:j]
# print(substring)
splSS = substring.split('=')
theNameString = splSS[0].strip()
for badChr in "%()":
if badChr in theNameString:
theNameString = theNameString.replace(badChr, '')
theValString = splSS[1]
if '{' in theValString:
theValString = theValString.replace('{', '(')
theValString = theValString.replace('}', ')')
if theValString == 'true':
theVal = True
elif theValString == 'false':
theVal = False
else:
theVal = eval(theValString)
if 0:
# Check type
print(f"beast command comment theValString = {theValString}")
if isinstance(theVal, (int, float, tuple, bool)):
n.__setattr__(theNameString, theVal)
else:
print(f"reading beastTreeCommandComment; cannot parse value {theValString} for {theNameString}")
n.__setattr__(theNameString, theVal)
i = j + 1
elif 0:
# branch colours, after the branch length, eg '[&!color=#8caaf4]'
if tok.startswith("[&!color="):
myNode = stack[-1]
theColour = tok[9:-1]
#print(theColour)
myNode.br.colour = theColour
else:
gm.append("I can't make sense of the token '%s'" % tok)
if len(tok) == 1:
if tok[0] in var.punctuation:
gm.append("The token is in var.punctuation. If you don't think it should")
gm.append("be, you can modify what p4 thinks that punctuation is.")
gm.append("So you might do this:")
gm.append("var.punctuation = var.phylip_punctuation")
gm.append("(or use your own definition -- see var.py)")
gm.append("read('yourWackyTreeFile.phy')")
gm.append("That might work.")
if tok[0] not in var.punctuation:
gm.append("The token is not in your current var.punctuation.")
#gm.append("tok[0] is '%s'" % tok[0])
raise P4Error(gm)
sTok = safeNextTok(flob, 'Tree init, reading tree string')
if sTok.startswith("'"):
isQuotedTok = True
else:
isQuotedTok = False
tok = p4.func.nexusUnquoteName(sTok)
#print('got tok for next round = %s' % tok)
# This is the end of the "while tok != ';':" loop
# print '\n*** Stack len = %i ***' % len(stack)
if parenNestLevel > 0:
gm.append('Unmatched paren.')
raise P4Error(gm)
elif parenNestLevel < 0:
gm.append('Unmatched unparen.')
raise P4Error(gm)
if len(stack) == 0:
if len(self.nodes) == 1:
pass
else:
gm.append(
"Got an oddly-placed ';' in the tree %s description." % self.name)
self.dump(treeInfo=0, nodeInfo=1)
raise P4Error(gm)
elif len(stack) > 1:
gm.append(
"Got an oddly-placed ';' in the tree %s description." % self.name)
# gm.append('(stack len = %i)' % len(stack)
#self.dump(tree=0, node=1)
raise P4Error(gm)
if self.root.leftChild and self.root.leftChild.sibling: # usually this
self.root.isLeaf = 0
else:
# The root is a leaf
assert self.root.isLeaf
if self.root.name:
if translationHash and self.root.name in translationHash:
self.root.name = translationHash[self.root.name]
# Should a root on a stick be a leaf? If it is just for
# display purposes, then it should be ok to not be a leaf.
# But if you are going to re-Root, then it will cause trouble.
# So by default, a root on a stick should be a leaf. I think.
# Hopefully if you are dealing with this, then you know what
# you are doing and what you want, and how to modify things to
# get it.
# Uncomment this next line to make it always non-leaf, even if it is a
# leaf.
# self.root.isLeaf = 0 # do this to always have a non-leaf
# root-on-a-stick <-- Potential Trouble!!!
self.root.br = None
# self.draw()
#self.dump(tree=0, node=1, treeModel=0)
if doModelComments:
# restore the value of var.nexus_getAllCommandComments, which was
# saved above.
var.nexus_getAllCommandComments = savedP4Nexus_getAllCommandComments
def _initFinish(self):
if self.name:
gm = ["Tree._initFinish() tree '%s'" % self.name]
else:
gm = ['Tree._initFinish()']
# Checking for duped taxon names used to be here, but it was
# pushed further ahead, so that self.taxNames can be corrected
# also, if need be. At this point, self does not have
# taxNames set.
# Check that all terminal nodes have names
for item in self.nodes:
if item.isLeaf:
# print 'leaf name %s' % item.name
if not item.name:
if item == self.root:
if var.warnAboutTerminalRootWithNoName:
print('Tree._initFinish()')
print(' Non-fatal warning: the root is terminal, but has no name.')
print(' This may be what you wanted. Or not?')
print(' (To get rid of this warning, turn off var.warnAboutTerminalRootWithNoName)')
else:
gm.append('Got a terminal node with no name.')
raise P4Error(gm)
self.preOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
self.postOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
if len(self.nodes) > 1:
self.setPreAndPostOrder()
[docs]
def checkDupedTaxonNames(self):
# Called by p4.func._tryToReadNexusFile() and p4.func._tryToReadPhylipFile()
# Check for duped names
if self.name:
gm = ["Tree.checkDupedTaxonNames() tree '%s'" % self.name]
else:
gm = ['Tree.checkDupedTaxonNames()']
if self.fName:
gm[0] += ' file=%s' % self.fName
hasDupedName = 0
loNames = []
for n in self.nodes:
if n.isLeaf and n.name:
loNames.append(n.name.lower())
for loName in loNames:
if loNames.count(loName) > 1:
if var.allowDupedTaxonNames:
pass
elif not var.doRepairDupedTaxonNames:
gm.append(
"Got duplicated taxon (lowercased) name '%s'." % loName)
gm.append(
'Since var.doRepairDupedTaxonNames is not turned on, p4 will not fix duplications.')
gm.append('To repair duplications verbosely, set ')
gm.append('var.doRepairDupedTaxonNames = 1')
gm.append('To repair duplications silently, set')
gm.append('var.doRepairDupedTaxonNames = 2')
raise P4Error(gm)
hasDupedName = 1
break
if hasDupedName:
# print self.name
if var.allowDupedTaxonNames:
# more hacking ...
if var.allowDupedTaxonNames == 2: # ie silently.
pass
else:
complainedAlready = []
for loName in loNames:
if loNames.count(loName) > 1 and loName not in complainedAlready:
if self.name:
print(" Tree %s. Duped tax name (lowercased) '%s'" % (
self.name, loName))
else:
print(" Duped tax name (lowercased) '%s'" % loName)
complainedAlready.append(loName)
elif var.doRepairDupedTaxonNames:
repairedNames = []
for loName in loNames:
if loNames.count(loName) > 1 and n not in repairedNames:
repairCounter = 1
repairCounter2 = 1
for n in self.nodes:
if n.isLeaf:
if n.name and n.name.lower() == loName:
newName = '%s_%i' % (n.name, repairCounter)
if var.doRepairDupedTaxonNames == 1:
if self.name:
print(" Tree %s. Changing '%s' to '%s'" % (
self.name, n.name, newName))
else:
print(" Changing '%s' to '%s'" % (n.name, newName))
n.name = newName
repairedNames.append(loName)
repairCounter += 1
if self.taxNames:
for tNameNum in range(len(self.taxNames)):
tName = self.taxNames[tNameNum]
if tName.lower() == loName:
newName = '%s_%i' % (tName, repairCounter2)
self.taxNames[tNameNum] = newName
repairCounter2 += 1
assert repairCounter == repairCounter2, "Got a problem with re-naming duped taxa."
##############
##############
#
# dump()
#
##############
##############
[docs]
def dump(self, tree=0, node=0, model=0, all=0):
"""Print rubbish about self.
tree
is the default, showing basic info about the tree.
node
shows info about all the nodes.
model
shows which model component number goes on which node.
(which you can also get by drawing the tree)
(If you want the info about the model itself, do a
aTree.model.dump() instead.)
"""
if all:
self._doTreeInfo()
self._doNodeInfo()
self._doNodeModelInfo()
elif not tree and not node and not model:
self._doTreeInfo()
else:
if tree:
self._doTreeInfo()
if node:
self._doNodeInfo()
if model:
self._doNodeModelInfo()
def _doTreeInfo(self):
if self.name:
print("Tree '%s' dump" % self.name)
else:
print('Tree dump. No name.')
if self.fName:
print(" From file '%s'" % self.fName)
else:
print(" From an unknown file, or no file.")
if self.root:
print(' Node %i is root' % self.root.nodeNum)
else:
print(' There is no root')
if self.recipWeight:
print(' The tree recipWeight is %s' % self.recipWeight)
else:
print(' There is no recipWeight')
print(' There are %i nodes' % len(self.nodes))
terminals = 0
for i in self.nodes:
if i.isLeaf:
terminals += 1
print(' of which %i are terminal nodes' % terminals)
if self.data:
print(' There is a data object, with %i parts.' % self.data.nParts)
else:
print(' There is no data object.')
if self.data:
print(' The data came from file(s):')
for a in self.data.alignments:
if a.fName:
print(' %s' % a.fName)
if self.model:
print(' There is a model object, with %i parts.' % self.model.nParts)
if self.model.cModel:
print(' model.cModel is %i' % self.model.cModel)
else:
print(' There is no cModel.')
else:
print(' There is no model object.')
if self.taxNames:
print(' There is a taxNames list.')
else:
print(' There is no taxNames list.')
if self.cTree:
print(' cTree is %i' % self.cTree)
else:
print(' There is no cTree.')
[docs]
def _doNodeInfo(self):
"""Basic rubbish about nodes."""
print('\n-------- nodes -----------------------------------------')
print('%7s %6s %6s %6s %6s %7s %6s %4s' % ('nodeNum', 'isLeaf', 'parent', 'leftCh',
'siblng', 'br.len', 'seqNum', 'name'))
for n in self.nodes:
print('%7s %6s' % (n.nodeNum, n.isLeaf), end=' ')
if n.parent:
print('%6s' % n.parent.nodeNum, end=' ')
else:
print('%6s' % 'None', end=' ')
if n.leftChild:
print('%6s' % n.leftChild.nodeNum, end=' ')
else:
print('%6s' % 'None', end=' ')
if n.sibling:
print('%6s' % n.sibling.nodeNum, end=' ')
else:
print('%6s' % 'None', end=' ')
if n.br and (n.br.len or n.br.len == 0.0):
print('%7.3f' % n.br.len, end=' ')
else:
print('%7s' % 'None', end=' ')
if n.seqNum or n.seqNum == 0:
print('%6s' % n.seqNum, end=' ')
else:
print('%6s' % 'None', end=' ')
if n.name:
print(' %s' % n.name)
else:
print(' %s' % 'None')
print('--------------------------------------------------------\n')
doMore = 0
for n in self.iterNodesNoRoot():
if hasattr(n.br, 'name') and n.br.name:
doMore = 1
break
elif hasattr(n.br, 'uName') and n.br.uName:
doMore = 1
break
elif hasattr(n.br, 'support') and n.br.support:
doMore = 1
break
if doMore:
print('\n-------- more node stuff -------------------------------')
print('%7s %10s %10s %10s %4s' % ('nodeNum', 'br.name', 'br.uName', 'br.support', 'name'))
for n in self.nodes:
print('%7s' % n.nodeNum, end=' ')
if n.br and hasattr(n.br, 'name') and n.br.name:
print('%10s' % n.br.name, end=' ')
else:
print('%10s' % '-', end=' ')
if n.br and hasattr(n.br, 'uName') and n.br.uName:
print('%10s' % n.br.uName, end=' ')
else:
print('%10s' % '-', end=' ')
if n.br and hasattr(n.br, 'support') and n.br.support:
print('%10.4f' % n.br.support, end=' ')
else:
print('%10s' % '-', end=' ')
if n.name:
print(' %s' % n.name)
else:
print(' %s' % 'None')
print('--------------------------------------------------------\n')
doMore = 0
for n in self.nodes:
if hasattr(n, 'rootCount') and n.rootCount:
doMore = 1
break
if n.br:
if hasattr(n.br, 'color') and n.br.color:
doMore = 1
break
elif hasattr(n.br, 'biRootCount') and n.br.biRootCount:
doMore = 1
break
if doMore:
print('\n-------- even more node stuff --------------------------')
print('%7s %10s %14s %10s %4s' % ('nodeNum', 'br.color', 'br.biRootCount', 'rootCount', 'name'))
for n in self.nodes:
print('%7s' % n.nodeNum, end=' ')
if n.br and hasattr(n.br, 'color') and n.br.color:
print('%10s' % n.br.color, end=' ')
else:
print('%10s' % '-', end=' ')
if n.br and hasattr(n.br, 'biRootCount') and n.br.biRootCount:
print('%14s' % n.br.biRootCount, end=' ')
else:
print('%14s' % '-', end=' ')
if hasattr(n, 'rootCount') and n.rootCount:
print('%10s' % n.rootCount, end=' ')
else:
print('%10s' % '-', end=' ')
if n.name:
print(' %s' % n.name)
else:
print(' %s' % 'None')
print('--------------------------------------------------------\n')
def _doNodeModelInfo(self):
if not self.model:
print('\n****** Node Model Info. No model.')
if not self.data:
print('(no data attached, either)')
else:
print('\n****** Node Model Info. nParts=%s' % self.model.nParts)
if not self.data:
print('no data')
if not self.model.nParts:
return
print('\nComps in the nodes:')
print(' %13s' % 'nodeNum', end=' ')
for i in range(self.model.nParts):
print(' %8s' % 'part%i' % i, end=' ')
print('')
for n in self.nodes:
print(' %13i' % n.nodeNum, end=' ')
for i in range(self.model.nParts):
print('%8i' % n.parts[i].compNum, end=' ')
print('')
print('\nrMatrices in the nodes:')
print(' %13s' % 'nodeNum', end=' ')
for i in range(self.model.nParts):
print(' %8s' % 'part%i' % i, end=' ')
print('')
for n in self.iterNodesNoRoot():
print(' %13i' % n.nodeNum, end=' ')
for i in range(self.model.nParts):
print('%8i' % n.br.parts[i].rMatrixNum, end=' ')
print('')
print('\ngdasrvs in the nodes:')
print(' %13s' % '', end=' ')
for i in range(self.model.nParts):
print(' %8s' % 'part%i' % i, end=' ')
print('')
print(' %13s' % 'nGammaCats ->', end=' ')
for i in range(self.model.nParts):
print('%8i' % self.model.parts[i].nGammaCat, end=' ')
print('\n')
print(' %13s' % 'nodeNum', end=' ')
for i in range(self.model.nParts):
print(' %8s' % 'part%i' % i, end=' ')
print('')
for n in self.iterNodesNoRoot():
print(' %13i' % n.nodeNum, end=' ')
for i in range(self.model.nParts):
print('%8i' % n.br.parts[i].gdasrvNum, end=' ')
print('')
###########################
#
# Set a NexusSets object, for taxSets ...
#
###########################
[docs]
def setNexusSets(self):
"""Set self.nexusSets from var.nexusSets.
A deepcopy is made of var.nexusSets, only if it exists. If
var.nexusSets does not yet exist, a new blank one is not made
(cf this method in Alignment class, where it would be
made).
Important! This method depends on a correct taxNames.
"""
assert self.taxNames, "This method requires correct taxNames, in the correct order."
gm = ["Tree.setNexusSets()"]
if not var.nexusSets:
return
self.nexusSets = copy.deepcopy(var.nexusSets)
self.nexusSets.taxNames = self.taxNames
self.nexusSets.nTax = self.nTax
self.nexusSets.constant = None
self.nexusSets.gapped = None
self.nexusSets.charSets = []
self.nexusSets.charPartitions = []
if self.nexusSets.taxSets:
# print "%s. There are %i taxSets." % (gm[0], len(self.nexusSets.taxSets))
# Check that no taxSet name is a taxName
lowSelfTaxNames = [txName.lower()
for txName in self.taxNames]
for ts in self.nexusSets.taxSets:
if ts.lowName in lowSelfTaxNames:
gm.append(
"Can't have taxSet names that are the same (case-insensitive) as a tax name")
gm.append(
"Lowercased taxSet name '%s' is the same as a lowcased taxName." % ts.name)
raise P4Error(gm)
self.nexusSets.lowTaxNames = lowSelfTaxNames
# If it is standard format,
# convert triplets to numberTriplets, and then mask
for ts in self.nexusSets.taxSets:
if ts.format == 'standard':
ts.setNumberTriplets()
ts.setMask()
elif ts.format == 'vector':
assert ts.mask
if len(ts.mask) != self.nTax:
gm.append("taxSet %s" % ts.name)
gm.append(
"It is vector format, but the length is wrong.")
gm.append(
"taxSet mask is length %i, but self nTax is %i" % (len(ts.mask), self.nTax))
raise P4Error(gm)
else:
gm.append("taxSet %s" % ts.name)
gm.append("unknown format %s" % ts.format)
raise P4Error(gm)
# Now set ts.taxNames from the mask
ts.taxNames = [self.taxNames[i] for i,c in enumerate(ts.mask) if c == '1']
###########################
#
# Get lists of nodeNums ...
#
###########################
[docs]
def getNodeNumsAbove(self, nodeSpecifier, leavesOnly=0):
"""Gets a list of nodeNums, in postOrder, above nodeSpecifier.
The node specified is not included.
"""
x, y = self.getPreAndPostOrderAbove(nodeSpecifier)
if leavesOnly:
tOnly = []
for i in y[:-1]:
if self.nodes[i].isLeaf:
tOnly.append(i)
return tOnly
return y[:-1]
[docs]
def getSeqNumsAbove(self, nodeSpecifier):
"""Gets a list of seqNums above nodeSpecifier."""
x, y = self.getPreAndPostOrderAbove(nodeSpecifier)
seqNums = []
for nNum in y[:-1]:
n = self.nodes[nNum]
if n.isLeaf:
seqNums.append(n.seqNum)
return seqNums
# def getAllChildrenNums(self, specifier):
# """Returns a list of the nodeNums of all children of the specified node
# Ambiguous, unused.
# """
# theNode = self.node(specifier)
# if not theNode:
# gm = ['Tree.getChildrenNums()']
# gm.append('Bad node specifier')
# raise P4Error(gm)
# ret = []
# x, y = self.getPreAndPostOrderAbove(specifier)
# for i in y[:-1]:
# ret.append(self.nodes[i].nodeNum)
# return ret
[docs]
def getAllLeafNames(self, specifier):
"""Returns a list of the leaf names of all children
"""
theNode = self.node(specifier)
if not theNode:
gm = ['Tree.getChildrenNums()']
gm.append('Bad node specifier')
raise P4Error(gm)
ret = []
x, y = self.getPreAndPostOrderAbove(specifier)
for i in y[:-1]:
if self.nodes[i].isLeaf:
ret.append(self.nodes[i].name)
return ret
[docs]
def getChildrenNums(self, specifier):
"""Returns a list of nodeNums of children of the specified node.
See also Node.getNChildren()"""
theNode = self.node(specifier)
if not theNode:
gm = ['Tree.getChildrenNums()']
gm.append('Bad node specifier')
raise P4Error(gm)
ret = []
c = theNode.leftChild
while 1:
if c:
ret.append(c.nodeNum)
c = c.sibling
else:
return ret
[docs]
def setPreAndPostOrder(self):
"""Sets or re-sets self.preOrder and self.postOrder lists of node numbers.
PreOrder starts from the root and goes to the tips; postOrder
starts from the tips and goes to the root."""
self.getPreAndPostOrderAboveRoot()
self.preAndPostOrderAreValid = 1
[docs]
def getPreAndPostOrderAbove(self, nodeSpecifier):
"""Returns 2 lists of node numbers, preOrder and postOrder.
This uses a stack, not recursion, so it should work for large
trees without bumping into the recursion limit. The 2 lists
are relative to the node specified, and include the node
specified. PreOrder starts from theNode and goes to the tips;
postOrder starts from the tips and goes to theNode."""
gm = ['Tree.getPreAndPostOrderAbove()']
theNode = self.node(nodeSpecifier)
preOrder = [] # nodeNum's
postOrder = []
if not theNode.leftChild:
preOrder.append(theNode.nodeNum)
postOrder.append(theNode.nodeNum)
return preOrder, postOrder
stack = [] # nodes
stack.append(theNode)
preOrder.append(theNode.nodeNum)
while len(stack):
if stack[-1].leftChild:
# print 'leftChild: %i' % stack[-1].leftChild.nodeNum
theNodeNum = stack[-1].leftChild.nodeNum
stack.append(stack[-1].leftChild)
preOrder.append(theNodeNum)
elif stack[-1].sibling:
# print 'sibling: %i' % stack[-1].sibling.nodeNum
theNodeNum = stack[-1].sibling.nodeNum
theSib = stack[-1].sibling
# print ' postOrder appending u %i' %
# stack[-1].nodeNum
postOrder.append(stack[-1].nodeNum)
stack.pop()
stack.append(theSib)
preOrder.append(theNodeNum)
else:
# print ' postOrder appending v %i' %
# stack[-1].nodeNum
postOrder.append(stack[-1].nodeNum)
stack.pop()
if len(stack) == 0:
break
# print ' postOrder appending w %i' %
# stack[-1].nodeNum
postOrder.append(stack[-1].nodeNum)
theNode = stack.pop()
while not theNode.sibling:
if len(stack) == 0:
break
# print ' postOrder appending x %i' %
# stack[-1].nodeNum
postOrder.append(stack[-1].nodeNum)
theNode = stack.pop()
if len(stack) == 0:
break
if theNode.sibling:
stack.append(theNode.sibling)
preOrder.append(theNode.sibling.nodeNum)
else:
gm.append('Problemo.')
gm.append('xxx got theNode %s' % theNode.nodeNum)
raise P4Error(gm)
return preOrder, postOrder
[docs]
def getPreAndPostOrderAboveRoot(self):
"""Sets self.preOrder and self.postOrder.
This uses a stack, not recursion, so it should work for large
trees without bumping into the recursion limit. PreOrder
starts from the root and goes to the tips; postOrder starts
from the tips and goes to the root."""
gm = ['Tree.getPreAndPostOrderAboveRoot()']
theNode = self.root
preOrdIndx = 0
postOrdIndx = 0
if self.preOrder is None or self.postOrder is None or len(self.preOrder) != len(self.nodes) or len(self.postOrder) != len(self.nodes):
self.preOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
self.postOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
# print "xx self.preOrder=%s" % self.preOrder
if not theNode.leftChild:
self.preOrder[preOrdIndx] = theNode.nodeNum
preOrdIndx += 1
self.postOrder[postOrdIndx] = theNode.nodeNum
postOrdIndx += 1
else:
stack = [] # nodes
stack.append(theNode)
self.preOrder[preOrdIndx] = theNode.nodeNum
preOrdIndx += 1
while len(stack):
if stack[-1].leftChild:
# print 'leftChild: %i (%s)' %
# (stack[-1].leftChild.nodeNum, [n.nodeNum for n in stack])
theNodeNum = stack[-1].leftChild.nodeNum
stack.append(stack[-1].leftChild)
self.preOrder[preOrdIndx] = theNodeNum
preOrdIndx += 1
elif stack[-1].sibling:
# print 'sibling: %i (%s)' % (stack[-1].sibling.nodeNum,
# [n.nodeNum for n in stack])
theNodeNum = stack[-1].sibling.nodeNum
theSib = stack[-1].sibling
# print ' postOrder appending u %i' %
# stack[-1].nodeNum
self.postOrder[postOrdIndx] = stack[-1].nodeNum
postOrdIndx += 1
stack.pop()
stack.append(theSib)
try:
self.preOrder[preOrdIndx] = theNodeNum
except IndexError:
gm.append("preOrdIndx=%s, theNodeNum=%i" %
(preOrdIndx, theNodeNum))
gm.append("preOrder = %s" % self.preOrder)
raise P4Error(gm)
preOrdIndx += 1
else:
# print ' postOrder appending v %i' %
# stack[-1].nodeNum
self.postOrder[postOrdIndx] = stack[-1].nodeNum
postOrdIndx += 1
stack.pop()
if len(stack) == 0:
break
# print ' postOrder appending w %i' %
# stack[-1].nodeNum
self.postOrder[postOrdIndx] = stack[-1].nodeNum
postOrdIndx += 1
theNode = stack.pop()
while not theNode.sibling:
if len(stack) == 0:
break
# print ' postOrder appending x %i' %
# stack[-1].nodeNum
self.postOrder[postOrdIndx] = stack[-1].nodeNum
postOrdIndx += 1
theNode = stack.pop()
if len(stack) == 0:
break
if theNode.sibling:
stack.append(theNode.sibling)
# print "self.preOrder = %s, preOrdIndx=%i" %
# (self.preOrder, preOrdIndx)
self.preOrder[preOrdIndx] = theNode.sibling.nodeNum
preOrdIndx += 1
else:
gm.append('Problemo.')
gm.append('xxx got theNode %s' % theNode.nodeNum)
raise P4Error(gm)
if 1:
assert preOrdIndx == postOrdIndx
# print "a preOrdIndx = %i, len(self.nodes) = %i" % (preOrdIndx,
# len(self.nodes))
if preOrdIndx != len(self.nodes):
pOI = preOrdIndx
for i in range(preOrdIndx, len(self.nodes)):
self.preOrder[i] = var.NO_ORDER
self.postOrder[i] = var.NO_ORDER
preOrdIndx += 1
postOrdIndx += 1
# print "b preOrdIndx = %i, len(self.nodes) = %i" % (preOrdIndx,
# len(self.nodes))
assert preOrdIndx == len(self.nodes) and postOrdIndx == len(self.nodes)
[docs]
def iterPreOrder(self):
"""Node generator. Assumes preAndPostOrderAreValid."""
for i in self.preOrder:
j = int(i) # this speeds things up a lot! Two-fold!
if j != var.NO_ORDER:
# yield self.nodes[int(i)]
yield self.nodes[j]
[docs]
def iterPostOrder(self):
"""Node generator. Assumes preAndPostOrderAreValid."""
for i in self.postOrder:
j = int(i)
if j != var.NO_ORDER:
yield self.nodes[j]
[docs]
def iterNodes(self):
"""Node generator, in preOrder. Assumes preAndPostOrderAreValid."""
for i in self.preOrder:
j = int(i)
if j != var.NO_ORDER:
yield self.nodes[j]
[docs]
def iterNodesNoRoot(self):
"""Node generator, skipping the root. PreOrder."""
for i in self.preOrder:
j = int(i)
if j != var.NO_ORDER:
n = self.nodes[j]
if n != self.root:
yield n
[docs]
def iterLeavesNoRoot(self):
"""Leaf node generator, skipping the root. PreOrder."""
for i in self.preOrder:
j = int(i)
if j != var.NO_ORDER:
n = self.nodes[j]
if n != self.root and n.isLeaf:
yield n
[docs]
def iterLeavesPreOrder(self):
for i in self.preOrder:
j = int(i)
if j != var.NO_ORDER:
n = self.nodes[j]
if n.isLeaf:
yield n
[docs]
def iterLeavesPostOrder(self):
for i in self.postOrder:
j = int(i)
if j != var.NO_ORDER:
n = self.nodes[j]
if n.isLeaf:
yield n
[docs]
def iterInternalsNoRootPreOrder(self):
"""Internal post order node generator, skipping the root. Assumes preAndPostOrderAreValid."""
for i in self.preOrder:
j = int(i)
if j != var.NO_ORDER:
n = self.nodes[j]
if n != self.root and not n.isLeaf:
yield n
[docs]
def iterInternalsNoRootPostOrder(self):
"""Internal post order node generator, skipping the root. Assumes preAndPostOrderAreValid."""
for i in self.postOrder:
j = int(i)
if j != var.NO_ORDER:
n = self.nodes[j]
if n != self.root and not n.isLeaf:
yield n
[docs]
def iterInternalsPostOrder(self):
"""Internal post order node generator. Assumes preAndPostOrderAreValid."""
for i in self.postOrder:
j = int(i)
if j != var.NO_ORDER:
n = self.nodes[j]
if not n.isLeaf:
yield n
[docs]
def iterInternalsNoRoot(self):
"""Internal node generator, skipping the root. PreOrder"""
for i in self.preOrder:
j = int(i)
if j != var.NO_ORDER:
n = self.nodes[j]
if n != self.root and not n.isLeaf:
yield n
[docs]
def iterInternals(self):
"""Internal node generator. PreOrder. Including the root, if it is internal."""
for i in self.preOrder:
j = int(i)
if j != var.NO_ORDER:
n = self.nodes[j]
if not n.isLeaf:
yield n
################################################
#
[docs]
def stripBrLens(self):
"""Sets all node.br.len's to 0.1, the default in p4.
Then, if you were to write it out in Nexus or Newick format,
no branch lengths would be printed.
"""
for n in self.iterNodesNoRoot():
n.br.len = 0.1
[docs]
def getLen(self):
"""Return the sum of all br.len's."""
if self.cTree:
# About 0.01 msec for a tree with 1000 leaves
return pf.p4_getTreeLen(self.cTree)
else:
# About 2 msec for a tree with 1000 leaves.
total = 0.0
for n in self.iterNodesNoRoot():
total += n.br.len
return total
# def lenInternals(self):
# """Return the sum of all internal br.len's."""
# total = 0.0
# for n in self.iterInternalsNoRoot():
# total += n.br.len
# return total
# def stemminess(self):
# """Ratio of internal branches to overall tree length.
# Also called 'treeness'. Via Phillips and Penny, MPE 2003, but
# they cite Lanyon 1988."""
# total = self.len()
# internals = self.lenInternals()
# return internals/total
[docs]
def _makeRCSplitKeys(self, splitList=None):
"""Make long integer-valued split keys.
This is dependent on that the leaf bitkeys are already set, ie this method should only
be used by the reduced consensus supertree class
"""
if not self.preAndPostOrderAreValid:
self.setPreAndPostOrder()
# self.draw()
# allOnes = 2**(self.nTax) - 1
# print 'nTax = %i, allOnes = %i' % (self.nTax, allOnes)
# for n in self.iterInternalsPostOrder():
for n in self.iterInternalsNoRootPostOrder():
# if n != self.root:
# print 'doing node %s' % n.nodeNum
# if not n.isLeaf():
if n.leftChild:
childrenNums = self.getChildrenNums(n)
# x = childrenNums[0]
x = self.nodes[childrenNums[0]].br.rc
for i in childrenNums[1:]:
x = x | self.nodes[i].br.rc
# x = x | i
n.br.rc = x
if splitList != None:
splitList.append([x, 0])
n.br.rcList = [n.br.rc]
[docs]
def makeSplitKeys(self, makeNodeForSplitKeyDict=True):
"""Make long integer-valued split keys.
This needs to have self.taxNames set.
We make 2 kinds of split keys-- rawSplitKeys and splitKeys.
Both are attributes of node.br, so we have eg node.br.splitKey.
Raw split keys for terminal nodes are 2**n, where n is the index
of the taxon name. Eg for the first taxon, the rawSplitKey will
be 1, for the 3rd taxon the rawSplitKey will be 4.
RawSplitKeys for internal nodes are the rawSplitKey's for the
children, bitwise OR'ed together.
SplitKeys, cf rawSplitKeys, are in 'standard form', where the
numbers are even, ie do not contain the 1-bit. Having it in
standard form means that you can compare splits among trees.
If the rawSplitKey is even, then the splitKey is simply that,
unchanged. If, however, the rawSplitKey is odd, then the
splitKey is the rawSplitKey bit-flipped. For example, if
there are 5 taxa, and one of the rawSplitKeys is 9 (odd), we
can calculate the splitKey by bit-flipping, as::
01001 = 9 rawSplitKey
10110 = 22 splitKey
(Bit-flipping is done by exclusive-or'ing (xor) with 11111.)
The splitKey is readily converted to a splitString for
display, as 22 becomes '.**.*' (note the '1' bit is now on the
left). It is conventional that the first taxon, on the left,
is always a dot. (I don't know where the convention comes
from.)
The root has no rawSplitKey or splitKey.
For example, the tree::
+-------2:B (rawSplitKey = 2)
+---1
| +---------3:C (rawSplitKey = 4)
|
0-------------4:E (rawSplitKey = 16)
|
| +-----6:A (rawSplitKey = 1)
+----5
+-----------7:D (rawSplitKey = 8)
has 2 internal splits, on nodes 1 and 5.
::
Node n.br.rawSplitKey n.br.splitKey
1 6 6
5 9 22
There should be no duplicated rawSplitKeys, but if the tree
has a bifurcating root then there will be a duped splitKey.
This method will fail for trees with internal nodes that have
only one child, because that will make duplicated splits.
If arg *makeNodeForSplitKeyDict* is set, then it will make a
dictionary ``nodeForSplitKeyDict`` where the keys are the
splitKeys and the values are the corresponding nodes.
"""
gm = ['Tree.makeSplitKeys()']
#raise P4Error(gm)
if not self.taxNames:
gm.append('No taxNames.')
raise P4Error(gm)
if not self.preAndPostOrderAreValid:
self.setPreAndPostOrder()
# self.draw()
if makeNodeForSplitKeyDict:
self.nodeForSplitKeyDict = {}
allOnes = 2 ** (self.nTax) - 1
# print 'nTax = %i, allOnes = %i' % (self.nTax, allOnes)
for n in self.iterPostOrder():
if n != self.root:
# print 'doing node %s' % n.nodeNum
if not n.leftChild:
try:
n.br.rawSplitKey = 1 << self.taxNames.index(
n.name) # "<<" is left-shift
except ValueError:
gm.append('node.name %s' % n.name)
gm.append('is not in taxNames %s' % self.taxNames)
raise P4Error(gm)
# n.br.rawSplitKey = 1 << self.taxNames.index(n.name) #
# "<<" is left-shift
if n.br.rawSplitKey == 1:
n.br.splitKey = allOnes - 1
else:
n.br.splitKey = n.br.rawSplitKey
# print 'upward leaf node %s, rawSplitKey %s, splitKey
# %s' % (n.nodeNum, n.br.rawSplitKey, n.br.splitKey)
else:
childrenNums = self.getChildrenNums(n)
if len(childrenNums) == 1:
gm.append(
'Internal node has only one child. That will make a duped split.')
raise P4Error(gm)
x = self.nodes[childrenNums[0]].br.rawSplitKey
for i in childrenNums[1:]:
y = self.nodes[i].br.rawSplitKey
x = x | y # '|' is bitwise "OR".
n.br.rawSplitKey = x
# Make node.br.splitKey's in "standard form", ie
# without the first taxon, ie without a 1. To do that
# we use the '&' operator, to bitwise "and" with 1.
# Ie "Does rawSplitKey contain a 1?" or "Is rawSplitKey
# odd?"
if 1 & n.br.rawSplitKey:
# "^" is xor, a bit-flipper.
n.br.splitKey = allOnes ^ n.br.rawSplitKey
else:
n.br.splitKey = n.br.rawSplitKey
# print 'intern node %s, rawSplitKey %s, splitKey %s' %
# (n.nodeNum, n.br.rawSplitKey, n.br.splitKey)
if makeNodeForSplitKeyDict:
self.nodeForSplitKeyDict[n.br.splitKey] = n
if 0:
# There should be no duped rawSplitKeys
theKeys = []
for n in self.iterNodesNoRoot():
theKeys.append(n.br.rawSplitKey)
for k in theKeys:
if theKeys.count(k) > 1:
gm.append('Duped rawSplitKey %i.' % k)
for n in self.nodes:
if n != self.root:
print('%7s %4s %4s' % (n.nodeNum, n.br.rawSplitKey, n.br.splitKey))
raise P4Error(gm)
# Any duped splitKeys? There will be if the tree is bi-Rooted.
if 0:
theKeys = []
for n in self.iterNodesNoRoot():
theKeys.append(n.br.splitKey)
for k in theKeys:
if theKeys.count(k) > 1:
gm.append('Duped splitKey %i.' % k)
for n in self.iterNodesNoRoot():
print('%7s %4s %4s' % (n.nodeNum, n.br.rawSplitKey, n.br.splitKey))
raise P4Error(gm)
if 0:
print(gm[0])
print(self.taxNames)
print(f"root is node {self.root.nodeNum}")
print('nodeNum rawSplitKey splitKey')
# getSplitsFromKey would usually use self.nTax.
# Changed below to accommodate tempRoot for bi-rooted trees.
for n in self.iterNodesNoRoot():
print('%7s %4s %4s %s' % (
n.nodeNum, n.br.rawSplitKey, n.br.splitKey,
p4.func.getSplitStringFromKey(n.br.splitKey, len(self.taxNames))))
[docs]
def recalculateSplitKeysOfNodeFromChildren(self, aNode, allOnes):
children = [n for n in aNode.iterChildren()]
x = children[0].br.rawSplitKey
for n in children[1:]:
x = x | n.br.rawSplitKey # '|' is bitwise "OR".
aNode.br.rawSplitKey = x
# Ie "Does rawSplitKey contain a 1?" or "Is rawSplitKey odd?"
if 1 & aNode.br.rawSplitKey:
# "^" is xor, a bit-flipper.
aNode.br.splitKey = allOnes ^ aNode.br.rawSplitKey
else:
aNode.br.splitKey = aNode.br.rawSplitKey
[docs]
def checkSplitKeys(self, useOldName=False, glitch=True, verbose=True):
gm = ['Tree.checkSplitKeys()']
allOnes = 2 ** (self.nTax) - 1
# print 'nTax = %i, allOnes = %i' % (self.nTax, allOnes)
isBad = False
for n in self.iterPostOrder():
if n != self.root:
# print 'doing node %s' % n.nodeNum
if not n.leftChild:
# A long int, eg 1, has no upper limit on its value
try:
if useOldName:
rawSplitKey = 1 << self.taxNames.index(
n.oldName) # "<<" is left-shift
else:
rawSplitKey = 1 << self.taxNames.index(
n.name) # "<<" is left-shift
except ValueError:
if useOldName:
gm.append('node.name %s' % n.oldName)
else:
gm.append('node.name %s' % n.name)
gm.append('is not in taxNames %s' % self.taxNames)
raise P4Error(gm)
# n.br.rawSplitKey = 1 << self.taxNames.index(n.name) #
# "<<" is left-shift
if rawSplitKey == 1:
splitKey = allOnes - 1
else:
splitKey = rawSplitKey
# print 'upward leaf node %s, rawSplitKey %s, splitKey
# %s' % (n.nodeNum, n.br.rawSplitKey, n.br.splitKey)
else:
childrenNums = self.getChildrenNums(n)
if len(childrenNums) == 1:
gm.append(
'Internal node has only one child. That will make a duped split.')
raise P4Error(gm)
x = self.nodes[childrenNums[0]].br.rawSplitKey
for i in childrenNums[1:]:
y = self.nodes[i].br.rawSplitKey
x = x | y # '|' is bitwise "OR".
rawSplitKey = x
# Make node.br.splitKey's in "standard form", ie
# without the first taxon, ie without a 1. To do that
# we use the '&' operator, to bitwise "and" with 1.
# Ie "Does rawSplitKey contain a 1?" or "Is rawSplitKey
# odd?"
if 1 & rawSplitKey:
# "^" is xor, a bit-flipper.
splitKey = allOnes ^ rawSplitKey
else:
splitKey = rawSplitKey
# print 'intern node %s, rawSplitKey %s, splitKey %s' %
# (n.nodeNum, n.br.rawSplitKey, n.br.splitKey)
if n.br.rawSplitKey != rawSplitKey:
print("checkSplitKeys node %2i rawSplitKey: existing %s, calculated %s" % (n.nodeNum, n.br.rawSplitKey, rawSplitKey))
isBad = True
if n.br.splitKey != splitKey:
print("checkSplitKeys node %2i splitKey: existing %s, calculated %s" % (n.nodeNum, n.br.splitKey, splitKey))
isBad = True
if glitch and isBad:
raise P4Error(gm)
if verbose and not isBad:
print("checkSplitKeys(). ok")
[docs]
def taxSetIsASplit(self, taxSetName):
"""Asks whether a nexus taxset is a split in the tree.
Args:
taxSetName (str): The name of the taxset. Case does not matter.
Returns:
Node: the node in self if the taxset is a split, or else None
"""
gm = ["Tree.taxSetIsASplit(%s)" % taxSetName]
assert self.nexusSets
assert self.taxNames
assert self.nexusSets.taxSets
lowArgTaxSetName = taxSetName.lower()
theTS = None
for ts in self.nexusSets.taxSets:
if ts.lowName == lowArgTaxSetName:
theTS = ts
break
assert theTS, "Could not find the taxSet named %s" % taxSetName
# theTS.dump()
assert theTS.mask
# Make sure that self splitKey has been set. Check the first internal node.
needsDoing = False
for n in self.iterInternalsNoRoot():
if not n.br.splitKey:
needsDoing = True
break
if needsDoing:
self.makeSplitKeys()
rawSplitKey = 0
for i in range(len(theTS.mask)):
# print i, theTS.mask[i]
if theTS.mask[i] == '1':
rawSplitKey += (1 << i)
# Ie "Does rawSplitKey contain a 1?" or "Is rawSplitKey odd?"
if 1 & rawSplitKey:
allOnes = 2 ** (self.nTax) - 1
splitKey = allOnes ^ rawSplitKey # "^" is xor, a bit-flipper.
else:
splitKey = rawSplitKey
# print "got splitKey %s" % splitKey
for n in self.nodes:
if n.br and not n.isLeaf:
assert n.br.splitKey, "Needs node.br.splitKey on all internal nodes."
# print " %2i %s %s" % (n.nodeNum, n.br.splitKey, splitKey)
if n.br.splitKey == splitKey:
# self.draw()
# return n.nodeNum
return n
# Was -1 before, when n.nodeNum was returned for hits. Now a node is
# returned.
return None
[docs]
def checkTaxNames(self):
"""Check that all taxNames are in the tree, and vice versa."""
# If var.allowTreesWithDifferingTaxonSets is set to True we will not check
# the taxnames. This is primarily used to read trees for supertree and
# leafstability calcuations.
# Peter comments that Tobias added this, but it is not needed,
# and messes other things up -- so comment out until it is
# sorted.
# if var.allowTreesWithDifferingTaxonSets:
# return
# self.taxNames is a property, so when it is set, it calls this method
if self.name:
gm = ['Tree.checkTaxNames() tree %s' % self.name]
else:
gm = ['Tree.checkTaxNames()']
if self.fName:
gm.append("From file '%s' " % self.fName)
if not self.taxNames:
gm.append('No taxNames.')
raise P4Error(gm)
tax = []
for n in self.iterNodes():
if n.isLeaf and n.name:
tax.append(n.name)
# This next line should not be needed, as root leaves should be
# leaves.
# terminal root that has a taxName
elif n == self.root and n.name and n.getNChildren() < 2:
tax.append(n.name)
# print 'tax from tree = %s' % tax
# print 'self.taxNames = %s' % self.taxNames
# Check for same number of taxa
if len(tax) != len(self.taxNames):
# self.draw()
# self.dump(node=1)
gm.append('Mismatch. Length of self.taxNames is wrong.')
gm.append('The tree has %i leaves with names, but len(self.taxNames) = %i' % (
len(tax), len(self.taxNames)))
gm.append('leaves on the tree = %s' % tax)
gm.append('self.taxNames = %s' % self.taxNames)
gm.append('symmetric_difference is %s' %
set(tax).symmetric_difference(set(self.taxNames)))
# print ' Setting invalid taxNames to an empty list.'
#self.taxNames = []
raise P4Error(gm, 'tree_badTaxNamesLength')
# Check for mis-matched taxNames
isBad = 0
taxSet = set(tax)
selfTaxNamesSet = set(self.taxNames)
s = taxSet.difference(selfTaxNamesSet)
if len(s):
isBad = 1
print(gm[0])
print('TaxName mismatch between the tree and self.taxNames.')
print('These taxa are found in the tree but not in self.taxNames:')
print(s)
s = selfTaxNamesSet.difference(taxSet)
if len(s):
isBad = 1
print(gm[0])
print('TaxName mismatch between the tree and self.taxNames.')
print('These taxa are found in the self.taxNames but not in the tree:')
print(s)
if isBad:
raise P4Error(gm, 'tree_taxNamesMisMatch')
################################################
# Copy and Verify
[docs]
def copyToTree(self, otherTree):
gm = ['Tree.copyToTree()']
if len(self.nodes) != len(otherTree.nodes):
gm.append('Different number of nodes.')
raise P4Error(gm)
# node relations (parent, child, sib)
for nNum in range(len(self.nodes)):
selfNode = self.nodes[nNum]
otherNode = otherTree.nodes[nNum]
# parent
if selfNode.parent:
otherNode.parent = otherTree.nodes[selfNode.parent.nodeNum]
else:
# print "otherNode.parent", otherNode.parent
otherNode.parent = None
# leftChild
if selfNode.leftChild:
otherNode.leftChild = otherTree.nodes[
selfNode.leftChild.nodeNum]
else:
# print "otherNode.leftChild", otherNode.leftChild
otherNode.leftChild = None
# sibling
if selfNode.sibling:
otherNode.sibling = otherTree.nodes[selfNode.sibling.nodeNum]
else:
# print "otherNode.sibling", otherNode.sibling
otherNode.sibling = None
# root
otherTree.root.br = otherTree.nodes[self.root.nodeNum].br
otherTree.nodes[self.root.nodeNum].br = None
otherTree.root = otherTree.nodes[self.root.nodeNum]
# brLens and splitKeys
for nNum in range(len(self.nodes)):
if self.nodes[nNum] != self.root:
otherTree.nodes[nNum].br.len = self.nodes[nNum].br.len
otherTree.nodes[nNum].br.lenChanged = self.nodes[nNum].br.lenChanged
otherTree.nodes[nNum].flag = self.nodes[nNum].flag
otherTree.nodes[nNum].br.splitKey = self.nodes[nNum].br.splitKey
otherTree.nodes[nNum].br.rawSplitKey = self.nodes[nNum].br.rawSplitKey
if 0:
print("Tree.copyToTree()")
for nNum in range(len(self.nodes)):
if self.nodes[nNum] != self.root:
print(f"{nNum:3} {self.nodes[nNum].br.splitKey} {otherTree.nodes[nNum].br.splitKey}")
else:
print(f"{nNum:3} is root")
# model usage numbers
if self.model:
for nNum in range(len(self.nodes)):
selfNode = self.nodes[nNum]
otherNode = otherTree.nodes[nNum]
if selfNode.parts and otherNode.parts:
for pNum in range(self.model.nParts):
otherNode.parts[pNum].compNum = selfNode.parts[pNum].compNum
if selfNode != self.root:
if selfNode.br.parts and otherNode.br.parts:
otherNode.br.parts[pNum].rMatrixNum = selfNode.br.parts[pNum].rMatrixNum
otherNode.br.parts[pNum].gdasrvNum = selfNode.br.parts[pNum].gdasrvNum
# pre- and postOrder
for i in range(len(self.preOrder)):
otherTree.preOrder[i] = self.preOrder[i]
otherTree.postOrder[i] = self.postOrder[i]
# partLikes
if self.model:
for pNum in range(self.model.nParts):
otherTree.partLikes[pNum] = self.partLikes[pNum]
otherTree._nInternalNodes = self._nInternalNodes
[docs]
def verifyIdentityWith(self, otherTree, doSplitKeys=False, doMore=False):
"""For MCMC debugging. Verifies that two trees are identical."""
complaintHead = '\nTree.verifyIdentityWith()' # keep
if len(self.nodes) != len(otherTree.nodes):
print(complaintHead)
print(' Different number of nodes.')
return var.DIFFERENT
# check node relations (parent, child, sib)
isBad = 0
for nNum in range(len(self.nodes)):
selfNode = self.nodes[nNum]
otherNode = otherTree.nodes[nNum]
# parent
if selfNode.parent:
if otherNode.parent:
if otherNode.parent.nodeNum != selfNode.parent.nodeNum:
isBad = 1
else:
isBad = 1
else:
if otherNode.parent:
isBad = 1
# leftChild
if selfNode.leftChild:
if otherNode.leftChild:
if otherNode.leftChild.nodeNum != selfNode.leftChild.nodeNum:
isBad = 1
else:
isBad = 1
else:
if otherNode.leftChild:
isBad = 1
# sibling
if selfNode.sibling:
if otherNode.sibling:
if otherNode.sibling.nodeNum != selfNode.sibling.nodeNum:
isBad = 1
else:
isBad = 1
else:
if otherNode.sibling:
isBad = 1
if isBad:
print(complaintHead)
print(' Node %i, relations differ.' % nNum)
self.write()
otherTree.write()
return var.DIFFERENT
if self.root.nodeNum != otherTree.root.nodeNum:
print(complaintHead)
print(' Roots differ.')
return var.DIFFERENT
# brLens, lenChanged, and node.flag. and splitKeys
if 0:
print("Tree.verifyIdentityWith() flag")
for nNum in range(len(self.nodes)):
if self.nodes[nNum] != self.root:
print(f"{nNum:3} {self.nodes[nNum].flag} {otherTree.nodes[nNum].flag}")
else:
print(f"{nNum:3} is root")
for nNum in range(len(self.nodes)):
if self.nodes[nNum] != self.root:
# if self.nodes[nNum].br.len != otherTree.nodes[nNum].br.len:
if math.fabs(self.nodes[nNum].br.len - otherTree.nodes[nNum].br.len) > 1.e-8:
print(complaintHead)
print(' BrLens differ.')
return var.DIFFERENT
if self.nodes[nNum].br.lenChanged != otherTree.nodes[nNum].br.lenChanged:
print(complaintHead)
print(' br.lenChanged differs.')
return var.DIFFERENT
if self.nodes[nNum].flag != otherTree.nodes[nNum].flag:
print(complaintHead)
print(' flag differs, nodeNum %i. %s vs %s' % (nNum, self.nodes[nNum].flag, otherTree.nodes[nNum].flag))
return var.DIFFERENT
if doSplitKeys:
if self.nodes[nNum].br.splitKey != otherTree.nodes[nNum].br.splitKey:
print(complaintHead)
print(f' SplitKeys differ. nodeNum {nNum}')
return var.DIFFERENT
if self.nodes[nNum].br.rawSplitKey != otherTree.nodes[nNum].br.rawSplitKey:
print(complaintHead)
print(f' rawSplitKeys differ. nodeNum {nNum}')
return var.DIFFERENT
if self.model:
# model usage numbers
isBad = 0
for pNum in range(self.model.nParts):
for nNum in range(len(self.nodes)):
selfNode = self.nodes[nNum]
otherNode = otherTree.nodes[nNum]
if selfNode.parts and otherNode.parts:
if selfNode.parts[pNum].compNum != otherNode.parts[pNum].compNum:
isBad = 1
if self.nodes[nNum] != self.root:
if selfNode.br.parts and otherNode.br.parts:
if selfNode.br.parts[pNum].rMatrixNum != otherNode.br.parts[pNum].rMatrixNum:
isBad = 1
elif selfNode.br.parts[pNum].gdasrvNum != otherNode.br.parts[pNum].gdasrvNum:
isBad = 1
if isBad:
print(complaintHead)
print(' Node %i, model usage info does not match.' % nNum)
return var.DIFFERENT
# pre- and postOrder
isBad = 0
for i in range(len(self.preOrder)):
if self.preOrder[i] != otherTree.preOrder[i]:
isBad = 1
break
elif self.postOrder[i] != otherTree.postOrder[i]:
isBad = 1
break
if isBad:
print(complaintHead)
print(' Pre- or postOrder do not match.')
return var.DIFFERENT
if self.nInternalNodes != otherTree.nInternalNodes:
print(complaintHead)
print(' nInternalNodes differ.')
return var.DIFFERENT
# partLikes
for pNum in range(self.model.nParts):
# if otherTree.partLikes[pNum] != self.partLikes[pNum]:
myDiff = math.fabs(otherTree.partLikes[pNum] - self.partLikes[pNum])
if myDiff > 1.e-5:
print(complaintHead)
print(" partLikes differ by %.8f (%g). (%.8f, (%g) %.8f (%g)" % (
myDiff, myDiff,
otherTree.partLikes[pNum], otherTree.partLikes[pNum], self.partLikes[pNum], self.partLikes[pNum]))
return var.DIFFERENT
if doMore: # some more
for nNum in range(len(self.nodes)):
selfNode = self.nodes[nNum]
otherNode = otherTree.nodes[nNum]
if selfNode.nodeNum != otherNode.nodeNum:
print(complaintHead)
print(' nodeNum differs')
return var.DIFFERENT
if selfNode.seqNum != otherNode.seqNum:
print(complaintHead)
print(' seqNum differs')
return var.DIFFERENT
if selfNode.name != otherNode.name:
print(complaintHead)
print(' name differs')
return var.DIFFERENT
if selfNode.isLeaf != otherNode.isLeaf:
print(complaintHead)
print(' isLeaf differs')
return var.DIFFERENT
return var.SAME
############################################
[docs]
def isFullyBifurcating(self, verbose=False, biRoot=False):
"""Returns True if the tree is fully bifurcating. Else False.
If arg biRoot is True, then it is required that the tree be
biRoot'ed (ie have a bifurcating root)
If arg biRoot is False, then biRoot'ed trees are not allowed,
but trees rooted on a leaf are allowed.
"""
rootNChildren = self.root.getNChildren()
if rootNChildren > 3:
if verbose:
print("isFullyBifurcating() returning False, due to root with %i children." % rootNChildren)
return False
if not biRoot:
if rootNChildren == 2:
if verbose:
print("isFullyBifurcating() returning False, due to root with %i children." % rootNChildren)
return False
elif rootNChildren == 1 or rootNChildren == 3: # rooting on a leaf is OK
if rootNChildren == 1:
if not self.root.isLeaf:
if verbose:
print("isFullyBifurcating()", end=" ")
print("returning False, because the single-child root is not a leaf.")
return False
for n in self.iterInternalsNoRoot():
if n.leftChild and n.leftChild.sibling:
if n.leftChild.sibling.sibling:
if verbose:
print("isFullyBifurcating()", end=" ")
print("returning False, due to node %i having 3 or more children." % n.nodeNum)
return False
else:
if verbose:
print("isFullyBifurcating()", end=" ")
print("returning False, due to non-leaf node %i having 1 or fewer children." % n.nodeNum)
return False
return True
else:
raise P4Error("Tree.isFullyBifurcating() --- this should not happen; fix me.")
if biRoot:
# root on a leaf not allowed on a biRoot tree, as the root is explicitly not a leaf.
if rootNChildren != 2:
if verbose:
print("isFullyBifurcating(biRoot=True)", end=" ")
print("returning False, due to root with %i children." % rootNChildren)
return False
for n in self.iterInternalsNoRoot():
if n.leftChild and n.leftChild.sibling:
if n.leftChild.sibling.sibling:
if verbose:
print("isFullyBifurcating(biRoot=True)", end=" ")
print("returning False, due to node %i having 3 or more children." % n.nodeNum)
return False
else:
if verbose:
print("isFullyBifurcating(biRoot=True)", end=" ")
print("returning False, due to non-leaf node %i having 1 or fewer children." % n.nodeNum)
return False
return True
# These next two are for the eTBR implementation that I got from Jason
# Evans' Crux. Thanks Jason!
[docs]
def getDegree(self, nodeSpecifier):
n = self.node(nodeSpecifier)
if n.isLeaf:
if n.parent:
return 1
else:
assert n == self.root
if n.leftChild: # a tree-on-a-stick
return 1
else: # a single isolated node
return 0
else:
#assert n.leftChild
deg = 1 # the leftChild
if n.parent:
deg += 1
s = n.leftChild.sibling
while s:
deg += 1
s = s.sibling
return deg
[docs]
def nextNode(self, spokeSpecifier, hubSpecifier):
"""Get next node cycling around a hub node.
A bit of a hack to make a p4 Node behave sorta like a
Felsenstein node. Imagine cycling around the branches
emanating from a node like spokes on a hub, starting from
anywhere, with no end.
The hub node would usually be the parent of the spoke, or the
spoke would be the hub itself. Usually
self.nextNode(spoke,hub) delivers spoke.sibling. What happens
when the siblings run out is that
self.nextNode(rightmostSibling, hub) delivers hub itself, and
of course its branch (spoke) points toward the hub.parent.
(Unless hub is the root, of course, in which case
self.nextNode(rightmostSibling, hub) delivers hub.leftChild.)
In the usual case of the hub not being the root, the next node
to be delivered by nextNode(spokeIsHub, hub) is usually the
leftChild of the hub. Round and round, clockwise.
"""
spoke = self.node(spokeSpecifier)
hub = self.node(hubSpecifier)
if spoke.parent == hub or hub == spoke:
if spoke == hub:
if spoke.leftChild:
return spoke.leftChild
else:
return hub
else:
if spoke.sibling:
return spoke.sibling
else:
if hub.parent:
return hub
else:
return hub.leftChild
else:
print("*=" * 25)
self.draw()
gm = ["Tree.nextNode() spoke=%i, hub=%i" %
(spoke.nodeNum, hub.nodeNum)]
gm.append(
"Need to have either spoke.parent == hub or hub == spoke.")
raise P4Error(gm)
[docs]
def topologyDistance(self, tree2, metric='sd', resetSplitKeySet=True):
"""Compares the topology of self with tree2.
The complete list of metrics is given in
var.topologyDistanceMetrics
For most metrics using this method, taxNames needs to be set,
to the same in the two trees. If the taxa differ, this method
simply returns -1
The 'metric' can be one of 'sd' (symmetric difference), 'wrf'
(weighted Robinson-Foulds), 'bld' (Felsenstein's branch-
length distance), or 'diffs'. The unweighted Robinson-Foulds
metric would be the same as the symmetric difference.
There is also an experimental scqdist, but that needs the
scqdist.so module, in the QDist directory.
See Felsenstein 2004 Inferring Phylogenies, Pg 529.
The default metric is the very simple 'sd', symmetric
difference. Using this metric, if the 2 trees share the same
set of splits, they are deemed to be the same topology; branch
lengths are not compared. This method returns the number of
splits that are in self that are not in tree2 plus the number
of splits that are in tree2 that are not in self. So it would
return 0 for trees that are the same.
The 'wrf' and 'bld' metrics take branch lengths into account.
Bifurcating roots complicate things, so they are not allowed
for weighted distance calculations.
In the unweighted case (ie metric='sd'), whether the trees
compared have bifurcating roots or not is ignored. So the
trees (A,B,(C,D)) and ((A,B),(C,D)) will be deemed to have the
same topology, since they have the same splits.
The measurement 'diffs', which returns a tuple of 2 numbers --
both are set differences. The first is the number of splits
in self that are not in tree2, and the second is the number of
splits in tree2 that are not in self. (Consider it as the the
symmetric difference split into its 2 parts.)
If you calculate a distance and then make a topology change, a
subsequent sd topologyDistance calculation will be wrong, as
it uses previous splits. So then you need to
'resetSplitKeySet'.
The 'scqdist' metric calculates quartet distances. The code
was written by Anders Kabell Kristensen for his Masters degree
at Aarhus University, 2010.
http://www.cs.au.dk/~dalko/thesis/ It has two versions -- a
pure Python version (that needs scipy) that I do not include
here, and a fast C++ version, that I wrapped in python. Its
speedy -- the 'sc' in 'scqdist' is for 'sub-cubic', ie better
than O(n^3).
I have also incorporated the tqDist v1.0.2 code, from 2014,
also for quartet distance calculations, from Christian N. S.
Pedersen and his group at BiRC in Aarhus. See
https://users-cs.au.dk/cstorm/software/tqdist/ 2014.
It is available here via the 'tqdist' metric.
"""
gm = ['Tree.topologyDistance()']
if metric not in var.topologyDistanceMetrics:
gm.append("Got a request for unknown metric '%s'" % metric)
gm.append("The 'metric' arg should be one of %s" %
var.topologyDistanceMetrics)
raise P4Error(gm)
if metric == 'scqdist': # no need for taxNames
try:
import p4.scqdist as scqdist
except ImportError:
gm.append("Could not find the 'scqdist' module needed for this metric.")
gm.append("See the instructions for making it in the p4 source, in the Qdist directory.")
raise P4Error(gm)
tsSelf = self.writeNewick(toString=True)
tsTree2 = tree2.writeNewick(toString=True)
return scqdist.qdist(tsSelf, tsTree2)
elif metric == 'tqdist': # no need for taxNames
try:
import p4.pytqdist as pytqdist
except ImportError:
gm.append("Could not find the 'tqdist' module needed for this metric.")
gm.append("See the instructions for making it in the p4 source, in the tqDist directory.")
raise P4Error(gm)
tsSelf = self.writeNewick(toString=True, spaceAfterComma=False)
tsTree2 = tree2.writeNewick(toString=True, spaceAfterComma=False)
return pytqdist.qdistFromStrings(tsSelf, tsTree2)
if not self.taxNames or not tree2.taxNames:
gm.append("This method requires taxNames to be set.")
raise P4Error(gm)
if self.taxNames != tree2.taxNames:
gm.append("The taxNames are different for the two trees.")
gm.append("Self: %s" % self.taxNames)
gm.append("tree2: %s" % tree2.taxNames)
raise P4Error(gm)
if (self.root.getNChildren() == 2 or tree2.root.getNChildren() == 2) and (metric in ['wrf', 'bld']):
gm.append('One of the input trees has a bifurcating root.')
gm.append(
'Weighted tree distance calculations do not work on bi-rooted trees.')
raise P4Error(gm)
# I might be doing a lot of these, and I don't want to make
# splitKeys and make the splitKeys set over and over again.
# So store it as a Tree.attribute.
if resetSplitKeySet:
if hasattr(self, 'splitKeySet'):
del(self.splitKeySet)
if hasattr(tree2, 'splitKeySet'):
del(tree2.splitKeySet)
if not hasattr(self, 'splitKeySet'):
self.makeSplitKeys()
self.splitKeySet = set(
[n.br.splitKey for n in self.iterNodesNoRoot()])
if not hasattr(tree2, 'splitKeySet'):
tree2.makeSplitKeys()
tree2.splitKeySet = set(
[n.br.splitKey for n in tree2.iterNodesNoRoot()])
if metric == 'sd':
# Symmetric difference. The symmetric_difference method
# returns all elements that are in exactly one of the sets.
theSD = len(self.splitKeySet.symmetric_difference(tree2.splitKeySet))
return theSD
# The difference method returns the difference of two sets as
# a new Set. I.e. all elements that are in self and not in
# the other.
selfHasButTree2DoesNot = self.splitKeySet.difference(tree2.splitKeySet)
tree2HasButSelfDoesNot = tree2.splitKeySet.difference(self.splitKeySet)
if metric == 'diffs':
return len(selfHasButTree2DoesNot), len(tree2HasButSelfDoesNot)
if metric in ['wrf', 'bld']:
self.splitKeyHash = {}
for n in self.iterNodesNoRoot():
self.splitKeyHash[n.br.splitKey] = n
tree2.splitKeyHash = {}
for n in tree2.iterNodesNoRoot():
tree2.splitKeyHash[n.br.splitKey] = n
if metric == 'wrf':
theSum = 0.0
for k in self.splitKeySet.intersection(tree2.splitKeySet):
# print '%s - %s' % (self.splitKeyHash[k].br.len,
# tree2.splitKeyHash[k].br.len)
theSum += abs(self.splitKeyHash[k].br.len -
tree2.splitKeyHash[k].br.len)
for k in selfHasButTree2DoesNot:
# print 'x %s' % self.splitKeyHash[k].br.len
theSum += self.splitKeyHash[k].br.len
for k in tree2HasButSelfDoesNot:
# print 'y %s' % tree2.splitKeyHash[k].br.len
theSum += tree2.splitKeyHash[k].br.len
return theSum
elif metric == 'bld':
theSum = 0.0
for k in self.splitKeySet.intersection(tree2.splitKeySet):
theDiff = self.splitKeyHash[
k].br.len - tree2.splitKeyHash[k].br.len
theSum += theDiff * theDiff
for k in selfHasButTree2DoesNot:
theSum += self.splitKeyHash[k].br.len * \
self.splitKeyHash[k].br.len
for k in tree2HasButSelfDoesNot:
theSum += tree2.splitKeyHash[k].br.len * \
tree2.splitKeyHash[k].br.len
# print 'branch score =', theSum
return math.sqrt(theSum)
##################################################
[docs]
def tv(self):
"""Tree Viewer. Show the tree in a gui window.
Needs Tkinter.
If you have nexus taxsets defined, you can show them.
"""
from p4.btv import TV
#import os
#os.environ['PYTHONINSPECT'] = '1'
TV(self)
[docs]
def btv(self):
"""Big Tree Viewer. Show the tree in a gui window.
This is for looking at big trees. The viewer has 2 panels --
one for an overall view of the whole tree, and one for a
zoomed view, controlled by a selection rectangle on the whole
tree view.
Needs Tkinter.
If you have nexus taxsets defined, you can show them.
"""
from p4.btv import BTV
#import os
#os.environ['PYTHONINSPECT'] = '1'
BTV(self)
[docs]
def tvTopologyCompare(self, treeB):
"""Graphically show topology differences.
The taxNames need to be set, and need to be the same for both
trees.
(If the red lines don't show up right away, try adjusting the
size of the windows slightly.)
"""
sd = self.topologyDistance(treeB)
if sd == 0:
print("The trees are the same. No tv.")
return
# for sk in self.splitKeyHash.keys():
# if sk not in treeB.splitKeyHash:
# print "self has sk
self.splitKeyHash = {}
for n in self.iterInternalsNoRoot():
self.splitKeyHash[n.br.splitKey] = n
treeB.splitKeyHash = {}
for n in treeB.iterNodesNoRoot():
treeB.splitKeyHash[n.br.splitKey] = n
assert self.splitKeySet
assert treeB.splitKeySet
selfHasButTreeBDoesnt = self.splitKeySet.difference(treeB.splitKeySet)
treeBHasButSelfDoesnt = treeB.splitKeySet.difference(self.splitKeySet)
from p4.btv import TV
#import os
#os.environ['PYTHONINSPECT'] = '1'
TV(self, title='TV self')
TV(treeB, title='TV treeB')
for spl in selfHasButTreeBDoesnt:
n = self.splitKeyHash[spl]
n.br.color = 'red'
for spl in treeBHasButSelfDoesnt:
n = treeB.splitKeyHash[spl]
n.br.color = 'orange'
##################################################
[docs]
def drawTopologyCompare(self, treeB, showNodeNums=False):
"""Graphically show topology differences between two trees.
The two trees (self and treeB) are drawn as text, with differences
highlighted with a thick branch.
The taxNames need to be set, and need to be the same for both
trees.
"""
sd = self.topologyDistance(treeB)
if sd == 0:
print("The trees are the same. ")
return
self.splitKeyHash = {}
for n in self.iterInternalsNoRoot():
self.splitKeyHash[n.br.splitKey] = n
treeB.splitKeyHash = {}
for n in treeB.iterNodesNoRoot():
treeB.splitKeyHash[n.br.splitKey] = n
assert self.splitKeySet
assert treeB.splitKeySet
selfHasButTreeBDoesnt = self.splitKeySet.difference(treeB.splitKeySet)
treeBHasButSelfDoesnt = treeB.splitKeySet.difference(self.splitKeySet)
for spl in selfHasButTreeBDoesnt:
n = self.splitKeyHash[spl]
n.br.textDrawSymbol = '='
for spl in treeBHasButSelfDoesnt:
n = treeB.splitKeyHash[spl]
n.br.textDrawSymbol = '='
self.draw(showNodeNums=showNodeNums)
treeB.draw(showNodeNums=showNodeNums)
[docs]
def getNodeOnReferenceTreeCorrespondingToSelfRoot(self, refTree, verbose=True):
"""Find, on a ref tree, a node corresponding to where the self root is.
This works for both bifurcating and non-bifurcating roots (of
self).
The refTree should not be bi-rooted.
To facilitate doing a lot of (self) trees, a counter in the
corresponding node in the refTree is incremented. The refTree
is rooted on the first taxon. For bi-rooted trees, the
node.br.biRootCount is incremented, and for non-bi-rooted
trees the node.rootCount is incremented. The root counts
should therefore be immune to reRoot()'ing.
"""
gm = ["Tree.getNodeOnReferenceTreeCorrespondingToSelfRoot()"]
assert self is not refTree
assert self.taxNames
assert refTree.taxNames
assert self.taxNames == refTree.taxNames
isBiRoot = False
# sr = self.root; srnChildren = nChildren of self.root
srnChildren = self.root.getNChildren()
if srnChildren == 1:
gm.append("Self root has only one child; does not work")
raise P4Error(gm)
elif srnChildren == 2:
isBiRoot = True
for n in refTree.iterNodesNoRoot():
if not hasattr(n.br, "biRootCount"):
n.br.biRootCount = 0
else:
for n in refTree.iterNodes():
if not hasattr(n, "rootCount"):
n.rootCount = 0
#self.draw()
refTree.reRoot(self.taxNames[0])
#refTree.draw()
self.makeSplitKeys() # default makeNodeForSplitKeyDict=True
refTree.makeSplitKeys()
for ch in self.root.iterChildren():
if ch.br.rawSplitKey & 1:
break
# ch clade has the first taxon, ch.parent is self.root
#print("self root child %i clade has the first taxon" % ch.nodeNum)
refNode = refTree.nodeForSplitKeyDict.get(ch.br.splitKey)
if refNode:
if isBiRoot:
refNode.br.biRootCount += 1
if verbose:
#print("When the refTree is rooted on the first taxon,")
#print("the bi-root is on the branch on refTree node %i" % refNode.nodeNum)
print("Incrementing refTree node %i br.biRootCount by 1" % refNode.nodeNum)
else:
refNode.rootCount += 1
if verbose:
#print("When the refTree is rooted on the first taxon,")
#print("the root is at refTree node %i" % refNode.nodeNum)
print("Incrementing refTree node %i rootCount by 1" % refNode.nodeNum)
return refNode
else:
if verbose:
print("There is no node in the refTree corresponding to the self.root")
return None
[docs]
def readPhyloXmlFile(self, fName, verbose=False):
"""Start with an empty Tree, read in a phyloxml file"""
assert not self.nodes, "The tree should be empty, with no nodes"
import xml.etree.ElementTree as ET
xAll = ET.parse(fName)
for el in xAll.iter():
if '}' in el.tag:
el.tag = el.tag.split('}', 1)[1] # strip all namespaces
xroot = xAll.getroot()
xphylogeny = xroot.find('phylogeny')
assert xphylogeny
rootclade = xphylogeny.find('clade')
assert rootclade
def recurseClade(theClade):
global currentNode
for el in theClade:
#print(el.tag)
if el.tag == 'clade':
n = Node()
n.nodeNum = len(self.nodes)
el.attrib['p4Node'] = n
n.parent = theClade.attrib['p4Node']
if n.parent:
if n.parent.leftChild == None:
n.parent.leftChild = n
else:
rmCh = n.parent.rightmostChild()
assert rmCh, "Can't get rightmostChild"
rmCh.sibling = n
self.nodes.append(n)
currentNode = n
recurseClade(el)
elif el.tag == "branch_length":
currentNode.br.len = float(el.text)
elif el.tag == 'name':
#print(el.text)
currentNode.name = el.text
if verbose:
print("Got node name", el.text)
elif el.tag == 'width':
currentNode.br.width = float(el.text)
if verbose:
print("Got node width, as node.br.width", el.text)
elif el.tag == 'color':
thiscol = {}
for el2 in el:
thiscol[el2.tag] = int(el2.text)
cols = ['red', 'green', 'blue']
for col in cols:
assert thiscol.get(col)
myHexString = ""
for col in cols:
bit = thiscol.get(col)
hx = hex(bit)[2:].upper()
myHexString += hx
#print(myHexString)
currentNode.br.hexColor = myHexString
if verbose:
print("Got node color (as attribute node.br.hexColor)", currentNode.br.hexColor)
else:
print("Fixme. Unhandled/unused element with tag %s" % el.tag)
n = Node()
n.nodeNum = 0
n.br = None
self.root = n
self.nodes.append(n)
rootclade.attrib['p4Node'] = n
recurseClade(rootclade)
for n in self.nodes:
if not n.leftChild:
n.isLeaf = 1
self.setPreAndPostOrder()
[docs]
def attachRooter(self):
rNode = self.nodes[-1]
assert rNode.name == 'tempRooter'
assert rNode.nodeNum == var.NO_ORDER
rtMostCh = self.root.rightmostChild()
rtMostCh.sibling = rNode
rNode.sibling = None
rNode.parent = self.root
rNode.nodeNum = len(self.nodes) - 1
if self.taxNames:
# taxnames can be shared among trees, so check whether it has already been added
if rNode.name not in self.taxNames:
self.taxNames.append(rNode.name)
self.preAndPostOrderAreValid = False
self.setPreAndPostOrder()
#if self._nTax:
# self._nTax += 1
return rNode
[docs]
def detachRooter(self):
rNode = self.nodes[-1]
assert rNode.parent == self.root
safety = 0
while self.root.leftChild.sibling.sibling != rNode:
self.rotateAround(self.root)
safety += 1
if safety >= 5:
self.draw()
raise P4Error("Tree.detachRooter() Too many rotations.")
assert self.root.leftChild.sibling.sibling == rNode
self.root.leftChild.sibling.sibling = None
rNode.parent = None
rNode.nodeNum = var.NO_ORDER
self.preAndPostOrderAreValid = False
self.setPreAndPostOrder()
if self.taxNames:
# taxnames can be shared among trees, so check whether it is in
if rNode.name == self.taxNames[-1]:
self.taxNames.pop()
#if self._nTax:
# self._nTax -= 1
[docs]
def isBiRoot(self):
"""Answers whether self has a bifurcating root"""
rootNChildren = self.root.getNChildren()
if rootNChildren == 2:
return True
return False
[docs]
def isTriRoot(self):
"""Answers whether self has a trifurcating root"""
rootNChildren = self.root.getNChildren()
if rootNChildren == 3:
return True
return False
[docs]
def isCompatibleWith(self, otherTree):
"""Determines whether self is compatible with otherTree
Returns True or False.
"""
assert otherTree.nTax == self.nTax
assert otherTree.taxNames == self.taxNames
# It is easier with sets, so make sets from splitKeys
self.makeSplitKeys()
for n in self.iterInternalsNoRoot():
ss = set()
for i in range(self.nTax):
tester = 2 ** i
if tester & n.br.splitKey:
ss.add(tester)
n.br.splSet = ss
otherTree.makeSplitKeys()
for n in otherTree.iterInternalsNoRoot():
ss = set()
for i in range(self.nTax):
tester = 2 ** i
if tester & n.br.splitKey:
ss.add(tester)
n.br.splSet = ss
isCompatible = True
for nA in self.iterInternalsNoRoot():
for nB in otherTree.iterInternalsNoRoot():
if len(nA.br.splSet.intersection(nB.br.splSet)) \
and len(nA.br.splSet.difference(nB.br.splSet)) \
and len(nB.br.splSet.difference(nA.br.splSet)):
isCompatible = False
break
if not isCompatible:
break
# print(f"isCompatible is {isCompatible}")
return isCompatible
#################################################### manip
####################################################
[docs]
def node(self, specifier):
"""Get a node based on a specifier.
The *specifier* can be a nodeNum, name, or node object.
"""
nodeNum = None
gm = ['Tree.node()']
if isinstance(specifier, int):
nodeNum = specifier
elif isinstance(specifier, numpy.int32):
nodeNum = specifier
elif isinstance(specifier, Node):
if specifier in self.nodes:
return specifier
else:
gm.append(
"The specifier is a node object, but is not part of self.")
raise P4Error(gm)
elif isinstance(specifier, str):
for n in self.iterNodes():
if n.name == specifier:
return n
# if we haven't found a node matching the specier...
if nodeNum == None:
gm.append("Specifier string '%s' is not a node name. What gives?" % specifier)
raise P4Error(gm)
else:
gm.append("I don't understand the specifier '%s', type '%s'." % (
specifier, type(specifier)))
raise P4Error(gm)
if nodeNum < 0 or nodeNum >= len(self.nodes):
gm.append("The requested node number %i is out of range (of %i nodes)." % (nodeNum, len(self.nodes)))
raise P4Error(gm)
return self.nodes[nodeNum]
[docs]
def rotateAround(self, specifier):
"""Rotate a clade around a node.
The *specifier* can be a nodeNum, name, or node object.
"""
gm = ['Tree.rotateAround()']
rotateNode = self.node(specifier)
if rotateNode.isLeaf:
print(gm[0])
print(" The rotateNode is a terminal node. Not doing anything ...")
return
if rotateNode.getNChildren() == 1:
print(gm[0])
print(" The rotateNode has only one child. Not doing anything...")
return
# set up to unattach the rightmost child, and reattach at the left
oldLeftChild = rotateNode.leftChild
oldRightmostChild = rotateNode.rightmostChild()
# we need to know the node second from the right, which will become the
# newRightmostChild
newRightmostChild = rotateNode.leftChild # as a first guess...
while newRightmostChild.sibling != oldRightmostChild:
newRightmostChild = newRightmostChild.sibling
# now the newRightmostChild is indeed the second from the right
# now unattach the rightmost child, and reattach at the left
newRightmostChild.sibling = None
oldRightmostChild.sibling = oldLeftChild
oldLeftChild.parent.leftChild = oldRightmostChild
self.preAndPostOrderAreValid = 0
[docs]
def reRoot(self, specifier, moveInternalName=True, stompRootName=True, checkBiRoot=True, fixRawSplitKeys=False):
"""Re-root the tree to the node described by the specifier.
It does it in-place, and returns None.
The *specifier* can be a node.nodeNum, node.name, or node object.
Here is a potential problem. Lets say you start with this tree,
with split support as shown::
(((A, B)99, C)70, D, E);
+--------3:A
+---------2:99
+--------1:70 +--------4:B
| |
| +---------5:C
0
|--------6:D
|
+--------7:E
Now we want to reRoot it to node 2. So we do that. Often,
node.name's are really there to label the branch, not the node;
you may be using node.name's to label your branches (splits), eg
with support values. If that is the case, then you want to keep
the node name with the branch (not the node) as you reRoot(). Ie
you want node labels to behave like branch labels. If that is the
case, we set moveInternalName=True; that is the default. When
that is done in the example above, we get::
(A, B, (C, (D, E)70)99);
+--------3:A
|
|--------4:B
2
| +---------5:C
+--------1:99
| +--------6:D
+---------0:70
+--------7:E
Another possibility is that the node names actually are there to
name the node, not the branch, and you want to keep the node name
with the node during the re-rooting process. That can be done by
setting moveInternalName=False, and the tree below is what happens
when you do that. Each internal node.name stays with its node in
the re-rooting process::
(A, B, (C, (D, E))70)99;
+--------3:A
|
|--------4:B
99:2
| +---------5:C
+--------1:70
| +--------6:D
+---------0
+--------7:E
Now if you had the default moveInternalName=True and you had a
node name on the root, that would not work (draw it out to
convince yourself ...). So in that case you probably want
stompRootName=True as well --- Both are default. If you set
stompRootName=True, it gives you a little warning as it does it.
If you set stompRootName=2, it will do it silently. If
moveInternalName is not set, then stompRootName is not used.
You probably do not want to ``reRoot()`` if the current root is
bifurcating. If you do that, you will get a node in the tree with
a single child, which is strictly ok, but useless and confusing.
So by default I checkBiRoot=True, and throw a P4Error if there is
one. If you want to draw such a pathological tree with a node
with a single child, set checkBiRoot=False, and it will allow it.
"""
gm = ['Tree.reRoot()']
# The user probably does not want to reRoot if the current root is
# bifurcating. So check that.
if checkBiRoot:
if self.root.getNChildren() == 2:
gm.append("The tree has a bifurcating root, so you probably do not")
gm.append("want to reRoot() it. You can remove the bifurcating root")
gm.append("with yourTree.removeRoot(). If you really want to reRoot()")
gm.append("with a bifurcating root, set checkBiRoot=False in the reRoot() args.")
raise P4Error(gm)
if self.root.isLeaf and not self.root.name:
gm.append("The root is a leaf, but has no name.")
gm.append("So when you reRoot() it, some other leaf will have no name.")
gm.append("That is a recipe for trouble, and is not allowed.")
raise P4Error(gm)
newRoot = self.node(specifier)
oldRoot = self.root
if newRoot == oldRoot:
return
if moveInternalName:
if self.root.name and not self.root.isLeaf:
if stompRootName:
if stompRootName != 2:
print("Notice. Tree.reRoot(stompRootName) is set, so the root name '%s' is being zapped..." % self.root.name)
print("(Set stompRootName=2 to do this silently ...)")
self.root.name = None
else:
gm.append("Setting 'moveInternalName' implies keeping node names with their branches.")
gm.append("The root in this tree has a name, but has no branch.")
gm.append("So that does not work.")
gm.append("Set arg stompRootName to work around this.")
raise P4Error(gm)
if self.root.isLeaf and self.root.leftChild.name:
assert self.root.name
self.draw()
gm.append("The current root is a leaf, with a name.")
gm.append("Its sole child has a name also, which is an 'internal' node name.")
gm.append("Arg moveInternalName is turned on.")
gm.append("So when the tree gets re-rooted, that internal node name should stay with its branch, not its node.")
gm.append("But the rerooted branch will already have a name-- the current root name.")
gm.append("So that does not work.")
raise P4Error(gm)
path = [newRoot]
theParent = newRoot.parent
while theParent != oldRoot:
path.append(theParent)
theParent = theParent.parent
if 0:
print("path from the newRoot to the oldRoot:")
for i in path:
print(" %i" % i.nodeNum)
while len(path):
# We reverse the path above. Its last entry is a child of the old root.
# Start there-- its to be called the 'swivelNode'
# print "Rooting on node %i..." % path[-1].nodeNum
# +----------2:A
# +----------1
# | +----------3:B
# 0
# +----------4:C
# |
# +----------5:D
# +----------2:A
# |
# 1----------3:B
# |
# | +----------4:C
# +----------0
# +----------5:D
# in case this is not the first time around this loop...
oldRoot = self.root
swivelNode = path.pop()
# Move the old root around the swivel node to the right.
oldRoot.parent = swivelNode
oldRoot.br = swivelNode.br
swivelNode.br = None
# If we are keeping the node.name with the branch, then do so now.
if moveInternalName:
if swivelNode.isLeaf:
pass
elif oldRoot.isLeaf:
pass
else:
oldRoot.name = swivelNode.name
swivelNode.name = None
swivelNode.parent = None # as befits a root
if swivelNode.leftChild:
swivelNode.rightmostChild().sibling = oldRoot
else:
swivelNode.leftChild = oldRoot
# What we do next depends on whether the swivelNode is the
# left, middle, or right child of the oldRoot
if oldRoot.leftChild == swivelNode:
oldRoot.leftChild = swivelNode.sibling
else:
leftSib = oldRoot.leftChild
while leftSib.sibling != swivelNode:
leftSib = leftSib.sibling
# At this point leftSib is the node that was left sib of the swivelNode
# But the swivelNode is no longer there, so skip to the other side of it
# (which may be None, if the swivelNode was rightmost child of the oldRoot).
leftSib.sibling = swivelNode.sibling
swivelNode.sibling = None # as befits a root
self.root = swivelNode
# The splitKey is still good, but the rawSplitKey needs updating.
if fixRawSplitKeys and oldRoot.br.rawSplitKey:
oldRoot.br.rawSplitKey = 0
if oldRoot.leftChild:
for n in oldRoot.iterChildren():
oldRoot.br.rawSplitKey += n.br.rawSplitKey
else:
oldRoot.br.rawSplitKey = 1 << self.taxNames.index(
oldRoot.name) # "<<" is left-shift
self.preAndPostOrderAreValid = 0
[docs]
def removeRoot(self):
"""Removes the root if self.root is mono- or bifurcating.
This removes the root node if the tree is rooted on a terminal
node, or if the tree is rooted on a bifurcating node. Otherwise,
it refuses to do anything.
In the usual case of removing a bifurcating root, the branch
length of one fork of the bifurcation is added to the other fork,
so the tree length is preserved.
In the unusual case of removing a monofurcating root (a root that
is a terminal node, a tree-on-a-stick) then its branch length
disappears.
"""
gm = ['\nTree.removeRoot()']
oldRoot = self.root
newRoot = None
if not oldRoot.leftChild:
gm.append("The root has no children.")
raise P4Error(gm)
self.deleteCStuff()
nRootChildren = self.root.getNChildren()
if nRootChildren == 1:
# The root has only one child. Its rooted on a terminal node.
# Its like this:
# +---A
# 0---1
# +---B
newRoot = oldRoot.leftChild
newRoot.parent = None
newRoot.br = None
elif nRootChildren == 2:
# The root has exactly two children. This would be the usual,
# expected, case. We want to find the first child (of the
# oldRoot) with more than one child (of the child).
newRoot = oldRoot.leftChild # Try this one first:
# Ideally, and usually, the new root should have two or more
# children. Imagine what would happen if this tree
#
# +-----A
# 0 +----B
# +---2
# +----C
#
# had its root removed, and then
# had A for its new root. We get
# +---B
# A---2
# +---C
# It would be better in this case if the new root was 2. So we
# want to search for candidate newRoots that have at least 2
# children. So is the current candidate, newRoot, good
# enough?
while newRoot:
if newRoot.getNChildren() >= 2:
break # its good enough, use it
newRoot = newRoot.sibling # its not good, try its sib
# If we got this far an failed to find a good root, then
# newRoot is None. We are dealing with a tree where all the
# root children are either leaves or have only one child. Ok,
# its unusual, but we do need a new root, even if it is a
# terminal node. So arbitrarily choose the leftChild to root
# on.
if not newRoot:
newRoot = oldRoot.leftChild
# print "x Re-rooting on node number %i" % newRoot.nodeNum
if newRoot.leftChild:
newCh = newRoot.leftChild
newCh.sibling = newRoot.sibling
else:
newCh = newRoot.sibling
newRoot.leftChild = newCh
newRoot.parent = None
newRoot.sibling = None
newCh.parent = newRoot
newCh.br.len = newCh.br.len + newRoot.br.len
newRoot.br = None
else: # We found a good new root, with two or more children
# print "y reRooting on nodeNum %i" % newRoot.nodeNum
if newRoot == oldRoot.leftChild:
newRoot.sibling.br.len = newRoot.sibling.br.len + \
newRoot.br.len
newRoot.br = None
newRoot.rightmostChild().sibling = newRoot.sibling
newRoot.sibling.parent = newRoot
newRoot.parent = None
newRoot.sibling = None
else:
# the new root is the right child of the old
oldLeftCh = newRoot.leftChild
oldRoot.leftChild.br.len += newRoot.br.len
newRoot.br = None
newRoot.leftChild = oldRoot.leftChild
newRoot.leftChild.parent = newRoot
newRoot.leftChild.sibling = oldLeftCh
newRoot.parent = None
newRoot.sibling = None
else:
gm.append("The root has more than two children.")
gm.append(
"Removing the root with more than two children is not implemented.")
gm.append("Are you even sure you want to do that?")
raise P4Error(gm)
oldRoot.wipe()
self.nodes.remove(oldRoot)
del oldRoot
if newRoot:
self.root = newRoot
else:
gm.append("No newRoot? Programming error.")
raise P4Error(gm)
for i in range(len(self.nodes)):
self.nodes[i].nodeNum = i
self.preOrder = None
self.postOrder = None
self.preAndPostOrderAreValid = 0
self.setPreAndPostOrder()
self._nTax = 0
[docs]
def removeNode(self, specifier, alsoRemoveSingleChildParentNode=True, alsoRemoveBiRoot=True, alsoRemoveSingleChildRoot=True, deleteCStuff=True):
"""Remove a node, together with everything above it.
Arg *specifier* can be a nodeNum, name, or node object.
So lets say that we have a tree like this::
+-------1:A
0
| +--------3:B
+-------2
+--------4:C
and we remove node 4. When it is removed, node 2 ends up having
only one child. Generally you would want to remove it as well (so
that the parent of node 3 is node 0), so the option
*alsoRemoveSingleChildParentNode* is turned on by default. If
*alsoRemoveSingleChildParentNode* is turned off, nodes like node 2
get left in the tree.
Removal of a node might cause the creation of a bifurcating root.
I assume that is not desired, so alsoRemoveBiRoot is turned on by
default.
In the example above, if I were to remove node 4, by default node
2 would also disappear, but by default node 0 would also disappear
because it would then be a tree with a bifurcating root node. So
starting with a 5-node tree, by removing 1 node you would end up
with a 2-node tree, with 1 branch.
In the example here, if I were to simply remove node 1::
+--------1:A
|
0 +---------3:B
+--------2
| +--------5:C
+---------4
+--------6:D
Then node 0 would remain, as::
+---------2:B
0--------1
| +--------4:C
+---------3
+--------5:D
Presumably that is not wanted, so the arg
*alsoRemoveSingleChildRoot* is turned on by default. When the root
is removed, we are left with a bi-root, which (if the arg
alsoRemoveBiRoot is set) would also be removed. The resulting
tree would be::
+-------0:B
|
1-------2:C
|
+-------3:D
The deleted nodes are really deleted, and do not remain in self.nodes.
"""
gm = ['Tree.removeNode()']
rNode = self.node(specifier)
if rNode == self.root:
print(gm[0])
print(" The specified node appears to be the root.")
print(" Removing everything above the root would leave nothing.")
print(" I assume that you do not want to do that.")
print(" So I'm not doing that.")
# self.draw()
# print "the specifier was %s" % specifier
# print "the specified node was node number %i" % rNode.nodeNum
#raise P4Error
return
rNodeParnt = rNode.parent
if deleteCStuff:
self.deleteCStuff()
# For cases where the tree is originally with a single child root
# -- we don't want to then delete that root below.
assert self.root.leftChild
isOriginallySingleChildRoot = False
if not self.root.leftChild.sibling:
isOriginallySingleChildRoot = True
# For cases where we originally have a bi-Root, that we would like to
# keep.
isOriginallyBiRoot = False
if self.root.getNChildren() == 2:
isOriginallyBiRoot = True
# print "Removing node number", rNode.nodeNum
#hitList = []
#self.recursivelyListNodeIndicesDownTo(hitList, rNode)
# getNodeNumsAbove does not require setting
# self.preAndPostOrderAreValid = 0. Its irrelevant.
# does not include rNode
hitList = self.getNodeNumsAbove(rNode.nodeNum)
hitList.append(rNode.nodeNum)
# hitList.sort()
# hitList.reverse()
# print "Hit list is", hitList
hitNodes = []
for i in hitList:
hitNodes.append(self.nodes[i])
# Disconnect it from the tree.
# the rNode is the left, middle, or right child of the parent
if rNodeParnt.leftChild == rNode:
# its the left child
rNodeParnt.leftChild = rNode.sibling
rNode.sibling = None
else:
# its a middle or a right child
# If its a right child, then rNode.sibling = None
leftSib = rNode.leftSibling()
if 1:
if leftSib:
# print "leftSib is node %i" % leftSib.nodeNum
leftSib.sibling = rNode.sibling
else:
gm.append("leftSib is None. This shouldn't happen")
gm.append("Programming error?")
raise P4Error(gm)
haveRemovedSingleChildRoot = False
if not isOriginallySingleChildRoot:
# The tree may have been bifurcating, and left a single-child root.
if alsoRemoveSingleChildRoot and self.root.leftChild and not self.root.leftChild.sibling:
hitNodes.append(self.root)
self.root = self.root.leftChild
self.root.parent = None
haveRemovedSingleChildRoot = True
for n in hitNodes:
n.wipe()
self.nodes.remove(n)
if n.isLeaf and self.taxNames and n.name and n.name in self.taxNames:
self.taxNames.remove(n.name)
del n
self._nTax = 0
if 1:
for i in range(len(self.nodes)):
self.nodes[i].nodeNum = i
# self.dump(node=1)
self.preOrder = None
self.postOrder = None
self.preAndPostOrderAreValid = 0
# self.draw() # This won't work unless preAndPostOrderAreValid set
# to 0
# the parent of the removed node may now only have one child,
# in which case it (the parent) should be removed
if alsoRemoveSingleChildParentNode or alsoRemoveBiRoot or haveRemovedSingleChildRoot:
ignoreBrLens = True
self.preOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
self.postOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
if len(self.nodes) > 1:
self.setPreAndPostOrder()
for n in self.iterNodesNoRoot():
if n.br.len != 0.1:
ignoreBrLens = False
break
if alsoRemoveSingleChildParentNode:
if rNodeParnt and rNodeParnt.leftChild and not rNodeParnt.leftChild.sibling:
if rNodeParnt.parent:
# rNodeParnt is a left, middle, or right child
if rNodeParnt.parent.leftChild == rNodeParnt:
# print 'rNodeParnt is a left child'
rNodeParnt.parent.leftChild = rNodeParnt.leftChild
else:
# print 'rNodeParnt is a middle or right child'
rNodeParnt.leftSibling().sibling = rNodeParnt.leftChild
rNodeParnt.leftChild.sibling = rNodeParnt.sibling
if not ignoreBrLens:
rNodeParnt.leftChild.br.len += rNodeParnt.br.len
rNodeParnt.leftChild.parent = rNodeParnt.parent
rNodeParnt.wipe()
self.nodes.remove(rNodeParnt)
del rNodeParnt
if not isOriginallyBiRoot:
if self.root.getNChildren() == 2 and alsoRemoveBiRoot:
self.removeRoot()
if ignoreBrLens:
for ch in self.root.iterChildren():
ch.br.len = 0.1
for i in range(len(self.nodes)):
self.nodes[i].nodeNum = i
self.preOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
self.postOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
if len(self.nodes) > 1:
self.setPreAndPostOrder()
[docs]
def removeAboveNode(self, specifier, newName):
"""Remove everything above an internal node, making it a leaf, and so needing a new name.
"""
rNode = self.node(specifier)
assert rNode != self.root
assert not rNode.isLeaf
toDelete = [n for n in rNode.iterPostOrder() if n != rNode]
# print [n.nodeNum for n in toDelete]
for n in toDelete:
# print 'deleting node %i' % n.nodeNum
self.removeNode(n, alsoRemoveSingleChildParentNode=False)
rNode.name = newName
rNode.isLeaf = 1
if self.taxNames:
self.taxNames.append(newName)
[docs]
def collapseNode(self, specifier):
"""Collapse the specified node to make a polytomy, and remove it from the tree.
Arg *specifier*, as usual, can be a node, node number, or node name.
The specified node remains in self.nodes, and is returned.
"""
theNode = self.node(specifier)
assert theNode in self.nodes, "The specified Node is not in the tree."
#assert theNode.leftChild and theNode.leftChild.sibling, "The specified node must have at least 2 children."
assert not theNode.isLeaf, "The specified node must not be a leaf."
assert theNode is not self.root, "The specified node must not be the tree root."
# print "Collapsing node %i" % theNode.nodeNum
theNewParent = theNode.parent
theRightmostChild = theNode.rightmostChild()
theLeftSib = theNode.leftSibling()
if theLeftSib:
theLeftSib.sibling = theNode.leftChild
else:
theNewParent.leftChild = theNode.leftChild
for n in theNode.iterChildren():
n.parent = theNewParent
theRightmostChild.sibling = theNode.sibling
theNode.wipe()
# The following does not work well.
# self.nodes.remove(theNode)
# del(theNode)
self.setPreAndPostOrder()
self._nInternalNodes -= 1
[docs]
def collapseClade(self, specifier):
"""Collapse all nodes within a clade, but not the clade itself
Arg *specifier*, as usual, can be a node, node number, or node
name. It calls Tree.collapseNode repeatedly until the clade
is a comb. Should be useful for making constraint trees.
"""
rNode = self.node(specifier)
assert rNode != self.root
assert not rNode.isLeaf
toCollapse = [n for n in rNode.iterPostOrder() if n != rNode and not n.isLeaf]
for n in toCollapse:
# print(f'collapsing node {n.nodeNum}')
self.collapseNode(n)
[docs]
def pruneSubTreeWithoutParent(self, specifier, allowSingleChildNode=False):
"""Remove and return a node, together with everything above it.
Arg *specifier* can be a nodeNum, name, or node object.
By default, the arg allowSingleChildNode is turned off, and is for
those cases where the parent of the node has more than 2 children.
So when the subTree is removed, the parent node that is left
behind has more than one child.
The stuff that is removed is returned. The nodes are left in
self; the idea being that the subTree will be added back to the
tree again (via reconnectSubTreeWithoutParent()).
"""
gm = ['Tree.pruneSubTreeWithoutParent()']
rNode = self.node(specifier)
if rNode == self.root:
gm.append("The specified node is the root.")
raise P4Error(gm)
rNodeParnt = rNode.parent
if not allowSingleChildNode:
if rNodeParnt.getNChildren() < 3:
# self.draw()
gm.append("The arg allowSingleChildNode is turned off.")
gm.append(
"This would be for those cases where the parent of the subTree has more than 2 children.")
raise P4Error(gm)
# self.deleteCStuff()
# print "rNode is node %i" % rNode.nodeNum
# print "rNodeParnt is node %i" % rNodeParnt.nodeNum
# Disconnect it from the tree.
# the rNode is the left, middle, or right child of the parent
if rNodeParnt.leftChild == rNode:
# print "its the left child"
rNodeParnt.leftChild = rNode.sibling
rNode.sibling = None
rNode.parent = None
else:
# print "its a middle or a right child"
leftSib = rNode.leftSibling()
assert leftSib
leftSib.sibling = rNode.sibling
rNode.sibling = None
rNode.parent = None
self.preAndPostOrderAreValid = 0
self._nTax = 0
return rNode
[docs]
def reconnectSubTreeWithoutParent(self, stNode, newParent, beforeNode=None):
"""Attach subtree stNode to the rest of the tree at newParent.
The beforeNode is by default None, and then the subtree is
reconnected as the rightmost child of the new parent. However, if
you want it somewhere else, for example as the leftmost child, or
between two existing child nodes, specify a beforeNode (specified
as usual as a node, nodeNumber, or node name) and the subtree will
be inserted there.
"""
gm = ["Tree.reconnectSubTreeWithoutParent()"]
newParent = self.node(newParent)
if not newParent.leftChild:
gm.append("Can't attach to a leaf.")
raise P4Error(gm)
stNode.parent = newParent
if beforeNode == None: # easy -- just add it to the rightmost child.
theRMChildOfNewParent = newParent.rightmostChild()
theRMChildOfNewParent.sibling = stNode
else:
bNode = self.node(beforeNode)
if bNode.parent != newParent:
gm.append(
"The parent of the 'beforeNode' should be the newParent.")
raise P4Error(gm)
if newParent.leftChild == bNode:
newParent.leftChild = stNode
else:
lSib = newParent.leftChild
while lSib.sibling != bNode:
lSib = lSib.sibling
lSib.sibling = stNode
stNode.sibling = bNode
self._nTax = 0
[docs]
def pruneSubTreeWithParent(self, specifier):
"""Remove and return a subtree, with its parent node and branch
Args:
specifier: Arg ``specifier`` can be a nodeNum, node name, or node object.
Returns:
The node that is the parent of the specified node is returned.
The nodes are left in self (ie they are not removed from the
self.nodes list; the idea being that the subtree will be added
back to the tree again (via :meth:`~p4.tree.Tree.reconnectSubTreeWithParent()`).
Let's say that we want to detach the subtree composed of taxa E, F, and G, below::
# +--------1:A
# |
# |--------2:B
# |
# 0 +---------4:C
# | |
# | | +--------6:D
# | | |
# | |---------5 +--------8:E
# +--------3 | |
# | +--------7 +---------10:F
# | +--------9
# | +---------11:G
# |
# +---------12:H
This really means that we want to remove nodes 5 (yes!) and 7
as well, and their branches. To specify that subtree, we can
use node 7 as the specifier when we call this method. The
node that is returned in this case is node 5 ::
stNode = t.pruneSubTreeWithParent(7)
print(f"The returned object is a {stNode}, nodeNum {stNode.nodeNum}")
# The returned object is a <p4.node.Node object at 0x107748eb8>, nodeNum 5
At this point we will have the tree as shown here::
# +-------1:A
# |
# |-------2:B
# 0
# | +--------4:C
# | |
# +-------3--------6:D
# |
# +--------10:G
Now we would use the companion method
:meth:`~p4.tree.Tree.reconnectSubTreeWithParent` to reconnect the subtree. For
example we can reconnect it in the same place that it was, by
specifying node 6 as the arg attachNode ::
t.reconnectSubTreeWithParent(stNode, 6)
That restores the original tree, as shown above.
The subtree can be a single leaf if you wish. In that case
the subtree is specified by the leaf, and is composed of the
leaf and its parent, including both their branches.
At the moment it does not work for nodes where the parent is
not bifurcating.
"""
gm = ['Tree.pruneSubTreeWithParent()']
rNode = self.node(specifier)
if rNode == self.root:
gm.append("The specified node is the root.")
raise P4Error(gm)
rNodeParnt = rNode.parent
if rNodeParnt.getNChildren() != 2:
# self.draw()
gm.append("The parent of the specified node must have exactly two children")
raise P4Error(gm)
if rNodeParnt == self.root:
gm.append("The parent of the specified node may not be the root")
raise P4Error(gm)
theSib = rNode.sibling # may be None
theLeftSib = rNode.leftSibling() # may be None
theGrandParent = rNodeParnt.parent
rNodeParntLeftSib = rNodeParnt.leftSibling()
rNodeParntSib = rNodeParnt.sibling
theGrandParentLeftChild = theGrandParent.leftChild
if theSib:
rNode.sibling = None
rNodeParnt.parent = None
theSib.parent = theGrandParent
if rNodeParntLeftSib:
rNodeParntLeftSib.sibling = theSib
if rNodeParntSib:
theSib.sibling = rNodeParntSib
if theGrandParentLeftChild == rNodeParnt:
theGrandParent.leftChild = theSib
elif theLeftSib:
theLeftSib.sibling = None
rNodeParnt.parent = None
theLeftSib.parent = theGrandParent
if rNodeParntLeftSib:
rNodeParntLeftSib.sibling = theLeftSib
if rNodeParntSib:
theLeftSib.sibling = rNodeParntSib
if theGrandParentLeftChild == rNodeParnt:
theGrandParent.leftChild = theLeftSib
rNodeParnt.leftChild = rNode
self.preAndPostOrderAreValid = 0
self._nTax = 0
return rNodeParnt
[docs]
def reconnectSubTreeWithParent(self, stNode, attachNode):
"""Attach subtree stNode to the branch on attachNode
This is a companion method to Tree.pruneSubTreeWithParent()
The subtree is attached on the branch on attachNode.
"""
gm = ["Tree.reconnectSubTreeWithParent()"]
myAttachNode = self.node(attachNode)
if not myAttachNode.br:
gm.append("The attachNode must have a branch")
myAttachNodeParent = myAttachNode.parent
myAttachNodeSib = myAttachNode.sibling # may be None
myAttachNodeLeftSib = myAttachNode.leftSibling() # may be None
stNodeLeftChild = stNode.leftChild # stNode has only one child
if 0:
print("myAttachNodeParent", myAttachNodeParent.nodeNum)
if myAttachNodeSib:
print("myAttachNodeSib", myAttachNodeSib.nodeNum)
else:
print("myAttachNodeSib", myAttachNodeSib)
if myAttachNodeLeftSib:
print("myAttachNodeLeftSib", myAttachNodeLeftSib.nodeNum)
else:
print("myAttachNodeLeftSib", myAttachNodeLeftSib)
print("stNodeLeftChild", stNodeLeftChild.nodeNum)
myAttachNode.parent = stNode
stNode.leftChild = myAttachNode
myAttachNode.sibling = stNodeLeftChild
stNode.parent = myAttachNodeParent
if myAttachNodeSib:
stNode.sibling = myAttachNodeSib
else:
stNode.sibling = None
if myAttachNodeLeftSib:
myAttachNodeLeftSib.sibling = stNode
else:
myAttachNodeParent.leftChild = stNode
# self.dump(node=True)
self.preAndPostOrderAreValid = 0
self._nTax = 0
[docs]
def addNodeBetweenNodes(self, specifier1, specifier2):
"""Add a node between 2 exisiting nodes, which should be parent-child.
The *specifier* can be a nodeNum, name, or node object.
Returns the new node object.
"""
gm = ['Tree.addNodeBetweenNodes()']
aNode1 = self.node(specifier1)
aNode2 = self.node(specifier2)
# aNode1 should be the parent of aNode2
if aNode1 == aNode2.parent:
pass
elif aNode1 == aNode2:
gm.append("The two specified nodes are the same.")
raise P4Error(gm)
elif aNode2 == aNode1.parent:
temp = aNode1
aNode1 = aNode2
aNode2 = temp
else:
gm.append(
"The 2 specified nodes should have a parent-child relationship")
raise P4Error(gm)
self.deleteCStuff()
hasBrLens = False
for n in self.iterNodes():
if n.br and math.fabs(n.br.len - 0.1) > 1.e-15:
hasBrLens = True
break
newNode = copy.deepcopy(aNode2)
newNode.br.len = 0.1
newNode.name = None
newNode.isLeaf = False
newNode.nodeNum = len(self.nodes)
self.nodes.append(newNode)
newNode.parent = aNode1
newNode.leftChild = aNode2
aNode2.parent = newNode
if aNode1.leftChild == aNode2:
aNode1.leftChild = newNode
else:
oldCh = aNode1.leftChild
while oldCh.sibling != aNode2:
oldCh = oldCh.sibling
oldCh.sibling = newNode
if aNode2.sibling:
newNode.sibling = aNode2.sibling
aNode2.sibling = None
if hasBrLens:
halfBrLen = aNode2.br.len / 2.0
aNode2.br.len = halfBrLen
newNode.br.len = halfBrLen
if 1:
self.preOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
self.postOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
if len(self.nodes) > 1:
self.setPreAndPostOrder()
return newNode
[docs]
def allBiRootedTrees(self):
"""Returns a Trees object containing all possible bi-rootings of self.
Self should have a root node of degree > 2, but need not be fully
resolved.
Self needs a taxNames.
"""
gm = ['Tree.allBiRootedTrees()']
if self.root.getNChildren() < 3:
gm.append("Self root should be of degree > 2.")
raise P4Error(gm)
if not self.taxNames:
gm.append("Self (ie the tree) needs to have taxNames set.")
raise P4Error(gm)
tList = []
for i in range(len(self.nodes)):
if self.nodes[i] == self.root:
pass
else:
t = self.dupe()
n = t.nodes[i]
x = t.addNodeBetweenNodes(n.parent, n)
t.reRoot(x, moveInternalName=False)
t.name = 'r%i' % n.nodeNum
tList.append(t)
# This import needs to be here --- if it is up top as usual, it leads to circular grief.
from p4.trees import Trees
tt = Trees(trees=tList, taxNames=self.taxNames)
return tt
[docs]
def ladderize(self, biggerGroupsOnBottom=True):
"""Rotate nodes for a staircase effect.
This method, in its default biggerGroupsOnBottom way, will take a
tree like this::
+---------4:A
+--------3
+---------2 +---------5:B
| |
| +--------6:C
+--------1
| | +--------8:D
| +---------7
| +--------9:E
0
|--------10:F
|
| +---------12:G
+--------11
+---------13:H
and rearranges it so that it is like ... ::
+--------10:F
|
| +---------12:G
|--------11
| +---------13:H
0
| +--------8:D
| +---------7
| | +--------9:E
+--------1
| +--------6:C
+---------2
| +---------4:A
+--------3
+---------5:B
Note that for each node, the more populated group is on the bottom,
the secondmost populated second, and so on.
To get it with the bigger groups on top, set
biggerGroupsOnBottom=False. I made the default with the bigger
groups on the bottom so that it often makes room for a scale bar.
The setting biggerGroupsOnBottom, the default here, would
equivalent to set torder=right in paup; torder=left puts the
bigger groups on the top.
"""
# self.draw()
self.root._ladderize(biggerGroupsOnBottom)
self.preAndPostOrderAreValid = 0
# self.draw()
[docs]
def randomizeTopology(self, randomBrLens=True):
gm = ["Tree.randomizeTopology()"]
if self.root.getNChildren() != 3 or not self.isFullyBifurcating():
gm.append(
"Should be a fully bifurcating tree, this week. Fix me?")
raise P4Error(gm)
if self.cTree:
self.deleteCStuff()
nTax = self.nTax
oldRoot = self.root
leaves = []
internals = []
for n in self.nodes:
if n.isLeaf:
leaves.append(n)
else:
internals.append(n)
random.shuffle(leaves)
random.shuffle(internals)
lIndx = 0
iIndx = 0
self.nodes = []
nodeNum = 0
# new root
self.root = internals[iIndx]
iIndx += 1
self.nodes.append(self.root)
self.root.parent = None
self.root.leftChild = None
self.root.sibling = None
if self.root == oldRoot:
pass
else:
oldRoot.br = self.root.br
self.root.br = None
self.root.nodeNum = nodeNum
nodeNum += 1
# add a leaf as left child to the root
n = leaves[lIndx]
lIndx += 1
self.nodes.append(n)
n.sibling = None
n.nodeNum = nodeNum
nodeNum += 1
self.root.leftChild = n
n.parent = self.root
previousNode = n
# add the rest of the leaves
while lIndx < nTax:
n = leaves[lIndx]
lIndx += 1
self.nodes.append(n)
n.sibling = None
n.nodeNum = nodeNum
nodeNum += 1
previousNode.sibling = n
previousNode = n
n.parent = self.root
# Now we have a star tree. Now add internal nodes until it is all
# resolved, which needs nTax - 3 nodes
for i in range(nTax - 3):
ssNodes = []
for n in self.nodes:
if n.sibling and n.sibling.sibling:
ssNodes.append(n)
lChild = random.choice(ssNodes)
# print "lChild = node %i" % lChild.nodeNum
# +----------1:oldLeftSib
# |
# +----------2:lChild
# 0
# +----------3:lChildSib
# |
# +----------4:oldLChildSibSib
# +----------1:oldLeftSib
# |
# | +----------3:lChild
# 0----------2(n)
# | +----------4:lChildSib
# |
# +----------5:oldLChildSibSib
n = internals[iIndx]
iIndx += 1
n.parent = None
n.sibling = None
n.leftChild = None
n.nodeNum = nodeNum
nodeNum += 1
lChildSib = lChild.sibling # guarranteed to have one
oldLChildSibSib = lChildSib.sibling # ditto
oldLeftSib = lChild.parent.leftChild # first guess ...
if oldLeftSib != lChild:
while oldLeftSib.sibling != lChild:
oldLeftSib = oldLeftSib.sibling
else:
oldLeftSib = None
if 0:
if oldLeftSib:
print("oldLeftSib = %i" % oldLeftSib.nodeNum)
else:
print("oldLeftSib = None")
print("lChildSib = %i" % lChildSib.nodeNum)
if oldLChildSibSib:
print("oldLChildSibSib = %i" % oldLChildSibSib.nodeNum)
else:
print("oldLChildSibSib = None")
if oldLeftSib:
oldLeftSib.sibling = n
else:
lChild.parent.leftChild = n
n.parent = lChild.parent
lChild.parent = n
n.leftChild = lChild
lChildSib.parent = n
n.sibling = oldLChildSibSib
lChildSib.sibling = None
self.nodes.append(n)
# self.dump(all=True)
# self.draw()
# The way it is now, the root rightmost child is always a
# leaf. Not really random, then, right? So choose a random
# internal node, and re-root it there.
# print "nTax=%i, len(t.nodes)=%i" % (nTax, len(t.nodes))
if nTax > 3:
n = self.nodes[random.randrange(nTax + 1, len(self.nodes))]
self.reRoot(n, moveInternalName=False)
# The default is to have randomBrLens, where internal nodes get
# brLens of 0.02 - 0.05, and terminal nodes get brLens of 0.2 -
# 0.5. Branch lengths are all 0.1 if randomBrLens is turned
# off.
if randomBrLens:
for n in self.nodes:
if n != self.root:
if n.isLeaf:
n.br.len = 0.02 + (random.random() * 0.48)
else:
n.br.len = 0.02 + (random.random() * 0.03)
self.preAndPostOrderAreValid = 0
# self.dump(all=True)
[docs]
def readBipartitionsFromPaupLogFile(self, thePaupLogFileName):
"""Assigns support to the tree, from the PAUP bipartitions table.
This needs to have self.taxNames set.
This is useful if you want to make a consensus tree using PAUP,
and get the support values. When you make a cons tree with PAUP,
the support values, usually bootstrap values, are unfortunately
not saved with the tree. That information is in the Bipartitions
table, which can be saved to a PAUP log file so that p4 can get
it. This method will read thePaupLogFileName and extract the
split (tree bipartition) supports, and assign those supports to self
as node.br.support's (as a float, not a string).
It also returns a hash with the split strings as keys and the
split support as values, if you need it.
"""
gm = ['Tree.readBipartitionsFromPaupLogFile()']
if not self.taxNames:
gm.append("This method needs self.taxNames.")
raise P4Error(gm)
self.checkTaxNames()
f = open(thePaupLogFileName, 'r')
while 1:
aLine = f.readline()
if not aLine:
gm.append("No Bipartitions line in %s?" % thePaupLogFileName)
raise P4Error(gm)
if aLine.startswith('Bipartitions found'):
# print aLine
break
# The splits table might be all in one piece, or it might be split
# into sections (CYCLEs). Only the last section has the Freq or
# percent. If the first section is the only section, then it is
# the LAST_CYCLE, so that we get the Freq or percent.
# Surprise-- the PAUP output contains both Freq and % support,
# unless the sum of the weights (or if there are no weights, the
# number of trees) is 100, in which case only the Freq is given.
FIRST_CYCLE = -1
MIDDLE_CYCLE = 0
LAST_CYCLE = 1
cycleType = None
thisKeyLength = None
accumulatedKeyLength = 0
nKeys = 0
keyCounter = 0
theHash = {}
theKeys = []
hasFreqOnly = None
while 1:
prevLine = aLine
aLine = f.readline()
# print "a Got Line ==>%s<==" % aLine
if not aLine:
gm.append('Unexpected end of file.')
raise P4Error(gm)
aLine = aLine.rstrip()
# print "b Got Line ==>%s<==" % aLine
if len(aLine):
if aLine[0] == '-':
# print prevLine
# print aLine
if prevLine.endswith('Freq') or prevLine.endswith('%'):
if prevLine.endswith('Freq'):
hasFreqOnly = 1
elif prevLine.endswith('%'):
hasFreqOnly = 0
cycleType = LAST_CYCLE
# print "We are now in the last cycle. hasFreqOnly=%s"
# % hasFreqOnly
splitLine = prevLine.split()
thisKeyLength = len(splitLine[0])
keyCounter = 0
elif cycleType == MIDDLE_CYCLE:
# print "We are now in a middle cycle"
thisKeyLength = len(prevLine)
keyCounter = 0
elif cycleType == FIRST_CYCLE:
gm.append('This should never happen.')
raise P4Error(gm)
else:
cycleType = FIRST_CYCLE
# print "We are now in the first cycle"
thisKeyLength = len(prevLine)
elif aLine[0] in ['.', '*']:
if cycleType in [FIRST_CYCLE, MIDDLE_CYCLE] and len(aLine) != thisKeyLength:
gm.append("Unequal key lengths?!?")
gm.append("thisKeyLength = %i" % thisKeyLength)
gm.append("%s" % aLine)
raise P4Error(gm)
if cycleType == FIRST_CYCLE:
theKeys.append(aLine)
elif cycleType == MIDDLE_CYCLE:
theKeys[keyCounter] = theKeys[keyCounter] + aLine
keyCounter = keyCounter + 1
if keyCounter > nKeys:
gm.append("Too many keys.")
raise P4Error(gm)
elif cycleType == LAST_CYCLE:
# It will usually be a line like: ...*....*. 67.83 67.9%
# But it will sometimes be a line like: ...*....*.
# 68
splitLine = aLine.split()
theKey = splitLine[0]
if hasFreqOnly:
theSupport = float(splitLine[-1])
theSupport /= 100.0
else:
# don't read the % sign
theSupport = float(splitLine[-1][:-1])
theSupport /= 100.0
if len(theKey) != thisKeyLength:
gm.append("Unequal key lengths?!?")
gm.append("thisKeyLength = %i" % thisKeyLength)
gm.append("%s" % theKey)
raise P4Error(gm)
if accumulatedKeyLength:
theKeys[keyCounter] = theKeys[keyCounter] + theKey
theHash[theKeys[keyCounter]] = theSupport
keyCounter = keyCounter + 1
if keyCounter > nKeys:
gm.append("Too many keys.")
raise P4Error(gm)
# LAST_CYCLE is also the FIRST_CYCLE, ie the table is
# in one section.
else:
theKeys.append(theKey)
theHash[theKey] = theSupport
else:
gm.append("This should never happen.")
raise P4Error(gm)
else:
pass # Skip lines with numbers
else: # a blank line
if cycleType == FIRST_CYCLE:
cycleType = MIDDLE_CYCLE
nKeys = len(theKeys)
# print "At the end of the first cycle, got %i keys" %
# nKeys
accumulatedKeyLength = thisKeyLength
elif cycleType == MIDDLE_CYCLE:
accumulatedKeyLength += thisKeyLength
elif cycleType == LAST_CYCLE:
accumulatedKeyLength += thisKeyLength
nKeys = len(theKeys)
# print "finished"
# print theKeys[0]
break
else:
pass
f.close()
if 0:
print("Finished getting split strings, and supports")
print("Got %i items in the hash" % len(theHash))
print("accumulatedKeyLength = %i" % accumulatedKeyLength)
print("nKeys = %i" % nKeys)
# self.draw()
# I will need to make splitStrings (in dot-star notation) from
# splitKeys. To do that, I can use the
# p4.func.getSplitStringFromKey(theKey, nTax) function.
self.makeSplitKeys()
for n in self.nodes:
if n != self.root:
if not n.isLeaf:
theNodeSplitString = p4.func.getSplitStringFromKey(
n.br.splitKey, self.nTax)
if theNodeSplitString in theHash:
if hasattr(n.br, 'support') and n.br.support is not None:
gm.append("Node %i already has a br.support." % n.nodeNum)
gm.append("I am refusing to clobber it with the split support.")
gm.append("Either fix the tree or fix this method.")
raise P4Error(gm)
n.br.support = float(theHash[theNodeSplitString])
return theHash
[docs]
def renameForPhylip(self, dictFName='p4_renameForPhylip_dict.py'):
"""Rename with phylip-friendly short boring names.
It saves the old names (together with the new) in a python
dictionary, in a file by default named p4_renameForPhylip_dict.py
If self does not have taxNames set, it does not write
originalNames to that file-- which may cause problems
restoring names. If you want to avoid that, be sure to set
self.taxNames before you do this method.
This method does not deal with internal node names, at all. They
are silently ignored. If they are too long for phylip, they are
still silently ignored, which might cause problems.
"""
gm = ['Tree.renameForPhylip()']
if os.path.exists(dictFName):
gm.append("The dictionary file '%s' already exists." % dictFName)
raise P4Error(gm)
d = {}
if self.taxNames:
d2 = {}
originalNames = self.taxNames[:]
for i in range(len(self.taxNames)):
oldName = self._taxNames[i]
newName = 's%i' % i
d[newName] = oldName
d2[oldName] = newName
self._taxNames[i] = newName
for n in self.iterLeavesNoRoot():
n.name = d2[n.name]
if self.root.isLeaf and self.root.name:
self.root.name = d2[self.root.name]
else:
originalNames = None
i = 0
for n in self.iterLeavesNoRoot():
oldName = n.name
newName = 's%i' % i
d[newName] = oldName
n.name = newName
i += 1
if self.root.isLeaf and self.root.name:
oldName = self.root.name
newName = 's%i' % i
d[newName] = oldName
self.root.name = newName
f = open(dictFName, 'w')
f.write("p4_renameForPhylip_originalNames = %s\np4_renameForPhylip_dict = %s\n" % (
originalNames, d))
f.close()
[docs]
def restoreNamesFromRenameForPhylip(self, dictFName='p4_renameForPhylip_dict.py'):
"""Given the dictionary file, restore proper names.
The renaming is done by the Alignment method renameForPhylip(),
which makes the dictionary file. The dictionary file is by
default named p4_renameForPhylip_dict.py
"""
gm = ["Tree.restoreNamesFromRenameForPhylip()"]
if os.path.exists(dictFName):
loc = {}
exec(open(dictFName).read(), {}, loc)
try:
p4_renameForPhylip_dict = loc['p4_renameForPhylip_dict']
p4_renameForPhylip_originalNames = loc['p4_renameForPhylip_originalNames']
except KeyError:
gm.append("Dict file %s exists, but I can't read it correctly." % dictFName)
raise P4Error(gm)
else:
gm.append("The dictionary file '%s' can't be found." % dictFName)
raise P4Error(gm)
for n in self.iterNodes():
if n.isLeaf:
if n.name in p4_renameForPhylip_dict:
n.name = p4_renameForPhylip_dict[n.name]
else:
gm.append("The dictionary does not contain a key for '%s'." % n.name)
raise P4Error(gm)
if p4_renameForPhylip_originalNames:
self.taxNames = p4_renameForPhylip_originalNames
else:
if self.taxNames:
gm.append("self.taxNames is set, and should be replaced, but")
gm.append("p4_renameForPhylip_originalNames is None. ?!?")
raise P4Error(gm)
[docs]
def restoreDupeTaxa(self, dictFileName='p4DupeSeqRenameDict.py', asMultiNames=True):
"""Restore previously removed duplicate taxa from a dict file.
The usual story would be like this: You read in your alignment and
p4 tells you that you have duplicate sequences. So you use the
Alignment method checkForDuplicateSequences() to remove them,
which makes a dictionary file, by default
'p4DupeSeqRenameDict.py', to facilitate restoration of the names.
You do your analysis on the reduced alignment, and get a tree.
Then you use the dictionary file with this method to restore all
the taxon names.
If asMultiNames is turned on, the default, then the leaf nodes are
not replicated, and the name is changed to be a long multi-name.
If asMultiNames is turned off, then the restored taxa are made to
be siblings, and the branch lengths are set to zero.
"""
gm = ['Tree.restoreDupeTaxa()']
if not os.path.isfile(dictFileName):
gm.append("Can't find dict file '%s'" % dictFileName)
raise P4Error(gm)
loc = {}
exec(open(dictFileName).read(), {}, loc)
try:
p4DupeSeqRenameDict = loc['p4DupeSeqRenameDict']
except KeyError:
gm.append(
"Can't get the dictionary named 'p4DupeSeqRenameDict' from the dict file.")
# print p4DupeSeqRenameDict
kk = p4DupeSeqRenameDict.keys()
# print kk
if asMultiNames:
for oldName in kk:
rNode = self.node(oldName)
newNames = p4DupeSeqRenameDict[oldName]
multiName = ', '.join(newNames)
rNode.name = multiName
else:
for oldName in kk:
rNode = self.node(oldName)
newNames = p4DupeSeqRenameDict[oldName]
lCh = Node()
lCh.isLeaf = 1
lCh.name = newNames[0]
lCh.br.len = 0.0
lCh.parent = rNode
lCh.nodeNum = len(self.nodes)
self.nodes.append(lCh)
rNode.isLeaf = 0
rNode.name = None
rNode.leftChild = lCh
theLeftSib = rNode.leftChild
for sibName in newNames[1:]:
n = Node()
n.parent = rNode
theLeftSib.sibling = n
n.isLeaf = 1
n.name = sibName
n.br.len = 0.0
n.nodeNum = len(self.nodes)
self.nodes.append(n)
theLeftSib = n
self.taxNames = []
self.preOrder = None
self.postOrder = None
self.setPreAndPostOrder()
self._nTax = 0
[docs]
def lineUpLeaves(self, rootToLeaf=1.0, overWriteBrLens=True):
"""Make the leaves line up, as in a cladogram.
This makes the rootToLeaf distance the same for all leaves.
If overWriteBrLens is set, then the newly calculated br.lens
replace the original br.lens. If it is not set, then the new
br.lens are placed in br.lenL, and does not over-write the
original br.lens. """
levels = 0
for n1 in self.iterLeavesNoRoot():
nodeCount = 1
p = n1.parent
while p != self.root:
p = p.parent
nodeCount += 1
if nodeCount > levels:
levels = nodeCount
# print "levels = %i" % levels
for n1 in self.iterLeavesNoRoot():
n1.lul_pos = levels
ch = n1
p = n1.parent
while p:
newPos = ch.lul_pos - 1
if not hasattr(p, 'lul_pos'):
p.lul_pos = newPos
else:
if p.lul_pos <= newPos:
pass
else:
p.lul_pos = newPos
ch = p
p = p.parent
for n in self.iterNodesNoRoot():
p = n.parent
if p == self.root:
n.br.lenL = float(n.lul_pos) / float(levels) * rootToLeaf
else:
n.br.lenL = float(n.lul_pos - p.lul_pos) / \
float(levels) * rootToLeaf
# Clean up
for n in self.iterNodes():
if hasattr(n, 'lul_pos'):
del(n.lul_pos)
if overWriteBrLens:
if n.br and hasattr(n.br, 'lenL'):
n.br.len = n.br.lenL
del(n.br.lenL)
[docs]
def removeEverythingExceptCladeAtNode(self, specifier):
"""Like it says. Leaves a tree with a root-on-a-stick."""
theNode = self.node(specifier)
if theNode == self.root:
return
self.reRoot(theNode.parent, checkBiRoot=False)
toRemoves = []
for ch in self.root.iterChildren():
if ch != theNode:
toRemoves.append(ch)
for ch in toRemoves:
self.removeNode(
ch, alsoRemoveSingleChildParentNode=False, alsoRemoveBiRoot=False)
[docs]
def dupeSubTree(self, dupeNodeSpecifier, up, doBrLens=True, doSupport=True):
"""Makes and returns a new Tree object, duping part of self.
The dupeNodeSpecifier can be a node name, node number, or node
object.
Arg 'up' should be True or False.
The returned subtree has a root-on-a-stick.
BrLens are not duped -- brLens are default in the new subtree.
So if the tree is like this::
+---------2:A
+--------1
| +---------3:B
|
0--------4:C
|
| +---------6:D
+--------5:dupeNode
| +--------8:E
+---------7
+--------9:F
Then the subtree from node 5 up is::
+---------2:D
subTreeRoot:0--------1:dupeNode
| +--------4:E
+---------3
+--------5:F
and the subtree from node 5 down is::
+--------2:A
+---------1
dupeNode:0--------5 +--------3:B
|
+---------4:C
"""
dupeNode = self.node(dupeNodeSpecifier)
if dupeNode == self.root and not up:
print("The dupeNode is self.root, and you want a subtree below that?!?")
sys.exit()
from p4.tree import Tree
st = Tree()
if up:
n = Node()
n.br = None
n.nodeNum = 0
n.name = 'subTreeRoot'
n.isLeaf = 1
st.root = n
st.nodes.append(n)
if dupeNode.isLeaf:
# its easy -- just dupe the leaf
n = Node()
if doBrLens:
n.br.len = dupeNode.br.len
if doSupport:
n.br.support = dupeNode.br.support
n.parent = st.root
st.root.leftChild = n
n.name = dupeNode.name
n.isLeaf = 1
st.nodes.append(n)
else:
i = 1 # nodeNum counter
nodeNumDict = {}
for selfNode in dupeNode.iterPreOrder():
n = Node()
if doBrLens:
n.br.len = selfNode.br.len
if doSupport and hasattr(selfNode.br, 'support'):
n.br.support = selfNode.br.support
n.nodeNum = i
nodeNumDict[selfNode.nodeNum] = i
n.isLeaf = selfNode.isLeaf
n.name = selfNode.name
i += 1
st.nodes.append(n)
for selfNode in dupeNode.iterPreOrder():
if selfNode.leftChild:
st.nodes[nodeNumDict[selfNode.nodeNum]].leftChild = \
st.nodes[nodeNumDict[selfNode.leftChild.nodeNum]]
if selfNode == dupeNode:
pass # skip parent and sibling
else:
# parents and siblings. There will always be a parent.
st.nodes[nodeNumDict[selfNode.nodeNum]].parent = \
st.nodes[nodeNumDict[selfNode.parent.nodeNum]]
if selfNode.sibling:
st.nodes[nodeNumDict[selfNode.nodeNum]].sibling = \
st.nodes[nodeNumDict[selfNode.sibling.nodeNum]]
st.root.leftChild = st.nodes[nodeNumDict[dupeNode.nodeNum]]
st.root.leftChild.parent = st.root
else: # down
i = 0 # nodeNum counter
nodeNumDict = {}
mostRecentDown = None
for selfNode in dupeNode.iterDown():
n = Node()
if selfNode != self.root:
if doBrLens:
n.br.len = selfNode.br.len
if not selfNode.isLeaf and doSupport:
if hasattr(selfNode.br, 'support'):
n.br.support = selfNode.br.support
else:
print("Warning: dupeSubTree() doSupport is turned on, but node %i has no support." % selfNode.nodeNum)
n.nodeNum = i
nodeNumDict[selfNode.nodeNum] = i
n.isLeaf = selfNode.isLeaf
n.name = selfNode.name
i += 1
st.nodes.append(n)
for selfNode in dupeNode.iterDown():
# parents and siblings.
if selfNode.parent:
st.nodes[nodeNumDict[selfNode.nodeNum]].parent = \
st.nodes[nodeNumDict[selfNode.parent.nodeNum]]
else:
# Its the root
st.root = st.nodes[nodeNumDict[selfNode.nodeNum]]
st.root.br = None
if selfNode.sibling:
st.nodes[nodeNumDict[selfNode.nodeNum]].sibling = \
st.nodes[nodeNumDict[selfNode.sibling.nodeNum]]
if selfNode == dupeNode:
st.nodes[nodeNumDict[dupeNode.nodeNum]].leftChild = None
st.nodes[nodeNumDict[dupeNode.nodeNum]].isLeaf = 1
else:
if selfNode.leftChild:
st.nodes[nodeNumDict[selfNode.nodeNum]].leftChild = \
st.nodes[nodeNumDict[selfNode.leftChild.nodeNum]]
st.reRoot(nodeNumDict[dupeNode.nodeNum], moveInternalName=False)
st.root.isLeaf = 1
if not st.root.name:
st.root.name = 'subTreeRoot'
st.setPreAndPostOrder()
return st
[docs]
def addSubTree(self, selfNode, theSubTree, subTreeTaxNames=None):
"""Add a subtree to a tree.
The nodes from theSubTree are added to self.nodes, and theSubTree
is deleted.
If subTreeTaxNames is provided, fine, but if not this method can
find them. Providing them saves a bit of time, I assume.
"""
# subTreeRootNode:0-------1:oldSubTreeRootNodeLeftChild
# +-------1:A
# oldSelfNodeParent:0
# +-------2:selfNode
# +--------1:A
# oldSelfNodeParent:0
# | +--------2:selfNode
# +--------3:subTreeRootNode
# +--------4:oldSubTreeRootNodeLeftChild
# =========================================
# +-------1:selfNode
# oldSelfNodeParent:0
# +-------2:A
# +--------1:selfNode
# +--------3:subTreeRootNode
# oldSelfNodeParent:0 +--------4:oldSubTreeRootNodeLeftChild
# |
# +--------2:A
assert selfNode in self.nodes
assert selfNode.parent
# its a root on a stick
assert theSubTree.root.leftChild and not theSubTree.root.leftChild.sibling
if not subTreeTaxNames:
subTreeTaxNames = [n.name for n in theSubTree.iterLeavesNoRoot()]
oldSelfNodeParent = selfNode.parent
subTreeRootNode = theSubTree.root
theSubTree.root = None
oldSubTreeRootNodeLeftChild = subTreeRootNode.leftChild
subTreeRootNode.parent = oldSelfNodeParent
subTreeRootNode.leftChild = selfNode
selfNode.parent = subTreeRootNode
if oldSelfNodeParent.leftChild == selfNode:
oldSelfNodeParent.leftChild = subTreeRootNode
else:
oldCh = oldSelfNodeParent.leftChild
while oldCh.sibling != selfNode:
oldCh = oldCh.sibling
oldCh.sibling = subTreeRootNode
if selfNode.sibling:
subTreeRootNode.sibling = selfNode.sibling
selfNode.sibling = oldSubTreeRootNodeLeftChild
subTreeRootNode.isLeaf = 0
nSelfNodes = len(self.nodes)
self.nodes += theSubTree.nodes
theSubTree.nodes = []
selfNode.br.len = 0.1
subTreeRootNode.br = NodeBranch()
subTreeRootNode.br.len = 0.1
self.taxNames += theSubTree.taxNames
for i in range(nSelfNodes, len(self.nodes)):
n = self.nodes[i]
n.nodeNum = i
# print
# for n in self.nodes:
# print n.nodeNum
# print "self.taxNames is %s" % self.taxNames
# print "subTreeTaxNames %s" % subTreeTaxNames
if self.taxNames:
self._taxNames += subTreeTaxNames
if self._nTax:
self._nTax += len(subTreeTaxNames)
self.preAndPostOrderAreValid = False
self.preOrder = None
self.postOrder = None
self.setPreAndPostOrder()
del(theSubTree)
[docs]
def addLeaf(self, attachmentNode, taxName):
"""Add a leaf to a tree
The leaf is added to the branch leading to the attachmentNode.
The attachmentNode should be a Node, not a node number or name.
A new node is made on that branch, so actually two nodes are added
to the tree. The new leaf node is returned.
"""
assert attachmentNode in self.nodes
aNode = attachmentNode
# self.draw()
# print "attachmentNode is %i" % aNode.nodeNum
#bNode = self.addNodeBetweenNodes(aNode, aNode.parent)
aNodeP = attachmentNode.parent
bNode = Node()
bNode.nodeNum = len(self.nodes)
self.nodes.append(bNode)
bNode.parent = aNodeP
bNode.leftChild = aNode
aNode.parent = bNode
if aNodeP.leftChild == aNode:
aNodeP.leftChild = bNode
else:
oldCh = aNodeP.leftChild
while oldCh.sibling != aNode:
oldCh = oldCh.sibling
oldCh.sibling = bNode
if aNode.sibling:
bNode.sibling = aNode.sibling
aNode.sibling = None
aNode.br.len = 0.1
bNode.br.len = 0.1
# self.draw()
n = Node()
n.nodeNum = len(self.nodes)
n.name = taxName
n.isLeaf = 1
oldSibling = attachmentNode.sibling
n.parent = bNode
aNode.sibling = n
n.sibling = oldSibling
self.nodes.append(n)
if self.taxNames:
# taxnames can be shared among trees, so check whether it has already been added
if n.name not in self.taxNames:
self.taxNames.append(n.name)
self.preAndPostOrderAreValid = False
self.preOrder = None
self.postOrder = None
self.getPreAndPostOrderAboveRoot()
# self.dump(node=True)
# self.draw()
if self._nTax:
self._nTax += 1
return n
[docs]
def addSibLeaf(self, attachmentNode, taxName):
"""Add a leaf to a tree as a sibling, by specifying its parent.
The leaf is added so that its parent is the specified node (ie
attachmentNode), adding the node as a rightmost child to that
parent. The attachmentNode should not be a leaf -- it must have
children nodes, to which the new leaf can be added as a sibling.
The new node is returned.
"""
assert attachmentNode in self.nodes
assert not attachmentNode.isLeaf
aNode = attachmentNode
# self.draw()
# print "attachmentNode is %i" % aNode.nodeNum
n = Node()
n.nodeNum = len(self.nodes)
n.name = taxName
n.isLeaf = 1
n.br.len = 0.1
rtMostCh = aNode.rightmostChild()
rtMostCh.sibling = n
n.sibling = None
n.parent = aNode
self.nodes.append(n)
if self.taxNames:
# taxnames can be shared among trees, so check whether it has already been added
if n.name not in self.taxNames:
self.taxNames.append(n.name)
self.preAndPostOrderAreValid = False
self.preOrder = None
self.postOrder = None
self.getPreAndPostOrderAboveRoot()
# self.dump(node=True)
# self.draw()
if self._nTax:
self._nTax += 1
return n
[docs]
def subTreeIsFullyBifurcating(self, theNode, up=True):
"""Is theNode and everything above it (or below it) bifurcating?
Arg *up* says whether its above or below.
"""
# don't use getNChildren() -- too slow!
assert theNode != self.root
if up:
# Includes theNode, which we want
for n in theNode.iterInternals():
# if n.getNChildren() != 2:
# return False
if n.leftChild and n.leftChild.sibling:
if n.leftChild.sibling.sibling:
return False
else:
return False
return True
else:
# Includes theNode, which we do not want
for n in theNode.iterDown():
if n != theNode:
if not n.isLeaf:
if n == self.root:
# if n.getNChildren() != 3:
# return False
if n.leftChild and n.leftChild.sibling and n.leftChild.sibling.sibling:
if n.leftChild.sibling.sibling.sibling:
return False
else:
return False
else:
# if n.getNChildren() != 2:
# return False
if n.leftChild and n.leftChild.sibling:
if n.leftChild.sibling.sibling:
return False
else:
return False
return True
[docs]
def nni(self, upperNodeSpec=None):
"""Simple nearest-neighbor interchange.
You specify an 'upper' node, via an upperNodeSpec, which as
usual can be a node name, node number, or node object. If you
don't specify something, a random node will be chosen for you.
(This latter option might be a little slow if you are doing
many of them, as it uses iterInternalsNoRoot(), but mostly it
should be fast enough).
The upper node has a parent -- the 'lower' node. One subtree
from the upper node and one subtree from the lower node are
exchanged. Both subtrees are chosen randomly.
This works on biRooted trees also, preserving the biRoot.
Neighbours are not allowed to be on both sides the the
bifurcating root, and so exchanges are only allowed on one
side of the tree or the other.
"""
gm = ["Tree.nni()"]
if upperNodeSpec:
# This makes sure that upperNode is part of self.
upperNode = self.node(upperNodeSpec)
else:
# This allows candidates adjacent to the root, including the biRoot
#candidates = [n for n in self.iterInternalsNoRoot()]
# disallow candidates adjacent to the biRoot
candidates = []
disallowed = []
selfIsBiRooted = self.isBiRoot()
if selfIsBiRooted:
disallowed = [self.root.leftChild, self.root.leftChild.sibling]
for n in self.iterInternalsNoRoot():
if selfIsBiRooted:
if n in disallowed:
pass
else:
candidates.append(n)
else:
candidates.append(n)
if not candidates:
self.dump()
self.draw()
gm.append("No internal nodes?")
raise P4Error(gm)
upperNode = random.choice(candidates)
#print(f"upperNode is {upperNode.nodeNum}")
# Want the upperNode to have at least 2 children
upperChildren = [n for n in upperNode.iterChildren()]
if len(upperChildren) < 2:
gm.append("upperNode needs to have at least 2 children.")
raise P4Error(gm)
if upperNode.parent:
lowerNode = upperNode.parent
else:
gm.append("upperNode needs to have a parent node.")
raise P4Error(gm)
lowerChildren = [n for n in lowerNode.iterChildren() if n != upperNode]
if 0:
if lowerNode.parent:
if len(lowerChildren) < 1:
gm.append("The lower node has a parent.")
gm.append(
"It needs at least one more child besides the upperNode.")
raise P4Error(gm)
else:
if len(lowerChildren) < 2:
gm.append("The lower node does not have a parent.")
gm.append(
"It needs at least 2 children besides the upperNode.")
raise P4Error(gm)
if len(lowerChildren) < 1:
gm.append(
"The lower node needs at least one more child besides the upperNode.")
raise P4Error(gm)
upperSubTreeNode = random.choice(upperChildren)
lowerSubTreeNode = random.choice(lowerChildren)
upperSubTree = self.pruneSubTreeWithoutParent(
upperSubTreeNode, allowSingleChildNode=True)
lowerSubTree = self.pruneSubTreeWithoutParent(
lowerSubTreeNode, allowSingleChildNode=True)
self.reconnectSubTreeWithoutParent(upperSubTree, lowerNode)
self.reconnectSubTreeWithoutParent(lowerSubTree, upperNode)
self.setPreAndPostOrder()
[docs]
def nni2(self, upperSubTreeNode, lowerSubTreeNode):
"""Simple nearest-neighbor interchange, with specified nodes to interchange.
You specify an 'upper' node, via upperSubTreeNode, which as usual can be
a node name, node number, or node object. You also specify a 'lower'
node, via lowerSubTreeNode.
The grand-parent of the upperSubTreeNode must exist and must be the
parent of the lowerSubTreeNode.
This works on biRooted trees also, preserving the biRoot.
"""
gm = ["Tree.nni2()"]
# This makes sure that the two nodes are part of self.
upperSubTreeNode = self.node(upperSubTreeNode)
lowerSubTreeNode = self.node(lowerSubTreeNode)
assert upperSubTreeNode.parent
upperNode = upperSubTreeNode.parent
assert upperSubTreeNode.parent.parent
lowerNode = upperSubTreeNode.parent.parent
assert lowerSubTreeNode.parent
assert upperSubTreeNode.parent.parent == lowerSubTreeNode.parent
upperSubTree = self.pruneSubTreeWithoutParent(
upperSubTreeNode, allowSingleChildNode=True)
lowerSubTree = self.pruneSubTreeWithoutParent(
lowerSubTreeNode, allowSingleChildNode=True)
self.reconnectSubTreeWithoutParent(upperSubTree, lowerNode)
self.reconnectSubTreeWithoutParent(lowerSubTree, upperNode)
self.setPreAndPostOrder()
[docs]
def checkThatAllSelfNodesAreInTheTree(self, verbose=False, andRemoveThem=False):
"""Check that all self.nodes are actually part of the tree.
Arg *andRemoveThem* will remove those nodes, renumber the nodes,
and reset pre- and postOrder, and return None
If *andRemoveThem* is not set (the default is not set) then this
method returns the list of nodes that are in self.nodes but not in
the tree.
"""
selfNodesSet = set(self.nodes)
treeSet = set([n for n in self.iterNodes()])
inSelfNodesButNotInTree = list(selfNodesSet.difference(treeSet))
if verbose:
if inSelfNodesButNotInTree:
print("These nodes are in self.nodes, but not part of the tree.")
for n in inSelfNodesButNotInTree:
print(n.nodeNum)
else:
print("All nodes in self.nodes are also in the tree.")
if inSelfNodesButNotInTree and andRemoveThem:
for n in inSelfNodesButNotInTree:
self.nodes.remove(n)
for i in range(len(self.nodes)):
self.nodes[i].nodeNum = i
self.preOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
self.postOrder = numpy.array(
[var.NO_ORDER] * len(self.nodes), numpy.int32)
if len(self.nodes) > 1:
self.setPreAndPostOrder()
return
return list(inSelfNodesButNotInTree)
[docs]
def spr(self, pruneNode=None, above=True, graftNode=None):
"""Subtree pruning and reconnection.
See also the Tree.randomSpr() method. It uses this method to do a
random spr move.
This only works on fully bifurcating trees. Doing spr moves would
tend to break up polytomies anyway; pruning subtrees from a
polytomy would require creation of new nodes.
The subtree to be pruned might be pointing up or pointing down
from a specified node. If the subtree is pointing up, the subtree
to be pruned is specified by the appropriate child of the root of
the subtree; the subtree would have a root-on-a-stick (Is
monofurcating a proper word?) with the subtree root's single child
being the specified node. If the subtree is pointing down, then
the tree is re-rooted to the specified node to allow pruning of
the subtree, now above the specified node, with the specified node
as the root, including the subtree with the pre-re-rooting parent
of the specified node.
I'll draw that out. Lets say we want to prune the subtree below
node 2 in this tree. That would include nodes 0, 1, 2, and 7. ::
+--------1:A
|
| +---------3:B
|--------2
0 | +--------5:C
| +---------4
| +--------6:D
|
+--------7:E
The way it is done in this method is to re-root at node 2, which
is the specified node. Then the subtree including the
pre-re-rooting parent of the specified node, ie node 0, is pruned.
::
+--------3:B
|
| +--------5:C
2--------4
| +--------6:D
|
| +--------1:A
+--------0
+--------7:E
"""
gm = ["Tree.spr()"]
# This is only for fully bifurcating trees.
if not self.isFullyBifurcating():
gm.append("This method is only for fully bifurcating trees.")
raise P4Error(gm)
pnNode = self.node(pruneNode)
grNode = self.node(graftNode)
# self.draw()
self.deleteCStuff()
if not above:
preReRootingParent = pnNode.parent
self.reRoot(pnNode)
pnNode = preReRootingParent
# Check for silliness
assert pnNode != self.root
assert grNode != self.root
assert pnNode != grNode
subTreeNodes = [n for n in pnNode.iterPreOrder()]
pnNodeParnt = pnNode.parent
subTreeNodes.append(pnNodeParnt)
# print [n.nodeNum for n in subTreeNodes]
if grNode in subTreeNodes:
if above:
gm.append("grNode %i is part of the pruned subtree from %i-%i up. No workee!" % (
grNode.nodeNum, pnNodeParnt.nodeNum, pnNode.nodeNum))
else:
gm.append("grNode %i is part of the pruned subtree below %i-%i. No workee!" % (
grNode.nodeNum, pnNodeParnt.nodeNum, pnNode.nodeNum))
raise P4Error(gm)
# Prune it from the tree.
if pnNodeParnt == self.root:
if grNode.parent == self.root: # as well,
gm.append(
"prune node and graft node both have root as parent -- ie same origin and destination.")
raise P4Error(gm)
# Check if removal of the subtree will result in only 2 taxa
singles = 0
for ch in self.root.iterChildren():
if ch != pnNode:
if not ch.leftChild:
singles += 1
if singles == 2:
gm.append(
"Removing subtree at %i will leave only 2 taxa." % pnNode.nodeNum)
raise P4Error(gm)
newRoot = None
for ch in self.root.iterChildren():
if ch != pnNode:
if ch.leftChild:
newRoot = ch
break
# print "rerooting to node %i" % newRoot.nodeNum
self.reRoot(newRoot)
# self.draw()
# pnNodeParnt is not the root, and so we can be sure pnNodeParnt has a parent.
# the pnNode is the left, middle, or right child of the parent
if pnNodeParnt.leftChild == pnNode:
# its the left child
# remove the pnNodeParnt along with the subtree
# (pnNodeParnt.parent.leftChild, (pnNode, pnNode.sibling)pnNodeParnt, pnNodeParnt.sibling)pnNodeParnt.parent;
# +----------1:pnNodeParnt.parent.leftChild
# |
# | +-----------3:pnNode
# pnNodeParnt.parent:0----------2:pnNodeParnt
# | +-----------4:pnNode.sibling
# |
# +----------5:pnNodeParnt.sibling
pnNode.sibling.parent = pnNodeParnt.parent
if pnNodeParnt.parent.leftChild == pnNodeParnt:
pnNodeParnt.parent.leftChild = pnNode.sibling
elif pnNodeParnt.parent.leftChild.sibling == pnNodeParnt:
pnNodeParnt.parent.leftChild.sibling = pnNode.sibling
# if pnNodeParnt.parent is the root, it can have 3 children, and
# maybe this ...
else:
pnNodeParnt.parent.leftChild.sibling.sibling = pnNode.sibling
pnNode.sibling.sibling = pnNodeParnt.sibling
#pnNode.sibling = None
#pnNodeParnt.parnt = None
else:
# its a right child
# (pnNodeParnt.parent.leftChild, (pnNodeLeftSib, pnNode)pnNodeParnt, pnNodeParnt.sibling)pnNodeParnt.parent;
# +-----------1:pnNodeParnt.parent.leftChild
# |
# | +-----------3:pnNodeLeftSib
# pnNodeParnt.parent:0-----------2:pnNodeParnt
# | +-----------4:pnNode
# |
# +-----------5:pnNodeParnt.sibling
pnNodeLeftSib = pnNodeParnt.leftChild
pnNodeLeftSib.parent = pnNodeParnt.parent
pnNodeLeftSib.sibling = pnNodeParnt.sibling
if pnNodeParnt.parent.leftChild == pnNodeParnt:
pnNodeParnt.parent.leftChild = pnNodeLeftSib
elif pnNodeParnt.parent.leftChild.sibling == pnNodeParnt:
pnNodeParnt.parent.leftChild.sibling = pnNodeLeftSib
# if pnNodeParnt.parent is the root, it can have 3 children, and
# maybe this ...
else:
pnNodeParnt.parent.leftChild.sibling.sibling = pnNodeLeftSib
# To look at the pruned tree, before grafting ...
if 0:
print("removal of subtree at %i-%i gives .." % (pnNodeParnt.nodeNum, pnNode.nodeNum))
self._nTax = 0
self.preAndPostOrderAreValid = 0
# This won't work unless preAndPostOrderAreValid set to 0
self.draw()
# Now graft it back on ...
# (grNodeParnt.leftChild, grNode, grNode.sibling)grNodeParnt;
# +-------1:grNodeParnt.leftChild
# |
# grNodeParnt:0-------2:grNode
# |
# +-------3:grNode.sibling
# (grNode, grNode.sibling)grNodeParnt;
# +-------1:grNode
# grNodeParnt:0
# +-------2:grNode.sibling
# (grNodeParnt.leftChild, grNodeParnt.leftChild.sibling, grNode)grNodeParnt;
# +-------1:grNodeParnt.leftChild
# |
# grNodeParnt:0-------2:grNodeParnt.leftChild.sibling
# |
# +-------3:grNode
grNodeParnt = grNode.parent
pnNodeParnt.parent = grNodeParnt
grNode.parent = pnNodeParnt
pnNodeParnt.sibling = grNode.sibling
grNode.sibling = pnNode
pnNode.sibling = None
grNode.sibling = pnNode
pnNodeParnt.leftChild = grNode
if grNodeParnt.leftChild == grNode:
grNodeParnt.leftChild = pnNodeParnt
elif grNodeParnt.leftChild.sibling == grNode:
grNodeParnt.leftChild.sibling = pnNodeParnt
else:
grNodeParnt.leftChild.sibling.sibling = pnNodeParnt
# To look at the tree after grafting ...
if 0:
print("grafting the subtree at grNode %i gives ..." % grNode.nodeNum)
self._nTax = 0
self.preAndPostOrderAreValid = 0
# This won't work unless preAndPostOrderAreValid set to 0
self.draw()
self.preAndPostOrderAreValid = 0
self.setPreAndPostOrder()
[docs]
def randomSpr(self):
"""Do a random spr move.
"""
myAbove = random.choice([True, False])
tNodes = [n for n in self.iterNodesNoRoot()]
while 1:
try:
pNode = random.choice(tNodes)
except IndexError:
# we have run out of choices, all were unsuitable.
return
tNodes.remove(pNode)
if pNode == self.root:
continue
if myAbove:
# iterDown() will return pNode, the root, and whatever else.
# If whatever else is only 2 nodes, it won't work.
nodesDown = [n2 for n2 in pNode.iterDown()]
if len(nodesDown) <= 4:
continue
else:
# iterPostOrder() ends with the pNode. We need at least the
# pNode and 3 others, so total 3 is too few.
nodesUp = [n2 for n2 in pNode.iterPostOrder()]
# self.draw()
# print "--", pNode.nodeNum, [n2.nodeNum for n2 in nodesUp]
if len(nodesUp) <= 3:
continue
break
if myAbove:
#pNode = self.node(pNNum)
possibles = [n2.nodeNum for n2 in pNode.iterDown()
if n2 is not pNode
and n2 is not pNode.parent
and n2.parent is not pNode.parent]
else:
#pNode = self.node(pNNum)
possibles = [n2.nodeNum for n2 in pNode.iterPreOrder()
if n2.parent is not pNode and
pNode is not n2]
if 0:
self.draw()
print("above=%s, pNode %i, " % (myAbove, pNode.nodeNum), subtreeNodeNums)
print(possibles)
sys.exit()
safety = 0
giveUp = 20
while 1:
safety += 1
if safety >= giveUp:
break
gNNum = random.choice(possibles)
if gNNum == pNode.nodeNum:
continue
if gNNum == self.root.nodeNum:
continue
# print "===", "pNode=%i" % pNode.nodeNum, "gNNum=%i" % gNNum,
# subtreeNodeNums
if pNode.parent == self.root and self.node(gNNum).parent == self.root:
continue
if not myAbove and self.node(gNNum).parent == pNode:
continue
break
if safety >= giveUp:
print("pNode is %i, above=%s" % (pNode.nodeNum, myAbove))
self.draw()
raise P4Error
# print("randomSpr() pruneNum %i, above=%s, graftNum %i" % (pNode.nodeNum,myAbove, gNNum))
self.spr(pruneNode=pNode, above=myAbove, graftNode=gNNum)
[docs]
def resolvePolytomyAtNode(self, theNode, resolution=2):
"""Resolve the polytomy at theNode
Resolve randomly, by joining up children with new nodes.
After resolution, the theNode should have arg resolution children.
"""
if not theNode.leftChild:
return
children = []
p = theNode.leftChild
while p:
children.append(p)
p = p.sibling
while len(children) > resolution:
rNode = random.choice(children)
children.remove(rNode)
# print(f" about to prune subtree at node {rNode.nodeNum}")
rSubtree = self.pruneSubTreeWithoutParent(rNode)
otherNode = random.choice(children)
nBetween = self.addNodeBetweenNodes(theNode, otherNode)
self.reconnectSubTreeWithoutParent(rSubtree, nBetween)
self.preAndPostOrderAreValid = 0
self.setPreAndPostOrder()
# self.draw()
children = []
p = theNode.leftChild
while p:
children.append(p)
p = p.sibling
[docs]
def resolve(self, biRoot=False):
"""Randomly resolve all polytomies
It does it in postOrder, calling Tree.resolvePolytomyAtNode() on
internal nodes.
It will work on a partially resolved tree, or on a comb tree.
If biRoot is set to False, the default, then the final resolution
will leave a triRoot'ed root. If biRoot is set to True, then the
final resolution will leave a biRoot'ed tree.
"""
self.setPreAndPostOrder()
for n in self.iterInternalsNoRootPostOrder():
self.resolvePolytomyAtNode(n, resolution=2)
if biRoot:
myRes = 2
else:
myRes = 3
self.resolvePolytomyAtNode(self.root, resolution=myRes)
# resolvePolytomyAtNode() does self.setPreAndPostOrder()
################################################# write
#################################################
[docs]
def patristicDistanceMatrix(self):
"""Matrix of distances along tree path.
This method sums the branch lengths between each pair of taxa, and
puts the result in a DistanceMatrix object, which is returned.
Self.taxNames is required.
"""
gm = ['Tree.patristicDistanceMatrix()']
if not self.taxNames:
gm.append("No taxNames.")
raise P4Error(gm)
# The tree will be rearranged, so make a copy to play with, so
# self is undisturbed.
t = self.dupe()
d = DistanceMatrix()
d.names = self.taxNames
d.dim = len(d.names)
# print d.names
d.matrix = []
for i in range(d.dim):
d.matrix.append([0.0] * d.dim)
for i in range(d.dim):
n1 = t.node(d.names[i])
t.reRoot(n1, moveInternalName=False)
for j in range(i + 1, d.dim):
n2 = t.node(d.names[j])
# sum up distances between n1 and n2
sum = n2.br.len
n2 = n2.parent
while n2 != n1:
sum = sum + n2.br.len
n2 = n2.parent
# print "Dist from %s to %s is %f" % (d.names[i], d.names[j],
# sum)
d.matrix[i][j] = sum
d.matrix[j][i] = sum
return d
[docs]
def tPickle(self, fName=None):
"""Pickle self to a file with a 'p4_tPickle' suffix.
If there is an attached Data object, it is not pickled. If there
is an attached model object, it is pickled. Pointers to c-structs
are not pickled.
If fName is supplied, the file name becomes fName.p4_tPickle,
unless fName already ends with .p4_tPickle. If fName is not
supplied, self.name is used in fName's place. If neither is
supplied, the pid is used as fName.
If a file with the chosen name already exists, it is silently
over-written!
p4 can read a p4_tPickle file from the command line or using the
read() function, as usual.
(This would not be a good option for long-term storage, because if
you upgrade p4 and the p4 Classes change a lot then it may become
impossible to unpickle it. If that happens, you can use the old
version of p4 to unpickle.)
"""
suffix = '.p4_tPickle'
if not self.name and not fName:
fName = '%s' % os.getpid()
if fName:
if fName.endswith(suffix):
fN = fName
else:
fN = fName + suffix
elif self.name:
if self.name.endswith(suffix):
fN = self.name
else:
fN = self.name + suffix
f = open(fN, 'wb')
pickle.dump(self.dupe(), f, pickle.HIGHEST_PROTOCOL)
# pickle.dump(self, f, 1) # Don't do this -- has pointers that would
# not have been malloc'ed! And data!
f.close()
[docs]
def writeNexus(self, fName=None, append=0, writeTaxaBlockIfTaxNamesIsSet=1, message=None):
"""Write the tree out in Nexus format, in a trees block.
If fName is None, the default, it is written to sys.stdout.
#NEXUS is written unless we are appending-- set append=1.
If you want to write with a translation, use a Trees object.
"""
gm = ['Tree.writeNexus()']
if fName == None or fName == sys.stdout:
f = sys.stdout
if not append:
f.write('#NEXUS\n\n')
else:
if append:
if os.path.isfile(fName):
try:
f = open(fName, 'a')
except IOError:
gm.append("Can't open %s for appending." % fName)
raise P4Error(gm)
else:
if 0:
print("Tree.writeNexus()")
print(" 'append' is requested,")
print(" but '%s' is not a regular file (doesn't exist?)." \
% fName)
print(" Writing to a new file instead.")
try:
f = open(fName, 'w')
f.write('#NEXUS\n\n')
except IOError:
gm.append("Can't open %s for writing." % fName)
raise P4Error(gm)
else:
try:
f = open(fName, 'w')
f.write('#NEXUS\n\n')
except IOError:
gm.append("Can't open %s for writing." % fName)
raise P4Error(gm)
if writeTaxaBlockIfTaxNamesIsSet and self.taxNames:
f.write('begin taxa;\n')
f.write(' dimensions ntax=%s;\n' % self.nTax)
f.write(' taxlabels')
for i in self.taxNames:
f.write(' %s' % p4.func.nexusFixNameIfQuotesAreNeeded(i))
f.write(';\nend;\n\n')
f.write('begin trees;\n')
if message:
f.write(' [%s\n ]\n' % message)
if self.logLike:
f.write(' [logLike for tree %s is %f]\n' %
(self.name, self.logLike))
f.write(' tree %s = [&U] ' %
p4.func.nexusFixNameIfQuotesAreNeeded(self.name))
if self.recipWeight:
if self.recipWeight == 1:
f.write('[&W 1] ')
else:
f.write('[&W 1/%i] ' % self.recipWeight)
if hasattr(self, 'weight'):
f.write('[&W %f] ' % self.weight)
self.writeNewick(f)
f.write('end;\n\n')
if f != sys.stdout:
f.close()
[docs]
def write(self):
"""This writes out the Newick tree description to sys.stdout."""
self.writeNewick(sys.stdout)
[docs]
def writePhylip(self, fName=None, withTranslation=0, translationHash=None, doMcmcCommandComments=0):
"""Write the tree in Phylip or Newick format.
(This method is just a dupe of writeNewick(). Without the
'toString' or 'append' args.)
fName may also be an open file object.
"""
self.writeNewick(
fName, withTranslation, translationHash, doMcmcCommandComments)
[docs]
def writeNewick(self, fName=None, withTranslation=0, translationHash=None, doMcmcCommandComments=0, toString=False, append=False, spaceAfterComma=True):
"""Write the tree in Newick, aka Phylip, format.
This is done in a Nexus-oriented way. If taxNames have spaces or
odd characters, they are single-quoted. There is no restriction
on the length of the taxon names. A translationHash can be used.
fName may also be an open file object.
If 'toString' is turned on, then 'fName' should be None, and a Newick
representation of the tree is returned as a string.
The method 'writePhylip()' is the same as this, with fewer arguments.
"""
gm = ['Tree.writeNewick()']
sList = []
if withTranslation and not translationHash:
gm.append("No translationHash.")
raise P4Error(gm)
if fName and toString:
gm.append(
"You cannot write to a file and string at the same time.")
raise P4Error(gm)
if doMcmcCommandComments:
if not self.model:
gm.append("No model attached to tree.")
gm.append("Set doMcmcCommandComments=0")
raise P4Error(gm)
# print 'self.preAndPostOrderAreValid = %s' %
# self.preAndPostOrderAreValid
if not self.preAndPostOrderAreValid:
self.setPreAndPostOrder()
# print "self.preOrder = %s" % self.preOrder
# Don't count un-used nodes.
nNodes = len([n for n in self.iterNodes()])
# print "nNodes = %i" % nNodes
if nNodes == 1:
# print "Single node. isLeaf=%s, name=%s" % (self.root.isLeaf,
# self.root.name)
if self.root.isLeaf:
if withTranslation:
sList.append('%s' % translationHash[self.root.name])
elif self.root.name:
sList.append(
'%s' % p4.func.nexusFixNameIfQuotesAreNeeded(self.root.name))
else:
sList.append('()')
else:
# Will this ever happen?
gm.append(
"Something is wrong. There is only one node, and it is not terminal.")
raise P4Error(gm)
elif nNodes > 1:
writeBrLens = 0
for n in self.iterNodesNoRoot():
if n.br.len != 0.1:
writeBrLens = 1
break
stack = []
for n in self.iterPreOrder():
stack.append(n)
if n.leftChild:
sList.append('(')
continue
while len(stack):
n1 = stack.pop()
# print "stacklen=%i, n1 name=%s" % (len(stack), n1.name)
if n1.isLeaf:
if n1 == self.root:
sList.append(')')
if withTranslation:
sList.append('%s' % (translationHash[n1.name]))
else:
if n1.name:
sList.append(
'%s' % p4.func.nexusFixNameIfQuotesAreNeeded(n1.name))
else:
if n1 != self.root:
gm.append("Terminal node with no name?")
raise P4Error(gm)
else:
sList.append(')')
if n1.name:
sList.append(
'%s' % p4.func.nexusFixNameIfQuotesAreNeeded(n1.name))
if writeBrLens:
if n1 != self.root:
sList.append(':%g' % n1.br.len)
if doMcmcCommandComments:
sList.append(self._getMcmcCommandComment(n1))
if n1.sibling:
if spaceAfterComma:
sList.append(', ')
else:
sList.append(',')
break
sList.append(';')
if toString:
return "".join(sList) # no newline
elif fName == None:
print("".join(sList)) # with default newline from print
sList.append('\n') # now all will have a newline
if isinstance(fName, str):
if append:
fName2 = open(fName, 'a')
else:
fName2 = open(fName, 'w')
fName2.write(''.join(sList))
fName2.close()
elif hasattr(fName, 'write'):
fName.write(''.join(sList))
# Somebody else opened the fName, so somebody else can close it.
else:
gm.append("I don't understand (%s) passed to me to write to." % fName)
raise P4Error(gm)
def _getMcmcCommandComment(self, theNode):
sList = [' [&']
for pNum in range(self.model.nParts):
mp = self.model.parts[pNum]
if mp.nComps > 1:
doWrite = True
if mp.ndch2:
if mp.ndch2_writeComps:
doWrite = True
else:
doWrite = False
if doWrite:
sList.append(' c%i.%i' % (pNum, theNode.parts[pNum].compNum))
if theNode != self.root:
if mp.nRMatrices > 1:
doWrite = True
if mp.ndrh2:
if mp.ndrh2_writeRMatrices:
doWrite = True
else:
doWrite = False
if doWrite:
sList.append(' r%i.%i' % (pNum, theNode.br.parts[pNum].rMatrixNum))
if mp.nGdasrvs > 1:
sList.append(' g%i.%i' % (pNum, theNode.br.parts[pNum].gdasrvNum))
sList.append(']')
if len(sList) == 2: # ie [&] only, eg ndrh2 but only 1 comp
return ""
return ''.join(sList)
[docs]
def draw(self, showInternalNodeNames=1, addToBrLen=0.2, width=None, showNodeNums=1, partNum=0, model=None):
"""Draw the tree to the screen.
This method makes a text drawing of the tree and writes it to sys.stdout.
Arg addToBrLen adds, by default 0.2, to each branch length, to
make the tree more legible. If you want the branch lengths more
realistic, you can set it to zero, or better, use vector graphics
for drawing the trees.
Setting arg model aids in drawing trees with tree-hetero models.
If the model characteristic (usually composition or rMatrix)
differs over the tree, this method can draw it for you.
See also :meth:`Tree.Tree.textDrawList`, which returns the drawing as a list of strings.
See the method :meth:`Tree.Tree.setTextDrawSymbol`, which facilitates
drawing different branches with different symbols.
"""
s = self.textDrawList(showInternalNodeNames=showInternalNodeNames,
addToBrLen=addToBrLen, width=width,
autoIncreaseWidth=True,
showNodeNums=showNodeNums, partNum=partNum, model=model)
print('\n'.join(s))
[docs]
def textDrawList(self, showInternalNodeNames=1, addToBrLen=0.2, width=None, autoIncreaseWidth=True, showNodeNums=1, partNum=0, model=False):
if len(self.nodes) == 0:
return ['']
elif len(self.nodes) == 1:
if showNodeNums:
return ['%i:%s' % (self.nodes[0].nodeNum, self.nodes[0].name)]
else:
return ['%s' % self.nodes[0].name]
gm = ['Tree.textDrawList()']
if not self.preAndPostOrderAreValid:
self.setPreAndPostOrder()
p = TreePicture(self)
p.fName = None
p.width = width
p.xScale = None
p.yScale = 1
p.pointsPerLetter = 1
p.addToBrLen = addToBrLen
p.textShowNodeNums = showNodeNums
p.showInternalNodeNames = showInternalNodeNames
if showNodeNums:
p.nameOffset = 0
else:
p.nameOffset = 1
p.xOrigin = 0.0
p.yOrigin = 0.0
p.textSize = 1
p.labelTextSize = 1
# If the width is not specified, make a guess
if width == None:
tLen = 0 # Longest number of horizontal sections to draw
longestNameLen = 0
for n in self.iterNodes():
if n.isLeaf and n != self.root:
if n.name: # This assumes short internal node name lengths
if len(n.name) > longestNameLen:
longestNameLen = len(n.name)
thisLen = 0
n1 = n
# print "x n1 is node %i" % n1.nodeNum
while n1 != self.root:
n1 = n1.parent
# print "y n1 is node %i" % n1.nodeNum
thisLen += 1
if thisLen > tLen:
tLen = thisLen
# print "tLen =", tLen
# print "longestNameLen =", longestNameLen
rootNameLen = 0
if self.root.name:
rootNameLen = len(self.root.name) + 1
p.width = (10 * tLen) + rootNameLen + longestNameLen
if p.width > 100:
p.width = 100
# print "p.width =", p.width
# sys.exit()
# Make sure the names fit.
if not autoIncreaseWidth:
for n in self.iterNodes():
if n.isLeaf and n != self.root:
if n.name and len(n.name) > p.width:
gm.append(
"There are long names, and the given width is not enough.")
raise P4Error(gm)
if model:
try:
partNum = int(partNum)
except:
gm.append("partNum arg should be an integer.")
raise P4Error(gm)
if not self.model:
gm.append("If model arg is set, then self.model must exist.")
raise P4Error(gm)
if partNum < 0 or partNum >= self.model.nParts:
gm.append(
"Zero-based partNum %i is out of range of %s parts." % (partNum, self.model.nParts))
raise P4Error(gm)
p.partNum = partNum
p.doModel = 1
doComps = 1
doRMatrices = 1
if self.model.parts[partNum].nComps < 2:
doComps = 0
if self.model.parts[partNum].nRMatrices < 2:
doRMatrices = 0
if not doComps and not doRMatrices:
p._setPos(autoIncreaseWidth)
s = p.textString(returnAsList=True)
s.append(
"Both the composition of the model and the rate matrix are homogeneous in part %i.\n" % partNum)
return s
# First do compositions
s = []
if doComps:
if 0:
if self.model.nParts > 1:
print("Compositions for part %i" % partNum)
else:
print("\nCompositions\n------------")
p.textDrawModelThing = var.TEXTDRAW_COMP
p._setPos(autoIncreaseWidth)
s = p.textString(returnAsList=True)
# print s
# Then do RMatrices
if doRMatrices:
if 0:
if self.model.nParts > 1:
print("RMatrices for part %i" % partNum)
else:
print("\nRMatrices\n---------")
p.textDrawModelThing = var.TEXTDRAW_RMATRIX
p._setPos(autoIncreaseWidth)
if s:
s += p.textString(returnAsList=True)
else:
s = p.textString(returnAsList=True)
if self.model.parts[partNum].nComps == 1:
s.append(
"The composition of the model is homogeneous in part %i\n" % partNum)
elif self.model.parts[partNum].nRMatrices == 1:
s.append(
"The rate matrix is homogeneous in part %i\n" % partNum)
# Don't bother with GDASRV, yet
return s
else:
p._setPos(autoIncreaseWidth)
s = p.textString(returnAsList=True)
return s
# outFileName=None, width=500, heightFactor=0.85, pointsPerLetter=6.0,
# textSize=11, labelSize=9, putInternalNodeNamesOnBranches=0)
[docs]
def eps(self, fName=None, width=500, putInternalNodeNamesOnBranches=0):
"""Make a basic eps drawing of self.
The 'width' is in postscript points.
By default, internal node names label the node, where the node
name goes on the right of the node. You can make the node name
label the branch by setting 'putInternalNodeNamesOnBranches'.
"""
gm = ['Tree.eps()']
if not self.preAndPostOrderAreValid:
self.setPreAndPostOrder()
p = TreePicture(self)
p.addToBrLen = 0.0
p.width = width
p.yScale = 17.0
p.xScale = None
p.pointsPerLetter = 6.0
p.textSize = 11
p.labelTextSize = 8
p.putInternalNodeNamesOnBranches = putInternalNodeNamesOnBranches
p.xOrigin = 5.0
p._setPos()
s = p.vectorString()
if not fName:
if self.name:
fName = '%s.eps' % self.name
else:
fName = '%i.eps' % os.getpid()
# if not fName.endswith('.eps'):
# fName = '%s.eps' % fName
f = open(fName, 'w')
f.write(s)
f.close()
################################################# model
#################################################
@property
def data(self):
"""(property) The data object"""
return self._data
@data.setter
def data(self, theData):
"""Sets self.data"""
# Two cases. Either
# 1. Self already has a data.
# - deleteCStuff(), and replace the old data with the new data.
# 2. Self has no data.
# a. Has a model. So just check for compatibility.
# b. Has no model, so has never seen a data before.
complaintHead = 'Tree.data (property setter)'
if self.name:
complaintHead += ", tree '%s'" % self.name
if self.fName:
complaintHead += ", file %s" % self.fName
gm = [complaintHead]
if isinstance(theData, Data) or theData == None:
pass
else:
gm.append("Set data only to a Data object, or None, ok?")
raise P4Error(gm)
if self.data or self.model:
# Normally there won't be anything to delete, but you never know...
self.deleteCStuff()
if not theData:
self._data = None
return
self._data = theData
if self.model:
# We have seen a data object before (otherwise we would not
# have been able to set the model). Check for compatibility.
# print("{gm[0]} self.model exits.")
if not self.taxNames:
gm.append(
"Self has model, but no taxNames. Programming error.")
raise P4Error(gm)
# Check for same number of taxa
treeNTax = len(self.taxNames)
dataNTax = len(theData.taxNames)
if treeNTax != dataNTax:
gm.append("The number of taxa in the tree (%s)" % treeNTax)
gm.append("is not the same as in the data (%s)" % dataNTax)
raise P4Error(gm)
# Check for mis-matched taxNames
isBad = 0
for tn in self.taxNames:
if tn not in theData.taxNames:
isBad = 1
break
for tn in theData.taxNames:
if tn not in self.taxNames:
isBad = 1
break
if isBad:
gm.append("TaxName mismatch between the tree and the data.")
gm.append("Here they are, sorted to show mis-matches.")
gm.append(" %25s %25s" % ('data', 'tree'))
self.taxNames.sort()
theData.taxNames.sort()
for i in range(len(self.taxNames)):
if theData.taxNames[i] == self.taxNames[i]:
gm.append(" %25s %25s" %
(theData.taxNames[i], self.taxNames[i]))
else:
gm.append(" %25s %25s ***" %
(theData.taxNames[i], self.taxNames[i]))
raise P4Error(gm)
# Same number of parts
if len(theData.parts) != self.model.nParts:
gm.append("nParts mis-match. len(theData.parts)=%i, model.nParts=%i" % (
len(theData.parts), self.model.nParts))
raise P4Error(gm)
# Check dims and symbols in the parts
for pNum in range(self.model.nParts):
if theData.parts[pNum].dim != self.model.parts[pNum].dim:
gm.append("Parts dim mis-match.")
raise P4Error(gm)
if theData.parts[pNum].symbols != self.model.parts[pNum].symbols:
gm.append("Parts symbols mis-match.")
raise P4Error(gm)
# Set seqNum
for n in self.iterLeavesNoRoot():
if n.seqNum != self.taxNames.index(n.name):
gm.append("seqNums do not match up with taxNames.")
raise P4Error(gm)
else: # no model
self.setNewModel()
[docs]
def setNewModel(self):
theData = self.data
assert self.data, "set the data first"
# assert not self.model, "this method is to be used when there is no model"
if self.model:
self.model = None
# When you do this method, self gets a suitable
# model. We have here no model, so we may have never seen a
# data before. Or we might have just lost the model.
# Check for same number of taxa
treeNTax = 0
treeTaxNames = []
for n in self.iterNodes():
if n.isLeaf:
treeNTax += 1
treeTaxNames.append(n.name)
dataNTax = len(theData.taxNames)
if treeNTax != dataNTax:
gm.append("The number of taxa in the tree (%s)" % treeNTax)
gm.append("is not the same as in the data (%s)" % dataNTax)
raise P4Error(gm)
# Check for mis-matched taxNames
isBad = 0
for tn in treeTaxNames:
if tn not in theData.taxNames:
isBad = 1
break
for tn in theData.taxNames:
if tn not in treeTaxNames:
isBad = 1
break
if isBad:
gm.append("TaxName mismatch between the tree and the data.")
gm.append("Here they are, sorted to show mis-matches.")
gm.append(" %25s %25s" % ('data', 'tree'))
treeTaxNames.sort()
theData.taxNames.sort()
for i in range(len(treeTaxNames)):
if theData.taxNames[i] == treeTaxNames[i]:
gm.append(" %25s %25s" %
(theData.taxNames[i], treeTaxNames[i]))
else:
gm.append("*** %25s %25s" %
(theData.taxNames[i], treeTaxNames[i]))
raise P4Error(gm)
# attach
self.taxNames = theData.taxNames
self._data = theData
# print "_setData. len(theData.parts) = %s" % len(theData.parts)
# calls self.deleteCStuff()
self.model = Model(len(theData.parts))
for n in self.iterNodes():
if n.parts:
n.parts = []
for i in range(self.model.nParts):
n.parts.append(NodePart())
for n in self.iterNodes():
if n != self.root:
n.br.parts = []
for i in range(self.model.nParts):
n.br.parts.append(NodeBranchPart())
# Set modelPart dims and symbols
for pNum in range(self.model.nParts):
self.model.parts[pNum].dim = theData.parts[pNum].dim
self.model.parts[pNum].symbols = theData.parts[pNum].symbols
# There is intentionally no default pInvar, forcing the user to be
# explicit.
for p in self.model.parts:
p.pInvar = None
# Set seqNum
for n in self.iterNodes():
if n.isLeaf:
n.seqNum = self.taxNames.index(n.name)
@property
def model(self):
"""(property) The model object"""
return self._model
@model.setter
def model(self, theModel):
gm = ['Tree.model (property setter)']
# print gm[0]
# print " Got '%s'" % theModel
if isinstance(theModel, Model) or theModel == None:
pass
else:
gm.append("Attempt to set Tree.model to '%s'. " % theModel)
gm.append(
"Don't set the model to anything other than 'None' or a Model, ok? ")
gm.append("(And generally the user only sets it to None.) ")
raise P4Error(gm)
if self.model or self.data:
self.deleteCStuff()
# print 'Tree._setModel() finished deleteCStuff()'
self._model = theModel
@model.deleter
def model(self):
gm = ['Tree.model']
gm.append("Caught an attempt to delete self.model, but")
gm.append("self.model is a property, so you shouldn't delete it.")
gm.append("But you can set it to None if you like.")
raise P4Error(gm)
def _checkModelThing(self, partNum, symbol, complaintHead):
gm = [complaintHead]
if not self.data:
gm.append("No data. Set the data first.")
raise P4Error(gm)
if not self.model:
# When you set the self.data, a model object of suitable dimensions
# is made and attached to self. If we have got here, it is
# because the model has subsequently been lost. So just
# re-instate it.
self.setNewModel()
if partNum < 0 or partNum >= self.model.nParts:
gm.append("Zero-based partNum (%s) is out of range (of %s parts)" %
(partNum, self.model.nParts))
raise P4Error(gm)
if symbol:
if not isinstance(symbol, str) or len(symbol) != 1:
gm.append("Symbols must be 1-length strings.")
raise P4Error(gm)
if symbol == '?':
gm.append("Got assigned text drawing symbol '?'.")
gm.append("Don't use it-- it is reserved for missing model components")
raise P4Error(gm)
[docs]
def newComp(self, partNum=0, free=0, spec='empirical', val=None, symbol=None):
"""Make, attach, and return a new Comp object.
The arg *spec* should be a string, one of::
'equal' no val
'empirical' no val
'specified' val=[aList]
'wag', etc no val
(ie one of the empirical protein models, including
cpREV, d78, jtt, mtREV24, mtmam, wag, etc)
If spec='specified', then you specify dim or dim-1 values in a
list as the 'val' arg.
This method returns a Comp object, which you can ignore if it is a
tree-homogeneous model. However, if it is a tree-hetero model
then you may want to get that Comp object so that you can place
it on the tree explicitly with setModelComponentOnNode(), like this::
c0 = newComp(partNum=0, free=1, spec='empirical')
c1 = newComp(partNum=0, free=1, spec='empirical')
myTree.setModelComponentOnNode(c0, node=myTree.root, clade=1)
myTree.setModelComponentOnNode(c1, node=5, clade=1)
myTree.setModelComponentOnNode(c1, node=18, clade=0)
Alternatively, you can simply let p4 place them randomly::
newComp(partNum=0, free=1, spec='empirical')
newComp(partNum=0, free=1, spec='empirical')
myTree.setModelComponentsOnNodesRandomly()
Calculation of probability matrices for likelihood calcs etc are
wrong when there are any comp values that are zero, so that is not
allowed. Any zeros are converted to var.PIVEC_MIN, which is 1e-13
this week. Hopefully close enough to zero for you.
"""
gm = ['Tree.newComp()']
self._checkModelThing(partNum, symbol, gm[0])
if self.model.cModel:
self.deleteCStuff()
mt = Comp()
mt.partNum = partNum
mt.free = free
# spec
if spec not in var.compSpecs:
gm.append("The spec should be one of %s" % var.compSpecs)
raise P4Error(gm)
mt.spec = spec
mt.num = len(self.model.parts[partNum].comps)
if symbol:
mt.symbol = symbol
else:
try:
mt.symbol = var.modelSymbols[mt.num]
except IndexError:
mt.symbol = '-'
self.model.parts[partNum].comps.append(mt)
# assign val
dim = self.model.parts[partNum].dim
if spec == 'equal':
mt.val = numpy.ones(dim, dtype=numpy.double) / dim
elif spec == 'empirical':
assert mt.val is None
elif spec == 'specified':
# print(f"Got val {val}, type {type(val)}")
# if val == None or val == []:
# gm.append("Specified comp, but no val.")
# raise P4Error(gm)
try:
val = list(val)
except TypeError:
gm.append("Comp is 'specified', but bad 'val' arg.")
gm.append("The 'val' arg should be a list or tuple.")
raise P4Error(gm)
if len(val) == dim or len(val) == dim - 1:
pass
else:
gm.append("Bad length for val arg (%i). Should be dim or dim-1 long." % len(val))
gm.append("(Dim for this part is %i)" % dim)
raise P4Error(gm)
# I allow val's of dim or dim-1.
if len(val) == dim - 1:
lastVal = 1.0 - sum(val)
if lastVal > 0.0:
val = val + [1.0 - sum(val)]
else:
gm.append("Bad comp vals %s" % val)
gm.append("sum to 1.0 or more.")
raise P4Error(gm)
else: # len = dim
theSum = sum(val)
theDiff = math.fabs(theSum - 1.0)
# How big to make the delta? With reasonably good, normalized
# protein comps (where all the values had just been divided by
# the total, so it should have summed to 1.0 at that point) I
# kept getting 1.1e-16. So make it 5.e-16. No, too small.
# With protein I am getting diffs of 8.88178e-16, so make the
# cutoff 9e-16
if theDiff > 9.e-16: # 1e-17 was too small for protein
gm.append("Bad comp vals %s" % val)
gm.append("does not sum to 1.0")
gm.append("The sum = %f" % theSum)
gm.append("abs(1.0 - theSum) = %g" % theDiff)
raise P4Error(gm)
# Are any specified values less than PIVEC_MIN?
needsNormalizing = 0
for i in range(len(val)):
thisVal = val[i]
if thisVal < var.PIVEC_MIN:
print(gm[0])
print(" Specifying a comp of zero for a character is not allowed.")
print(" The minimum is %g" % var.PIVEC_MIN)
myVal = (1.5 + random.random()) * var.PIVEC_MIN
print(" Re-setting to %g" % myVal)
val[i] = myVal
needsNormalizing = 1
if needsNormalizing:
theSum = sum(val)
for i in range(len(val)):
val[i] /= theSum
if math.fabs(sum(val) - 1.0) > 5.e-16:
gm.append("Bad comp vals %s" % val)
gm.append("does not sum to 1.0")
raise P4Error(gm)
# print "sum(val) - 1.0 = %f (%g)" % (sum(val) - 1.0, sum(val) -
# 1.0)
mt.val = val
elif spec in var.rMatrixProteinSpecs:
mt.val = p4.func.getProteinEmpiricalModelComp(spec)
return mt
[docs]
def newRMatrix(self, partNum=0, free=0, spec='ones', val=None, symbol=None):
"""Make, attach, and return a new RMatrix instance.
spec should be one of:
- 'ones' - for JC, poisson, F81
- '2p' - for k2p and hky
- 'specified'
- 'cpREV'
- 'd78'
- 'jtt'
- 'mtREV24'
- 'mtmam'
- 'wag'
- 'rtRev'
- 'tmjtt94'
- 'tmlg99'
- 'lg'
- 'blosum62'
- 'hivb'
- 'mtart'
- 'mtzoa'
- 'gcpREV'
- 'stmtREV'
- 'vt'
- 'pmb'
See var.rMatrixProteinSpecs
You do not set the 'val' arg unless the spec is 'specified' or
'2p'. If spec='2p', then you set val to kappa.
If the spec is 'specified', you specify all the numerical values
in a list given as the 'val' arg. The length of that list will be
(((dim * dim) - dim) / 2), so for DNA, where dim=4, you would
specify a list containing 6 numbers. """
# not implemented:
# 'blosum62a'
# 'blosum62b'
# 'phat70'
complaintHead = '\nTree.newRMatrix()'
gm = [complaintHead]
self._checkModelThing(partNum, symbol, complaintHead)
if self.model.cModel:
self.deleteCStuff()
mt = RMatrix()
mt.partNum = partNum
#mt.dim = self.data.parts[partNum].dim
mt.free = free
if spec not in var.rMatrixSpecs:
gm.append("Got unknown rMatrix spec '%s'." % spec)
gm.append("Should be one of: %s" % var.rMatrixSpecs)
raise P4Error(gm)
mt.spec = spec
mt.num = len(self.model.parts[partNum].rMatrices)
if symbol:
mt.symbol = symbol
else:
try:
mt.symbol = var.modelSymbols[mt.num]
except IndexError:
mt.symbol = '-'
self.model.parts[partNum].rMatrices.append(mt)
# assign val
dim = self.model.parts[partNum].dim
if var.rMatrixNormalizeTo1:
goodLen = int((((dim * dim) - dim) / 2))
else:
goodLen = int((((dim * dim) - dim) / 2) - 1)
v = None
if spec == 'specified':
if val:
# should check that values are all floats
if len(val) == goodLen:
v = numpy.array(val, dtype=numpy.double)
if var.rMatrixNormalizeTo1:
v /= v.sum()
elif var.rMatrixNormalizeTo1 and len(val) == goodLen - 1:
gm.append("var.rMatrixNormalizeTo1 is set, val length should be %i, got %i" % (
goodLen, len(val)))
raise P4Error(gm)
else:
gm.append("Bad length for arg val. Length %i, should be %i" % (len(val), goodLen))
raise P4Error(gm)
else:
gm.append("spec is 'specified', but there are no specified rMatrix values.")
gm.append("Specify rMatrix values by eg val=[1.0, 2.0, 3.0, 4.0, 5.0, 6.0]")
raise P4Error(gm)
elif spec == 'ones':
v = numpy.array([1.0] * goodLen, dtype=numpy.double)
if var.rMatrixNormalizeTo1:
v /= v.sum()
elif spec == '2p':
try:
v = float(val)
except (ValueError, TypeError):
gm.append("Kappa ('val' arg) should be a float. Setting to 2.0")
v = 2.0
if v < var.KAPPA_MIN:
gm.append("Kappa is too small. Setting to %f" %
var.KAPPA_MIN)
v = var.KAPPA_MIN
elif v > var.KAPPA_MAX:
gm.append("Kappa is too big. Setting to %f" % var.KAPPA_MAX)
v = var.KAPPA_MAX
v = numpy.array([v], dtype=numpy.double)
elif spec in var.rMatrixProteinSpecs:
if self.data.parts[partNum].dataType != 'protein':
gm.append("A protein matrix has been specified, but the dataType for part %i is %s." % (
partNum, self.data.parts[partNum].dataType))
raise P4Error(gm)
if free:
gm.append('The rMatrix should not be free if it is an empirical protein matrix.')
raise P4Error(gm)
else:
gm.append(f"I don't know spec '{spec}'")
gm.append(f"Programming error")
raise P4Error(gm)
mt.val = v # type numpy.ndarray, or None for specified protein
return mt
[docs]
def newGdasrv(self, partNum=0, free=0, val=None, symbol=None):
gm = ['Tree.newGdasrv()']
if not self.model:
gm.append("Set the data first. Eg myTree.data = Data()")
raise P4Error(gm)
if self.model.cModel:
self.deleteCStuff()
# check if there is an nGammaCat > 1:
if self.model.parts[partNum].nGammaCat == 1:
gm.append("For this part (%s), the number of nGammaCat has been set to 1." % partNum)
gm.append("So gdasrv won't work.")
gm.append("You can set the nGammaCat with yourTree.setNGammaCat(partNum=x, nGammaCat=y)")
raise P4Error(gm)
# check val
if val == None:
gm.append("Please specify a val, a positive float.")
raise P4Error(gm)
try:
v = float(val)
except:
gm.append("Arg val must be a float. Got '%s'" % val)
raise P4Error(gm)
# This week, we have in defines.h
# define GAMMA_SHAPE_MIN 0.000001
# define GAMMA_SHAPE_MAX 300.0
if v <= 0.000001 or v >= 300.0:
gm.append("Arg val must be between 0.000001 and 300. Got %f" % v)
raise P4Error(gm)
self._checkModelThing(partNum, symbol, gm[0])
mt = Gdasrv()
mt.nGammaCat = self.model.parts[partNum].nGammaCat
mt.partNum = partNum
mt.free = free
# no spec or dim
mt.num = len(self.model.parts[partNum].gdasrvs)
if symbol:
mt.symbol = symbol
else:
mt.symbol = var.modelSymbols[mt.num]
self.model.parts[partNum].gdasrvs.append(mt)
mt.freqs = numpy.zeros(mt.nGammaCat, dtype=numpy.double)
mt.rates = numpy.zeros(mt.nGammaCat, dtype=numpy.double)
mt._val[0] = v
mt.calcRates()
return mt
[docs]
def setPInvar(self, partNum=0, free=0, val=0.0):
complaintHead = '\nTree.setPInvar()'
gm = [complaintHead]
# check val
try:
v = float(val)
except:
gm.append("Arg val must be a float. Got '%s'" % val)
raise P4Error(gm)
if v < 0.0 or v >= 1.0:
gm.append(
"Arg val must be zero or more, and less than 1. Got %f" % v)
raise P4Error(gm)
self._checkModelThing(partNum, None, complaintHead)
if self.model.cModel:
self.deleteCStuff()
mt = PInvar()
mt.partNum = partNum
mt.free = free
mt.val = v
self.model.parts[partNum].pInvar = mt
[docs]
def setRelRate(self, partNum=0, val=0.0):
complaintHead = '\nTree.setRelRate()'
gm = [complaintHead]
# check val
try:
v = float(val)
except:
gm.append("Arg val must be a float. Got '%s'" % val)
raise P4Error(gm)
if v <= 0.0 or v >= 1000.0:
gm.append(
"Arg val must be more than zero, and less than 1000 (arbitrarily). Got %f" % v)
raise P4Error(gm)
self._checkModelThing(partNum, None, complaintHead)
if self.model.cModel:
self.deleteCStuff()
self.model.parts[partNum].relRate = v
[docs]
def setModelComponentOnNode(self, theModelComponent, node=None, clade=1):
complaintHead = '\nTree.setModelComponentOnNode()'
gm = [complaintHead]
if theModelComponent and \
(isinstance(theModelComponent, Comp) or
isinstance(theModelComponent, RMatrix) or
isinstance(theModelComponent, Gdasrv)):
pass
else:
gm.append("Expecting a model component instance of some sort.")
gm.append("Ie a comp, rMatrix, or gdasrv, instance.")
gm.append("Got theModelComponent = %s" % theModelComponent)
raise P4Error(gm)
if self.model.cModel:
self.deleteCStuff()
partNum = theModelComponent.partNum
if node == None:
theNode = self.root
else:
theNode = self.node(node)
isBad = 0
if isinstance(theModelComponent, Comp):
if theModelComponent != self.model.parts[partNum].comps[theModelComponent.num]:
isBad = 1
elif isinstance(theModelComponent, RMatrix):
if theModelComponent != self.model.parts[partNum].rMatrices[theModelComponent.num]:
isBad = 1
elif isinstance(theModelComponent, Gdasrv):
if theModelComponent != self.model.parts[partNum].gdasrvs[theModelComponent.num]:
isBad = 1
else: # This will never happen-- we checked above. Overkill.
gm.append("I don't recognise theModelComponent.")
raise P4Error(gm)
if isBad:
gm.append("The model component can only be set on the tree that made it.")
raise P4Error(gm)
# For the root, we set comps and nothing else. For other nodes we
# set anything.
if theNode == self.root:
if isinstance(theModelComponent, Comp):
theNode.parts[partNum].compNum = theModelComponent.num
else:
if isinstance(theModelComponent, Comp):
#print(f"node {theNode.nodeNum}, partNum {partNum}, comp num {theModelComponent.num}")
theNode.parts[partNum].compNum = theModelComponent.num
elif isinstance(theModelComponent, RMatrix):
theNode.br.parts[partNum].rMatrixNum = theModelComponent.num
elif isinstance(theModelComponent, Gdasrv):
theNode.br.parts[partNum].gdasrvNum = theModelComponent.num
if clade:
aboves = self.getNodeNumsAbove(theNode, leavesOnly=0)
for i in aboves:
if isinstance(theModelComponent, Comp):
self.nodes[i].parts[partNum].compNum = theModelComponent.num
elif isinstance(theModelComponent, RMatrix):
self.nodes[i].br.parts[
partNum].rMatrixNum = theModelComponent.num
elif isinstance(theModelComponent, Gdasrv):
self.nodes[i].br.parts[
partNum].gdasrvNum = theModelComponent.num
[docs]
def setModelThingsRandomly(self, forceRepresentation=2):
"""This method has been renamed to setModelComponentsOnNodesRandomly"""
gm = ["Tree.setModelThingsRandomly() has been renamed Tree.setModelComponentsOnNodesRandomly()"]
raise P4Error(gm)
[docs]
def setModelComponentsOnNodesRandomly(self, forceRepresentation=2):
"""Place model components (semi-)randomly on the tree.
For example, if there are 2 compositions in model part partNum,
this method will decorate each node of the tree with zeros and
ones, randomly. The actual component set is
node.parts[partNum].compNum. If the model is homogeneous,
it will just put zeros in all the nodes.
We want to have each model component on the tree somewhere, and so it
is not really randomly set. If the model component numbers were
assigned randomly on the tree, it may occur that some model component
numbers by chance would not be represented. This is not allowed,
and you can set forceRepresentation to some positive integer, 1 or
more. That number will be the lower limit allowed on the number
of nodes that get assigned the model component number. For example,
if forceRepresentation is set to 2, then each model component must get
assigned to at least 2 nodes."""
gm = ['Tree.setModelComponentsOnNodesRandomly()']
if not self.model or not self.model.nParts:
gm.append("No model parts?")
raise P4Error(gm)
if self.model.cModel:
self.deleteCStuff()
# self.model.dump()
if not isinstance(forceRepresentation, int) or forceRepresentation < 1:
gm.append("Arg 'forceRepresentation' should be 1 or more.")
gm.append("Got forceRepresentation = %s" % forceRepresentation)
raise P4Error(gm)
for i in self.preOrder:
if i == var.NO_ORDER:
gm.append("This method does not work if any nodes are not used in the tree.")
raise P4Error(gm)
for pNum in range(self.model.nParts):
mp = self.model.parts[pNum]
# First do comps
if mp.nComps == 1:
for n in self.nodes:
n.parts[pNum].compNum = 0
elif mp.nComps > 1:
if mp.ndch2:
pass
else:
nNodes = len(self.nodes)
if (mp.nComps * forceRepresentation) > nNodes:
gm.append("Part %i" % pNum)
gm.append(
"There are not enough nodes (%i) to put %i" % (nNodes, mp.nComps))
gm.append(
"comps on at least forceRepresentation (%i) nodes." % forceRepresentation)
raise P4Error(gm)
nList = self.nodes[:]
random.shuffle(nList)
# get the forceRepresentation out of the way first
for mtNum in range(mp.nComps):
for fr in range(forceRepresentation):
n = nList.pop()
n.parts[pNum].compNum = mtNum
# Now do the rest
for n in nList:
n.parts[pNum].compNum = random.randrange(mp.nComps)
else:
gm.append("No comps in part %i" % pNum)
raise P4Error(gm)
# Second do rMatrices
if mp.nRMatrices == 1:
for n in self.nodes:
if n != self.root:
n.br.parts[pNum].rMatrixNum = 0
elif mp.nRMatrices > 1:
if mp.ndrh2:
pass
else:
nNodes = len(self.nodes) - 1
if (mp.nRMatrices * forceRepresentation) > nNodes:
gm.append("Part %i" % pNum)
gm.append(
"There are not enough nodes (%i) to put %i" % (nNodes, mp.nRMatrices))
gm.append(
"rMatrices on at least forceRepresentation (%i) nodes." % forceRepresentation)
raise P4Error(gm)
nList = self.nodes[:]
nList.remove(self.root)
random.shuffle(nList)
# get the forceRepresentation out of the way first
for mtNum in range(mp.nRMatrices):
for fr in range(forceRepresentation):
n = nList.pop()
n.br.parts[pNum].rMatrixNum = mtNum
# Now do the rest
for n in nList:
n.br.parts[pNum].rMatrixNum = random.randrange(
mp.nRMatrices)
else:
gm.append("No rMatrices in part %i" % pNum)
raise P4Error(gm)
# Third do gdasrvs
if mp.nGammaCat > 1:
if mp.nGdasrvs == 1:
for n in self.nodes:
if n != self.root:
n.br.parts[pNum].gdasrvNum = 0
elif mp.nGdasrvs > 1:
nNodes = len(self.nodes) - 1
if (mp.nGdasrvs * forceRepresentation) > nNodes:
gm.append("Part %i" % pNum)
gm.append(
"There are not enough nodes (%i) to put %i" % (nNodes, mp.nGdasrvs))
gm.append(
"gdasrvs on at least forceRepresentation (%i) nodes." % forceRepresentation)
raise P4Error(gm)
nList = self.nodes[:]
nList.remove(self.root)
random.shuffle(nList)
# get the forceRepresentation out of the way first
for mtNum in range(mp.nGdasrvs):
for fr in range(forceRepresentation):
n = nList.pop()
n.br.parts[pNum].gdasrvNum = mtNum
# Now do the rest
for n in nList:
n.br.parts[pNum].gdasrvNum = random.randrange(
mp.nGdasrvs)
else:
gm.append(
"No gdasrvs in part %i and yet nGammaCat > 1" % pNum)
raise P4Error(gm)
# self.dump(model=True)
[docs]
def setModelComponentsNNodes(self):
"""Set nNodes for all model components"""
gm = ['Tree.setModelComponentsNNodes()']
if not self.model or not self.model.nParts:
gm.append("No model parts?")
raise P4Error(gm)
for pNum in range(self.model.nParts):
mp = self.model.parts[pNum]
if not mp.nComps:
gm.append("No comps in model part %i." % pNum)
raise P4Error(gm)
elif not mp.nRMatrices:
gm.append("No rMatrices in model part %i." % pNum)
raise P4Error(gm)
for pNum in range(self.model.nParts):
mp = self.model.parts[pNum]
# First do comps
if mp.nComps == 1:
pass
elif mp.nComps > 1:
for mtNum in range(mp.nComps):
mp.comps[mtNum].nNodes = 0
for n in self.iterNodes():
mp.comps[n.parts[pNum].compNum].nNodes += 1
# Second do rMatrices
if mp.nRMatrices == 1:
pass
elif mp.nRMatrices > 1:
for mtNum in range(mp.nRMatrices):
mp.rMatrices[mtNum].nNodes = 0
for n in self.iterNodesNoRoot():
mp.rMatrices[n.br.parts[pNum].rMatrixNum].nNodes += 1
# Third do gdasrvs
if mp.nGammaCat > 1:
if mp.nGdasrvs == 1:
pass
elif mp.nGdasrvs > 1:
for mtNum in range(mp.nGdasrvs):
mp.gdasrvs[mtNum].nNodes = 0
for n in self.iterNodesNoRoot():
mp.gdasrvs[n.br.parts[pNum].gdasrvNum].nNodes += 1
else:
gm.append("No gdasrvs in part %i" % pNum)
raise P4Error(gm)
[docs]
def summarizeModelComponentsNNodes(self):
"""Summarize nNodes for all model components if isHet"""
gm = ['Tree.summarizeModelComponentsNNodes()']
if not self.model or not self.model.nParts:
gm.append("No model parts?")
raise P4Error(gm)
if not self.model.isHet:
gm.append("This method is for hetero models")
raise P4Error(gm)
for pNum in range(self.model.nParts):
mp = self.model.parts[pNum]
if not mp.nComps:
gm.append("No comps in model part %i." % pNum)
raise P4Error(gm)
elif not mp.nRMatrices:
gm.append("No rMatrices in model part %i." % pNum)
raise P4Error(gm)
for pNum in range(self.model.nParts):
print("\n%6s %s:" % ("Part", pNum))
mp = self.model.parts[pNum]
# First do comps
if mp.nComps == 1:
pass
elif mp.ndch2:
print("%16s" % "ndch2 is on")
elif mp.nComps > 1:
for mtNum in range(mp.nComps):
# print " comp %i nNodes=%i" % (mtNum,
# mp.comps[mtNum].nNodes)
print("%16s %i %s = %i" % ("composition", mtNum, "nNodes",
mp.comps[mtNum].nNodes))
# Second do rMatrices
if mp.nRMatrices == 1:
pass
elif mp.ndrh2:
print("%16s" % "ndrh2 is on")
elif mp.nRMatrices > 1:
for mtNum in range(mp.nRMatrices):
print("%16s %i %s = %i" % ("rate matrix", mtNum,
"nNodes", mp.rMatrices[mtNum].nNodes))
# Third do gdasrvs
if mp.nGammaCat > 1:
if mp.nGdasrvs == 1:
pass
elif mp.nGdasrvs > 1:
for mtNum in range(mp.nGdasrvs):
print(" gdasrv %i nNodes =%i" % (mtNum, mp.gdasrvs[mtNum].nNodes))
else:
gm.append("No gdasrvs in part %i" % pNum)
raise P4Error(gm)
[docs]
def setTextDrawSymbol(self, theSymbol='-', node=None, clade=1):
gm = ['\nTree.setTextDrawString()']
if not theSymbol or not isinstance(theSymbol, str) or len(theSymbol) != 1:
gm.append("theSymbol should be a single character string.")
raise P4Error(gm)
if not node:
theNode = self.root
else:
theNode = self.node(node)
if theNode == self.root:
pass
else:
theNode.br.textDrawSymbol = theSymbol
if clade:
aboves = self.getNodeNumsAbove(theNode, leavesOnly=0)
for i in aboves:
self.nodes[i].br.textDrawSymbol = theSymbol
[docs]
def setNGammaCat(self, partNum=0, nGammaCat=1):
gm = ['\nTree.setNGammaCat()']
if not self.data or not self.model:
gm.append("No data?")
raise P4Error(gm)
if self.model.cModel:
self.deleteCStuff()
if partNum < 0 or partNum >= self.model.nParts:
gm.append("PartNum %s is out of range of %s parts." %
(partNum, self.model.nParts))
raise P4Error(gm)
try:
x = int(nGammaCat)
except ValueError:
gm.append("'%s' does not appear to be an integer." % i)
raise P4Error(gm)
if x < 1:
gm.append("nGammaCat should not be less than 1.")
raise P4Error(gm)
elif x > 16:
gm.append("nGammaCat '%s' exceeds the arbitrary limit of 16." % x)
raise P4Error(gm)
self.model.parts[partNum].nGammaCat = nGammaCat
[docs]
def modelSanityCheck(self, resetEmpiricalComps=True):
"""Check that the tree, data, and model specs are good to go.
Complain and exit if there is anything wrong that might prevent a
likelihood evaluation from being done. We are assuming that a
data object exists and is attached, and that model stuff has been
set.
Check that each part has at least 1 each from comps, rMatrices,
and gdasrvs (if nGammaCat is > 1).
Check that each node has a comp, rMatrix, and gdasr. Check that all
comps, rMatrices, gdasrvs are used on a node somewhere.
Here relRate, ie the relative rate of each data partition, is
adjusted based on the size of the data partitions.
newRelRate_p = oldRelRate_p * (Sum_p[oldRelRate_i * partLen_i] / Sum[partLen_i])
That ensures that Sum(relRate_i * partLen_i) = totalDataLength, ie
that the weighted mean of the rates is 1.0.
This method also tallies up the number of free prams in the whole
model, and sets self.model.nFreePrams.
"""
complaintHead = '\nTree.modelSanityCheck()'
gm = [complaintHead]
# print("\nTree.modelSanityCheck() here. self.model.nParts=%s" % self.model.nParts)
# print("\nTree.modelSanityCheck() here. resetEmpiricalComps=%s" % resetEmpiricalComps)
isBad = 0
complaints = []
if not self.data:
complaints.append(' No data.')
isBad = 1
if not self.model:
complaints.append(' No model.')
isBad = 1
# Set isHet.
for pNum in range(self.model.nParts):
mp = self.model.parts[pNum]
mp.isHet = 0
if mp.nComps > 1 or mp.nRMatrices > 1:
mp.isHet = 1
if mp.nGammaCat > 1 and mp.nGdasrvs > 1:
mp.isHet = 1
# This week ndch2 does not play well with other hetero models like ndch,
# so insist that all partitions are or are not ndch2.
firstPartIsNdch2 = self.model.parts[0].ndch2
for pNum in range(self.model.nParts):
mp = self.model.parts[pNum]
if mp.ndch2 != firstPartIsNdch2:
complaints.append(" Can't mix ndch2 with non-ndch2 models.")
isBad = 1
# Check that all parts have all the required stuff. Make a list
# of errors. If there is something missing or wrong, don't die
# right away, but add the problem to the list, and write it all
# out at the end. It gives the user a chance to fix more than one
# error at a time.
for pNum in range(self.model.nParts):
complaints.append(' Part %i' % pNum)
partIsBad = 0
mp = self.model.parts[pNum]
# Check if essential things have been set
if not mp.nComps:
complaints.append(' No comps in part %s' % pNum)
partIsBad = 1
if not mp.nRMatrices:
complaints.append(' No rMatrices in part %s' % pNum)
partIsBad = 1
if mp.nGammaCat > 1:
if not mp.nGdasrvs:
complaints.append(' No gdasrvs in part %s' % pNum)
partIsBad = 1
if mp.nGammaCat == 1:
if mp.nGdasrvs:
complaints.append(
' There should be no gdasrvs in part %s, with nGammaCat=1' % pNum)
partIsBad = 1
if not mp.pInvar:
complaints.append(' No pInvar in part %s' % pNum)
partIsBad = 1
if mp.ndch2:
if mp.nComps != len(list(self.iterNodes())):
complaints.append('Part %i, ndch2 needs a comp for each node' % pNum)
complaints.append(f" {len(list(self.iterNodes()))} nodes in the tree, {mp.nComps} comps")
partIsBad = 1
if partIsBad:
gm.append(" (Indices are zero-based.)")
gm += complaints
raise P4Error(gm)
# Check if comp values have been set.
for mt in mp.comps:
if mt.spec != 'empirical' or not resetEmpiricalComps:
if mt.val is None:
complaints.append(
' No composition val in part %s' % pNum)
partIsBad = 1
if len(mt.val) != mp.dim:
complaints.append(' Composition val is wrong length (%i), but dim is %i' % (
len(mt.val), mp.dim))
partIsBad = 1
# We don't want multiple rMatrices or free rMatrices if mp.dim is 2
if mp.dim == 2:
if mp.nRMatrices > 1:
complaints.append(
' Part %s is dim 2, but we have more than one rMatrix' % pNum)
partIsBad = 1
mt = mp.rMatrices[0] # hopefully only one
if mt.free:
complaints.append(
' Part %s is dim 2, but rMatrix 0 is free' % pNum)
partIsBad = 1
mp.nCat = mp.nGammaCat
# If the model part isHet, we need to check that all nodes
# have something assigned, and that all model components are
# used. If the model part is not het, we can skip that,
# but we need to check that all the
# node.parts[pNum].compNum are 0, and all the
# node.br.parts[pNum].rMatrixNum and
# node.br.parts[pNum].gdasrvNum are set to 0.
if not mp.isHet:
# print "model part %i is not het" % pNum
for n in self.iterNodes():
# print("pNum = %i, n.nodeNum=%i, len n.parts = %i" % (pNum, n.nodeNum, len(n.parts)))
n.parts[pNum].compNum = 0
if n != self.root:
n.br.parts[pNum].rMatrixNum = 0
if mp.nGammaCat > 1:
n.br.parts[pNum].gdasrvNum = 0
else: # isHet
# If there is only one comp, rMatrix, or gdasrv, then
# simply set it.
if mp.nComps == 1:
for n in self.iterNodes():
n.parts[pNum].compNum = 0
if mp.nRMatrices == 1:
for n in self.iterNodes():
if n != self.root:
n.br.parts[pNum].rMatrixNum = 0
if mp.nGammaCat > 1 and mp.nGdasrvs == 1:
for n in self.iterNodes():
if n != self.root:
n.br.parts[pNum].gdasrvNum = 0
# print "model part %i is het" % pNum
# New ad hoc attribute 'isUsed', to keep track of whether
# any node uses it.
for mt in mp.comps:
mt.isUsed = 0
for mt in mp.rMatrices:
mt.isUsed = 0
for mt in mp.gdasrvs:
mt.isUsed = 0
# Does every node have all required things?
for n in self.iterNodes():
mtNum = n.parts[pNum].compNum
if mtNum >= 0 and mtNum < mp.nComps:
mt = mp.comps[mtNum]
mt.isUsed = 1
else:
complaints.append(' Part %s, node %s has no comp.' % (pNum, n.nodeNum))
partIsBad = 1
if n != self.root:
mtNum = n.br.parts[pNum].rMatrixNum
if mtNum >= 0 and mtNum < mp.nRMatrices:
mt = mp.rMatrices[n.br.parts[pNum].rMatrixNum]
mt.isUsed = 1
else:
complaints.append(' Part %s, node %s has no rMatrix.' % (pNum, n.nodeNum))
partIsBad = 1
if mp.nGammaCat > 1:
mtNum = n.br.parts[pNum].gdasrvNum
if mtNum >= 0 and mtNum < mp.nGdasrvs:
mt = mp.gdasrvs[n.br.parts[pNum].gdasrvNum]
mt.isUsed = 1
else:
complaints.append(' Part %s, node %s has no gdasrv. nGammaCat=%s' % (
pNum, n.nodeNum, mp.nGammaCat))
partIsBad = 1
if mp.nGammaCat == 1:
if n.br.parts[pNum].gdasrvNum != -1:
complaints.append(' Part %s, node %s has a gdasrv, but nGammaCat is 1.' % (
pNum, n.nodeNum))
partIsBad = 1
# Is every model component used?
for mt in mp.comps:
if not mt.isUsed:
complaints.append(' Part %s, comp %s is not used.' % (pNum, mt.num))
partIsBad = 1
for mt in mp.rMatrices:
if not mt.isUsed:
complaints.append(
' Part %s, rMatrix %s is not used.' % (pNum, mt.num))
partIsBad = 1
for mt in mp.gdasrvs:
if not mt.isUsed:
complaints.append(
' Part %s, gdasrv %s is not used.' % (pNum, mt.num))
partIsBad = 1
# Clean up ad hoc attr 'isUsed'
for mt in mp.comps:
del(mt.isUsed)
for mt in mp.rMatrices:
del(mt.isUsed)
for mt in mp.gdasrvs:
del(mt.isUsed)
if partIsBad:
isBad = 1
else:
complaints.append(' ok')
# ##################################
if resetEmpiricalComps:
self.setEmpiricalComps()
# self.model.isHet if any part isHet
self.model.isHet = 0
for pNum in range(self.model.nParts):
if self.model.parts[pNum].isHet:
self.model.isHet = 1
break
# relativeRates
self.model.doRelRates = 0
if self.model.nParts > 1:
for p in self.model.parts:
if p.relRate != 1.0: # This week, the default relRate is 1.0
self.model.doRelRates = 1
break
if self.model.relRatesAreFree:
self.model.doRelRates = 1
if self.model.doRelRates:
totDataLen = 0
for p in self.data.parts:
totDataLen += p.nChar
fact = 0.0
for i in range(self.model.nParts):
fact += (self.model.parts[i].relRate *
self.data.parts[i].nChar)
fact = float(totDataLen) / fact
for p in self.model.parts:
p.relRate *= fact
if 0:
print("RelativeRates (adjusted for length)")
for i in range(self.model.nParts):
p = self.model.parts[i]
print(" part %s, nChar %5s, relRate %s" % (p.num, self.data.parts[i].nChar, p.relRate))
if 1:
total = 0.0
for i in range(self.model.nParts):
total += (self.model.parts[i].relRate *
(float(self.data.parts[i].nChar) / float(totDataLen)))
if abs(total - 1.0) > 1.0e-12:
gm.append(
'Error in relativeRate calculation (total=%s).' % total)
raise P4Error(gm)
# print "modelSanityCheck. relRatesAreFree=%s, doRelRates=%s" %
# (self.model.relRatesAreFree, self.model.doRelRates)
# model.nFreePrams
self.model.nFreePrams = 0
for mp in self.model.parts:
for mt in mp.comps:
if mt.free:
self.model.nFreePrams += mp.dim - 1
for mt in mp.rMatrices:
if mt.free:
if mt.spec == '2p':
self.model.nFreePrams += 1
else:
self.model.nFreePrams += (
((mp.dim * mp.dim) - mp.dim) / 2) - 1
for mt in mp.gdasrvs:
if mt.free:
self.model.nFreePrams += 1
if mp.pInvar.free:
self.model.nFreePrams += 1
# print "Tree.modelSanityCheck(). Counted %i free params." %
# self.model.nFreePrams
if self.model.doRelRates and self.model.relRatesAreFree:
self.model.nFreePrams += self.model.nParts - 1
if isBad:
gm.append("(Indices are zero-based.)")
gm += complaints
raise P4Error(gm)
[docs]
def setEmpiricalComps(self):
"""Set any empirical model comps to the comp of the data.
This is done by self.modelSanityCheck(), but sometimes you may
want to do it at other times. For example, do this after
exchanging Data objects, or after simulating. In those cases
there does not seem to be a reasonable way to do it
automatically."""
complaintHead = '\nTree.setEmpiricalComps()'
gm = [complaintHead]
if not self.model:
gm.append("This tree has no model.")
raise P4Error(gm)
if not self.data:
gm.append("This tree has no data.")
raise P4Error(gm)
for mp in self.model.parts:
for c in mp.comps:
if c.spec == 'empirical':
# print "got empirical comp, comp %s in part %s. (nComps=%i, isHet=%s)" % (
# c.num, mp.num, mp.nComps, mp.isHet)
if not mp.isHet:
seqNums = None
elif mp.nComps == 1:
seqNums = None
else:
seqNums = []
# for n in self.nodes:
# print "node %2i seqNum=%3i n.parts[%i].compNum=%3i" % (
# n.nodeNum, n.seqNum, mp.num, n.parts[mp.num].compNum)
for n in self.iterNodes():
# Is the comp used by the node?
if n.parts[mp.num].compNum == c.num:
# print "comp %s is used by node %s" % (c.num,
# n.nodeNum)
if n.isLeaf:
nodeNums = [n.nodeNum]
else:
nodeNums = self.getNodeNumsAbove(
n, leavesOnly=1)
# gm.append("nodeNums for %s = %s" %
# (n.nodeNum, nodeNums)
for i in nodeNums:
seqNum = self.nodes[i].seqNum
if seqNum not in seqNums:
seqNums.append(seqNum)
# print "setEmpiricalComps() got seqNums = %s" %
# seqNums
if not seqNums:
gm.append(
"Something is wrong here. part %i, comp %i." % (mp.num, c.num))
gm.append(
"This comp object has no sequences from which to get the empirical comp.")
gm.append(
"Maybe you need to yourTree.setModelModelComponentOnNode() or ")
gm.append("yourTree.setModelComponentsOnNodesRandomly()")
#gm.append(
# "Or maybe its an extra comp in an RJ MCMC? -- If so, fix")
#gm.append("the comp val to eg 'equal'.")
raise P4Error(gm)
# dim long, not dim - 1
c.val = self.data.parts[mp.num].composition(seqNums)
# print " seqNums=%s, c.val=%s" % (seqNums, c.val)
needsNormalizing = 0
for i in range(len(c.val)):
if c.val[i] < var.PIVEC_MIN:
c.val[i] = var.PIVEC_MIN + (0.2 * var.PIVEC_MIN) + (var.PIVEC_MIN * random.random())
needsNormalizing = 1
theSum = numpy.sum(c.val)
# print "setEmpiricalComps(). Got theSum = %i" % theSum
# We may have asked for the comp of an empty sequence,
# in which case val is all zeros. Check for that.
if math.fabs(1.0 - theSum) > 0.1:
gm.append(
"Something is very wrong here. Empirical comp vals should sum to 1.0")
gm.append("The sum of the comp vals for part %s, comp %s, is %s" % (
mp.num, c.num, theSum))
gm.append(
"Probably the sequences from which the composition was taken were blank.")
raise P4Error(gm)
if needsNormalizing or abs(theSum - 1.0) > 1e-16:
c.val /= theSum
#################################################### model fit
####################################################
[docs]
def modelFitTests(self, fName='model_fit_tests_out', writeRawStats=0):
"""Do model fit tests on the data.
The two tests are the Goldman-Cox test, and the tree- and model-
based composition fit test. Both require simulations with
optimizations in order to get a null distribution, and those
simulations need to be done before this method. The simulations
should be done with the simsForModelFitTests() method.
Self should have a data and a model attached, and be optimized.
The Goldman-Cox test (Goldman 1993. Statistical tests of models
of DNA substitution. J Mol Evol 36: 182-198.) is a test for
overall fit of the model to the data. It does not work if the
data have gaps or ambiguities.
The tree- and model-based composition test asks the question:
'Does the composition implied by the model fit the data?' If the
model is homogeneous and empirical comp is used, then this is the
same as the chi-square test except that the null distribution
comes from simulations, not from the chi-square distribution. In
that case only the question is, additionally, 'Are the data
homogeneous in composition?', ie the same question asked by the
chi-square test. However, the data might be heterogeneous, and
the model might be heterogeneous over the tree; the tree- and
model-based composition fit test can ask whether the heterogeneous
model fits the heterogeneous data. The composition is tested in
each data partition, separately. The test is done both overall,
ie for all the sequences together, and for individual sequences.
If you just want a compo homogeneity test with empirical
homogeneous comp, try the compoTestUsingSimulations() method-- its
way faster, because there are not optimizations in the sims part.
Output is verbose, to a file."""
gm = ['Tree.modelFitTests()']
self.calcLogLike(verbose=0)
# Usually True. Set to False for debugging, experimentation, getting
# individual stats, etc
doOut = True
# We can't do the Goldman-Cox test if there are any gaps or
# ambiguities.
doGoldmanCox = True
for a in self.data.alignments:
if a.hasGapsOrAmbiguities():
doGoldmanCox = False
break
# print "test doGoldmanCox = %s" % doGoldmanCox
rawFName = '%s_raw.py' % fName
#flob = sys.stderr
#fRaw = sys.stderr
if doOut:
flob = open(fName, 'w')
else:
flob = None
if writeRawStats:
fRaw = open(rawFName, 'w')
else:
fRaw = None
#######################
# Goldman-Cox stats
#######################
# For a two-part data analysis, the first few lines of the
# sims_GoldmanStats_* file will be like the following. Its in
# groups of 3-- the first one for all parts together (part number
# -1), and the next lines for separate parts.
# part unconstr L log like Goldman-Cox stat
# -1 -921.888705 -1085.696919 163.808215
# 0 -357.089057 -430.941958 73.852901
# 1 -564.799648 -654.754962 89.955314
# -1 -952.063037 -1130.195799 178.132761
# 0 -362.164119 -439.709824 77.545705
# ... and so on.
# For a one-part analysis, it will be the same except that one sim
# gets only one line, starting with zero.
if doGoldmanCox:
goldmanOverallSimStats = []
if self.data.nParts > 1:
goldmanIndividualSimStats = []
for partNum in range(self.data.nParts):
goldmanIndividualSimStats.append([])
import glob
goldmanFNames = glob.glob('sims_GoldmanStats_*')
# print "nParts=%s" % self.data.nParts
# print goldmanFNames
for fName1 in goldmanFNames:
f2 = open(fName1)
aLine = f2.readline()
if not aLine:
gm.append("Empty file %s" % fName1)
raise P4Error(gm)
if aLine[0] != '#':
gm.append(
"Expecting a '#' as the first character in file %s" % fName1)
raise P4Error(gm)
aLine = f2.readline()
# print "a got line %s" % aLine,
while aLine:
if self.data.nParts > 1:
splitLine = aLine.split()
if len(splitLine) != 4:
gm.append(
"Bad line in Goldman stats file %s" % fName1)
gm.append("'%s'" % aLine)
raise P4Error(gm)
if int(splitLine[0]) != -1:
gm.append(
"Bad line in Goldman stats file %s" % fName1)
gm.append("First item should be -1")
gm.append("'%s'" % aLine)
raise P4Error(gm)
# print splitLine[-1]
goldmanOverallSimStats.append(float(splitLine[-1]))
aLine = f2.readline()
# print "b got line %s" % aLine,
if not aLine:
gm.append("Premature end to file %s" % fName1)
raise P4Error(gm)
for partNum in range(self.data.nParts):
splitLine = aLine.split()
# print "partNum %i, splitLine=%s" % (partNum,
# splitLine)
if len(splitLine) != 4:
gm.append(
"Bad line in Goldman stats file %s" % fName1)
gm.append("'%s'" % aLine)
raise P4Error(gm)
try:
splitLine[0] = int(splitLine[0])
except ValueError:
gm.append(
"Bad line in Goldman stats file %s" % fName1)
gm.append(
"First item should be the partNum %i" % partNum)
gm.append("'%s'" % aLine)
raise P4Error(gm)
if splitLine[0] != partNum:
gm.append(
"Bad line in Goldman stats file %s" % fName1)
gm.append(
"First item should be the partNum %i" % partNum)
gm.append("'%s'" % aLine)
raise P4Error(gm)
# for taxNum in range(self.data.nTax):
# print splitLine[taxNum + 1]
# print splitLine[-1]
if self.data.nParts == 1:
goldmanOverallSimStats.append(float(splitLine[-1]))
else:
goldmanIndividualSimStats[
partNum].append(float(splitLine[-1]))
aLine = f2.readline()
# print "c got line %s" % aLine,
f2.close()
# print "goldmanOverallSimStats =", goldmanOverallSimStats
# print "goldmanIndividualSimStats =", goldmanIndividualSimStats
# sys.exit()
if doOut:
flob.write('Model fit tests\n===============\n\n')
flob.write(
'The data that we are testing have %i taxa,\n' % self.data.nTax)
if len(self.data.alignments) == 1:
flob.write('1 alignment, ')
else:
flob.write('%i alignments, ' % len(self.data.alignments))
if self.data.nParts == 1:
flob.write('and 1 data partition.\n')
else:
flob.write('and %i data partitions.\n' % self.data.nParts)
flob.write('The lengths of those partitions are as follows:\n')
flob.write(' partNum nChar \n')
for i in range(self.data.nParts):
flob.write(' %3i %5i\n' %
(i, self.data.parts[i].nChar))
self.data.calcUnconstrainedLogLikelihood2()
if doOut:
flob.write("\nThe unconstrained likelihood is %f\n" %
self.data.unconstrainedLogLikelihood)
flob.write(
'(This is the partition-by-partition unconstrained log likelihood, \n')
flob.write(
'ie the sum of the unconstrained log likes from each partition separately, \n')
flob.write(
'and so will not be the same as that given by PAUP, if the data are partitioned.)\n')
flob.write('\n\nGoldman-Cox test for overall model fit\n')
flob.write('======================================\n')
flob.write(
'The log likelihood for these data for this tree is %f\n' % self.logLike)
flob.write('The unconstrained log likelihood for these data is %f\n' %
self.data.unconstrainedLogLikelihood)
originalGoldmanCoxStat = self.data.unconstrainedLogLikelihood - \
self.logLike
if doOut:
flob.write(
'The Goldman-Cox statistic for the original data is the difference, %f\n' % originalGoldmanCoxStat)
if self.data.nParts > 1:
flob.write(
'(The unconstrained log likelihood for these data is calculated partition by partition.)\n')
flob.write('\n')
if self.data.nParts > 1:
originalGoldmanCoxStatsByPart = []
if doOut:
flob.write('Stats by partition.\n')
flob.write(
'part\t unconstrLogL\t log like \tGoldman-Cox stat\n')
flob.write(
'----\t ----------\t -------- \t----------------\n')
for partNum in range(self.data.nParts):
unc = pf.getUnconstrainedLogLike(
self.data.parts[partNum].cPart)
like = pf.p4_partLogLike(
self.cTree, self.data.parts[partNum].cPart, partNum, 0)
diff = unc - like
if doOut:
flob.write(' %i\t%f\t%f\t %f\n' %
(partNum, unc, like, diff))
originalGoldmanCoxStatsByPart.append(diff)
# Do the overall stat
nSims = len(goldmanOverallSimStats)
if doOut:
flob.write('\nThere were %i simulations.\n\n' % nSims)
if writeRawStats:
fRaw.write('# Goldman-Cox null distributions.\n')
if self.data.nParts > 1:
fRaw.write(
'# Simulation stats for overall data, ie for all data partitions combined.\n')
else:
fRaw.write('# Simulation stats.\n')
fRaw.write('goldman_cox_overall = %s\n' %
goldmanOverallSimStats)
if self.data.nParts > 1:
for partNum in range(self.data.nParts):
fRaw.write(
'# Simulation stats for data partition %i\n' % partNum)
fRaw.write('goldman_cox_part%i = %s\n' %
(partNum, goldmanIndividualSimStats[partNum]))
prob = p4.func.tailAreaProbability(
originalGoldmanCoxStat, goldmanOverallSimStats, verbose=0)[2]
if doOut:
flob.write('\n Overall Goldman-Cox test: ')
if prob <= 0.05:
flob.write('%13s' % "Doesn't fit.")
else:
flob.write('%13s' % 'Fits.')
flob.write(' P = %5.3f\n' % prob)
if self.data.nParts > 1:
if doOut:
flob.write(' Tests for individual data partitions:\n')
for partNum in range(self.data.nParts):
prob = p4.func.tailAreaProbability(originalGoldmanCoxStatsByPart[partNum],
goldmanIndividualSimStats[partNum], verbose=0)[2]
if doOut:
flob.write(
' Part %-2i: ' % partNum)
if prob <= 0.05:
flob.write('%13s' % 'Doesn\'t fit.')
else:
flob.write('%13s' % 'Fits.')
flob.write(' P = %5.3f\n' % prob)
#########################
# COMPOSITION
#########################
statsHashList = []
for pNum in range(self.data.nParts):
h = {}
statsHashList.append(h)
h['individualNSites'] = []
h['observedIndividualCounts'] = []
for j in range(self.data.nTax):
# print pf.partSequenceSitesCount(self.data.parts[pNum].cPart,
# j)
h['individualNSites'].append(
pf.partSequenceSitesCount(self.data.parts[pNum].cPart, j)) # no gaps or qmarks
# print self.data.parts[pNum].composition([j])
h['observedIndividualCounts'].append(
self.data.parts[pNum].composition([j]))
# The line above is temporarily composition, not counts
# pf.expectedCompositionCounts returns a tuple of tuples
# representing the counts of the nodes in proper alignment order.
h['expectedIndividualCounts'] = list(
pf.p4_expectedCompositionCounts(self.cTree, pNum)) # alignment order
# At the moment, h['observedIndividualCounts'] has composition,
# not counts. So multiply by h['individualNSites']
for i in range(self.data.nTax):
for j in range(self.data.parts[pNum].dim):
h['observedIndividualCounts'][i][
j] *= h['individualNSites'][i]
# We will want to skip any sequences composed of all gaps
skipTaxNums = []
for pNum in range(self.data.nParts):
stn = []
for tNum in range(self.data.nTax):
if not statsHashList[pNum]['individualNSites'][tNum]:
stn.append(tNum)
skipTaxNums.append(stn)
# print "skipTaxNums = %s" % skipTaxNums
# Do the boring old compo chi square test.
if doOut:
flob.write(longMessage1) # explanation ...
for pNum in range(self.data.nParts):
h = statsHashList[pNum]
# Can't use p4.func.xSquared(), because there might be column
# zeros.
# print "observedIndividualCounts = %s' %
# h['observedIndividualCounts"]
nRows = len(h['observedIndividualCounts'])
nCols = len(h['observedIndividualCounts'][0])
# I could have just used nSites, above
theSumOfRows = p4.func._sumOfRows(h['observedIndividualCounts'])
theSumOfCols = p4.func._sumOfColumns(h['observedIndividualCounts'])
# print theSumOfCols
isOk = 1
columnZeros = []
# for j in range(len(theSumOfRows)):
# if theSumOfRows[j] == 0.0:
# gm.append("Zero in a row sum. Programming error.")
# raise P4Error(gm)
for j in range(len(theSumOfCols)):
if theSumOfCols[j] <= 0.0:
columnZeros.append(j)
theExpected = p4.func._expected(theSumOfRows, theSumOfCols)
# print "theExpected = %s" % theExpected
# print "columnZeros = %s" % columnZeros
xSq = 0.0
for rowNum in range(nRows):
if rowNum in skipTaxNums[pNum]:
pass
else:
xSq_row = 0.0
for colNum in range(nCols):
if colNum in columnZeros:
pass
else:
theDiff = h['observedIndividualCounts'][rowNum][
colNum] - theExpected[rowNum][colNum]
xSq_row += (theDiff * theDiff) / \
theExpected[rowNum][colNum]
xSq += xSq_row
dof = (nCols - len(columnZeros) - 1) * \
(nRows - len(skipTaxNums[pNum]) - 1)
prob = p4.func.chiSquaredProb(xSq, dof)
if doOut:
flob.write(
' Part %i: Chi-square = %f, (dof=%i) P = %f\n' % (pNum, xSq, dof, prob))
for pNum in range(self.data.nParts):
h = statsHashList[pNum]
h['overallStat'] = 0.0
h['individualStats'] = [0.0] * self.data.nTax
for i in range(self.data.nTax):
if i in skipTaxNums[pNum]:
pass # h['individualStats'] stays at zeros
else:
for j in range(self.data.parts[pNum].dim):
# Avoid dividing by Zero.
if h['expectedIndividualCounts'][i][j]:
dif = h['observedIndividualCounts'][i][
j] - h['expectedIndividualCounts'][i][j]
h['individualStats'][
i] += ((dif * dif) / h['expectedIndividualCounts'][i][j])
h['overallStat'] += h['individualStats'][i]
h['overallSimStats'] = []
h['individualSimStats'] = []
for i in range(self.data.nTax):
h['individualSimStats'].append([])
if 0:
print("h['individualNSites'] = %s" % h['individualNSites'])
print("h['observedIndividualCounts'] = %s" % h['observedIndividualCounts'])
print("h['expectedIndividualCounts'] = %s" % h['expectedIndividualCounts'])
print("h['overallStat'] = %s" % h['overallStat'])
print("h['individualStats'] = %s" % h['individualStats'])
raise P4Error(gm)
import glob
compoFNames = glob.glob('sims_CompStats_*')
# print compoFNames
for fName1 in compoFNames:
f2 = open(fName1)
aLine = f2.readline()
if not aLine:
gm.append("Empty file %s" % fName1)
raise P4Error(gm)
# print "a got line %s" % aLine,
while aLine:
for partNum in range(self.data.nParts):
h = statsHashList[partNum]
splitLine = aLine.split()
if len(splitLine) != (self.data.nTax + 2):
gm.append(
"Bad line in composition stats file %s" % fName1)
gm.append("'%s'" % aLine)
raise P4Error(gm)
if int(splitLine[0]) != partNum:
gm.append(
"Bad line in composition stats file %s" % fName1)
gm.append(
"First item should be the partNum %i" % partNum)
gm.append("'%s'" % aLine)
raise P4Error(gm)
# for taxNum in range(self.data.nTax):
# print splitLine[taxNum + 1]
# print splitLine[-1]
h['overallSimStats'].append(float(splitLine[-1]))
for i in range(self.data.nTax):
h['individualSimStats'][i].append(
float(splitLine[i + 1]))
#raise P4Error(gm)
aLine = f2.readline()
if not aLine:
break
# print "b got line %s" % aLine,
f2.close()
nSims = len(statsHashList[0]['overallSimStats'])
if doOut:
# Explain tree- and model-based compo fit stat, X^2_m
flob.write(longMessage2)
flob.write(' %i simulation reps were used.\n\n' % nSims)
spacer1 = ' ' * 10
for partNum in range(self.data.nParts):
h = statsHashList[partNum]
if doOut:
flob.write('Part %-2i:\n-------\n\n' % partNum)
flob.write('Statistics from the original data\n')
flob.write('%s%30s: %f\n' %
(spacer1, 'Overall observed stat', h['overallStat']))
flob.write('%s%30s:\n' %
(spacer1, 'Stats for individual taxa'))
for taxNum in range(self.data.nTax):
if taxNum not in skipTaxNums[partNum]:
flob.write('%s%30s: %f\n' % (
spacer1, self.data.taxNames[taxNum], h['individualStats'][taxNum]))
else:
flob.write('%s%30s: skipped\n' %
(spacer1, self.data.taxNames[taxNum]))
flob.write(
'\nAssessment of fit from null distribution from %i simulations\n' % nSims)
flob.write('%s%30s: ' % (spacer1, 'Overall'))
prob = p4.func.tailAreaProbability(
h['overallStat'], h['overallSimStats'], verbose=0)[2]
if doOut:
if prob <= 0.05:
flob.write('%13s' % 'Doesn\'t fit.')
else:
flob.write('%13s' % 'Fits.')
flob.write(' P = %5.3f\n' % prob)
#############
theRet = prob
#############
for taxNum in range(self.data.nTax):
if doOut:
flob.write('%s%30s: ' %
(spacer1, self.data.taxNames[taxNum]))
if taxNum in skipTaxNums[partNum]:
if doOut:
flob.write('%13s\n' % 'skipped.')
else:
prob = p4.func.tailAreaProbability(h['individualStats'][taxNum],
h['individualSimStats'][taxNum], verbose=0)[2]
if doOut:
if prob <= 0.05:
flob.write('%13s' % "Doesn't fit.")
else:
flob.write('%13s' % 'Fits.')
flob.write(' P = %5.3f\n' % prob)
if writeRawStats:
fRaw.write('#\n# Tree and model based composition fit test\n')
fRaw.write('# =========================================\n')
fRaw.write(
'# Simulation statistics, ie the null distributions\n\n')
fRaw.write('# Part %i:\n' % partNum)
fRaw.write('part%i_overall_compo_null = %s\n' %
(partNum, h['overallSimStats']))
for taxNum in range(self.data.nTax):
fRaw.write('part%i_%s_compo_null = %s\n' % (partNum,
_fixFileName(
self.data.taxNames[taxNum]),
h['individualSimStats'][taxNum]))
# Yes, it is possible to close sys.stdout
if flob and flob != sys.stdout:
flob.close()
if fRaw and fRaw != sys.stdout:
fRaw.close()
return theRet
[docs]
def compoTestUsingSimulations(self, nSims=100, doIndividualSequences=0, doChiSquare=0, verbose=1):
"""Compositional homogeneity test using a null distribution from simulations.
This does a compositional homogeneity test on each data partition.
The statistic used here is X^2, obtained via
Data.compoChiSquaredTest().
The null distribution of the stat is made using simulations, so of
course you need to provide a tree with a model, with optimized
branch lengths and model parameters. This is a comp homogeneity
test, so the model should be tree-homogeneous.
The analysis usually tests all sequences in the data partition
together (like paup), but you can also 'doIndividualSequences'
(like puzzle). Beware that the latter is a multiple simultaneous
stats test, with associated Type I Error.
For purposes of comparison, this test can also do compo tests in
the style of PAUP and puzzle, using chi-square to assess
significance. Do this by turning 'doChiSquare' on. The compo test
in PAUP tests all sequences together, while the compo test in
puzzle tests all sequences separately. There are advantages and
disadvantages to the latter-- doing all sequences separately
allows you to identify the worst offenders, but suffers due to the
problems of multiple simultaneous stats tests. There are slight
differences between the computation of the Chi-square in PAUP and
puzzle and the p4 version. The compo test in PAUP (basefreq) does
the chi-squared test, but if sequences are blank it still counts
them in the degrees of freedom; p4 does not count blank sequences
in the degrees of freedom. Puzzle simply uses the row sums, ie
the contributions of each sequence to the total X-squared, and
assesses significance with chi-squared using the number of symbols
minus 1 as the degrees of freedom. Ie for DNA dof=3, for protein
dof=19. Puzzle correctly gets the composition from sequences with
gaps, but does not do the right thing for sequences with
ambiguities like r, y, and so on. P4 does calculate the
composition correctly when there are such ambiguities. So p4 will
give you the same numbers as paup and puzzle for the chi-squared
part as long as you don't have blank sequences or ambiguities like
r and y.
This uses the Data.compoChiSquaredTest() method to get the
stats. See the doc string for that method, where it describes how
zero column sums (ie some character is absent) can be dealt with.
Here, when that method is invoked, 'skipColumnZeros' is turned on,
so that the analysis is robust against data with zero or low
values for some characters.
For example::
# First, do a homog opt, and pickle the optimized tree.
# Here I use a bionj tree, but you could use whatever.
read('d.nex')
a = var.alignments[0]
dm = a.pDistances()
t = dm.bionj()
d = Data()
t.data = d
t.newComp(free=1, spec='empirical')
t.newRMatrix(free=1, spec='ones')
t.setNGammaCat(nGammaCat=4)
t.newGdasrv(free=1, val=0.5)
t.setPInvar(free=0, val=0.0)
t.optLogLike()
t.name = 'homogOpt'
t.tPickle()
# Then, do the test ...
read('homogOpt.p4_tPickle')
t = var.trees[0]
read('d.nex')
d = Data()
t.data = d
t.compoTestUsingSimulations()
# Output would be something like ...
# Composition homogeneity test using simulations.
# P-values are shown.
# Part Num 0
# Part Name all
# -------------------- --------
# All Sequences 0.0000
# Or using more sims for more precision, and also doing the
# Chi-square test for contrast ...
t.compoTestUsingSimulations(nSims=1000, doChiSquare=True)
# Output might be something like ...
# Composition homogeneity test using simulations.
# P-values are shown.
# (P-values from Chi-Square are shown in parens.)
# Part Num 0
# Part Name all
# -------------------- --------
# All Sequences 0.0140
# (Chi-Squared Prob) (0.9933)
# It returns the P-value for the sims for the first data
# partition.
It is often the case, as above, that this test will show
significance while the Chi-square test does not.
"""
gm = ['Tree.compoTestUsingSimulations()']
# print "inComp = %s" % self.model.parts[0].comps[0].val
if not self.data:
gm.append("No data. Set the data first.")
raise P4Error(gm)
if not self.model:
gm.append("No model. You need to set the model first.")
raise P4Error(gm)
self.modelSanityCheck()
if self.model.isHet:
gm.append("The model for this tree is tree-heterogeneous.")
gm.append("This test is not valid for tree-hetero models.")
raise P4Error(gm)
# Make a new data object in which to do the sims, so we do not over-write self
# print "a self.data = %s" % self.data
# self.data.dump()
savedData = self.data
self.data = None # This triggers self.deleteCStuff()
self.data = savedData.dupe()
# print "b self.data = %s" % self.data
# self.data.dump()
#raise P4Error(gm)
# Check for missing sequences in any of the parts. Missing seq
# nums go in skips, a list of lists.
skips = []
for pNum in range(self.data.nParts):
skips.append([])
for pNum in range(self.data.nParts):
for tNum in range(self.data.nTax):
nSites = pf.partSequenceSitesCount(
self.data.parts[pNum].cPart, tNum) # no gaps, no missings
if not nSites:
skips[pNum].append(tNum)
# Get the original stats from self.data.
# compoChiSquaredTest(self, verbose=1, skipColumnZeros=0, useConstantSites=1, skipTaxNums=None, getRows=0)
original = self.data.compoChiSquaredTest(verbose=0,
skipColumnZeros=1,
skipTaxNums=skips,
getRows=doIndividualSequences)
# print "original =", original
# Make some empty lists in which to put our stats
full = []
if doIndividualSequences:
rows = []
for pNum in range(self.data.nParts):
full.append([])
if doIndividualSequences:
onePartRows = []
for i in range(self.data.nTax):
onePartRows.append([])
rows.append(onePartRows)
# Do the sims
for i in range(nSims):
# if i < 5:
# print "%i simComp = %s" % (i, self.model.parts[0].comps[0].val)
self.simulate()
ret = self.data.compoChiSquaredTest(skipColumnZeros=1,
skipTaxNums=skips, getRows=doIndividualSequences, verbose=0)
# print "%i ret=%s" % (i, ret)
for pNum in range(self.data.nParts):
full[pNum].append(ret[pNum][0])
if doIndividualSequences:
for tNum in range(self.data.nTax):
if tNum not in skips[pNum]:
rows[pNum][tNum].append(ret[pNum][3][tNum])
# Find the longest part name length, and heading width, so the output
# looks nice.
partWid = 8
for p in self.data.parts:
if len(p.name) > partWid:
partWid = len(p.name)
partWid += 2
headWid = 20
for tN in self.data.taxNames:
if len(tN) > headWid:
headWid = len(tN)
headWid += 2
headSig = '%' + "%i" % (headWid - 2) + 's '
# Get the all-sequences tail area probs
partTaps = []
for pNum in range(self.data.nParts):
partTaps.append(
p4.func.tailAreaProbability(original[pNum][0], full[pNum], verbose=0)[2])
#print("partTaps is ", partTaps)
# Intro
if verbose:
print("Composition homogeneity test using simulations.")
print("P-values are shown.")
if doChiSquare:
print("(P-values from Chi-Square are shown in parens.)")
print()
# Print the Part Nums and Part Names
if verbose:
print(headSig % 'Part Num', end=' ')
for pNum in range(self.data.nParts):
print(('%i' % pNum).center(partWid), end=' ')
print()
print(headSig % 'Part Name', end=' ')
for pNum in range(self.data.nParts):
print(self.data.parts[pNum].name.center(partWid), end=' ')
print()
print(headSig % ('-' * (headWid - 2)), end=' ')
for pNum in range(self.data.nParts):
print(('-' * (partWid - 2)).center(partWid), end=' ')
print()
# Print the all-sequences results
if verbose:
print(headSig % 'All Sequences', end=' ')
for pNum in range(self.data.nParts):
print(('%6.4f' % partTaps[pNum]).center(partWid), end=' ')
print()
if doChiSquare:
print(headSig % '(Chi-Squared Prob)', end=' ')
for pNum in range(self.data.nParts):
print(('(%6.4f)' % original[pNum][2]).center(partWid), end=' ')
print()
if doIndividualSequences and verbose:
print()
# print "Individual sequences"
# print "--------------------"
for tNum in range(self.data.nTax):
print(headSig % self.data.taxNames[tNum], end=' ')
for pNum in range(self.data.nParts):
if tNum not in skips[pNum]:
ret = p4.func.tailAreaProbability(
original[pNum][3][tNum], rows[pNum][tNum], verbose=0)[2]
print(('%6.4f' % ret).center(partWid), end=' ')
else:
print(('-' * 4).center(partWid), end=' ')
print()
if doChiSquare:
print(headSig % ' ', end=' ')
for pNum in range(self.data.nParts):
# degrees of freedom
dof = self.data.parts[pNum].dim - 1
if tNum not in skips[pNum]:
ret = p4.func.chiSquaredProb(
original[pNum][3][tNum], dof)
print(('(%6.4f)' % ret).center(partWid), end=' ')
else:
print(('-' * 4).center(partWid), end=' ')
print()
# Replace the saved data
# Since we are replacing an exisiting data, this triggers
# self.deleteCStuff()
self.data = savedData
#print(partTaps, "\n" * 2) # one for each data part
return partTaps[0] # only the first part
[docs]
def bigXSquaredSubM(self, verbose=False):
"""Calculate the X^2_m stat
This can handle gaps and ambiguities.
Column zeros in the observed is not a problem with this stat,
as we are dividing by the expected composition, and that comes
from the model, which does not allow compositions with values
of zero. """
if not self.cTree:
self._commonCStuff(resetEmpiricalComps=True)
l = []
for pNum in range(self.data.nParts):
if verbose:
print("Part %i" % pNum)
print("======")
obs = []
nSites = [] # no gaps or ?
for taxNum in range(self.nTax):
thisNSites = pf.partSequenceSitesCount(
self.data.parts[pNum].cPart, taxNum)
comp = self.data.parts[pNum].composition([taxNum])
for symbNum in range(self.data.parts[pNum].dim):
comp[symbNum] *= thisNSites
nSites.append(thisNSites)
obs.append(comp)
if verbose:
print("\n Observed")
print(" " * 10, end=' ')
for symbNum in range(self.data.parts[pNum].dim):
print("%8s" % self.data.parts[pNum].symbols[symbNum], end=' ')
print()
for taxNum in range(self.nTax):
print("%10s" % self.taxNames[taxNum], end=' ')
for symbNum in range(self.data.parts[pNum].dim):
print("%8.4f" % obs[taxNum][symbNum], end=' ')
print(" n=%i" % nSites[taxNum])
# pf.p4_expectedCompositionCounts returns a tuple of tuples
# representing the counts of the nodes in proper alignment order.
exp = list(pf.p4_expectedCompositionCounts(self.cTree, pNum))
if verbose:
print("\n Expected")
print(" " * 10, end=' ')
for symbNum in range(self.data.parts[pNum].dim):
print("%8s" % self.data.parts[pNum].symbols[symbNum], end=' ')
print()
for taxNum in range(self.nTax):
print("%10s" % self.taxNames[taxNum], end=' ')
for symbNum in range(self.data.parts[pNum].dim):
print("%8.4f" % exp[taxNum][symbNum], end=' ')
print(" n=%i" % nSites[taxNum])
# do the summation
theSum = 0.0
for taxNum in range(self.nTax):
for symbNum in range(self.data.parts[pNum].dim):
x = obs[taxNum][symbNum] - exp[taxNum][symbNum]
theSum += (x * x) / exp[taxNum][symbNum]
l.append(theSum)
if verbose:
print("The bigXSquaredSubM stat for this part is %.5f" % theSum)
return l
[docs]
def compStatFromCharFreqs(self, verbose=False):
"""Calculate a statistic from observed and model character frequencies.
Call it c_m, little c sub m.
It is calculated from observed character frequencies and character
frequencies expected from the (possibly tree-heterogeneous) model.
It would be the sum of abs(obs-exp)/exp
"""
if not self.cTree:
self._commonCStuff(resetEmpiricalComps=True)
# pf.p4_expectedComposition returns a tuple of tuples of tuples
# representing the counts of the nodes in proper alignment order.
exp = pf.p4_expectedComposition(self.cTree)
l = []
for pNum in range(self.data.nParts):
if verbose:
print("Part %i" % pNum)
print("======")
obs = []
for taxNum in range(self.nTax):
comp = self.data.parts[pNum].composition([taxNum])
obs.append(comp)
if verbose:
print("\n Observed")
print(" " * 10, end=' ')
for symbNum in range(self.data.parts[pNum].dim):
print("%8s" % self.data.parts[pNum].symbols[symbNum], end=' ')
print()
for taxNum in range(self.nTax):
print("%10s" % self.taxNames[taxNum], end=' ')
for symbNum in range(self.data.parts[pNum].dim):
print("%8.4f" % obs[taxNum][symbNum], end=' ')
print()
if verbose:
print("\n Expected")
print(" " * 10, end=' ')
for symbNum in range(self.data.parts[pNum].dim):
print("%8s" % self.data.parts[pNum].symbols[symbNum], end=' ')
print()
for taxNum in range(self.nTax):
print("%10s" % self.taxNames[taxNum], end=' ')
for symbNum in range(self.data.parts[pNum].dim):
print("%8.4f" % exp[pNum][taxNum][symbNum], end=' ')
print()
# do the summation
theSum = 0.0
for taxNum in range(self.nTax):
for symbNum in range(self.data.parts[pNum].dim):
theSum += math.fabs(obs[taxNum][symbNum] - exp[pNum]
[taxNum][symbNum]) / exp[pNum][taxNum][symbNum]
l.append(theSum)
if verbose:
print("The c_m stat for this part is %.5f" % theSum)
return l
[docs]
def getEuclideanDistanceFromSelfDataToExpectedComposition(self):
"""Calculate the c_E stat between self.data and model expected composition.
The expected composition comes from the current tree (self) and model.
There is an expected composition of each sequence in each part, and is
obtained via pf.p4_expectedComposition(cTree). In non-stationary
evolution, the expected composition of sequences approach the model
composition asymptotically as the branch increases.
I am calling the Euclidean distance from the actual sequence composition
to the expected composition c_E.
Returns:
A list of lists --- the c_E for each sequence, for each part.
Order of the sequences is as in the Data.
"""
if not self.cTree:
self.calcLogLike(verbose=False)
expected = pf.p4_expectedComposition(self.cTree)
#print expected
allParts = []
for pNum in range(self.model.nParts):
thePart = self.data.parts[pNum]
statsForPart = []
for seqNum in range(self.data.nTax):
expectedForPartSeq = expected[pNum][seqNum]
compForPartSeq = thePart.composition([seqNum])
c_E = 0.0
for cNum in range(thePart.dim):
dif = expectedForPartSeq[cNum] - compForPartSeq[cNum]
c_E += (dif * dif)
c_E = math.sqrt(c_E)
statsForPart.append(c_E)
allParts.append(statsForPart)
return allParts
######################################################## opt and sim
########################################################
def __del__(self, freeTree=pf.p4_freeTree, freeNode=pf.p4_freeNode, mysys=sys):
#mysys.stdout.write('Tree.__del__() here.\n')
# mysys.stdout.flush()
# Refers to nodes, which causes grief.
if hasattr(self, "splitKeyHash"):
del(self.splitKeyHash)
self._data = None
# self._model = None # model is needed for freeNode()
# If this is not here, then nodes tend to hang around forever ...
if 1:
for n in self.nodes:
n.wipe()
for n in self.nodes:
if n.cNode:
#mysys.stdout.write(' Tree.__del__(), freeing node %i\n' % n.nodeNum)
# mysys.stdout.flush()
freeNode(n.cNode)
n.cNode = None
for n in self.nodes:
del(n)
self.root = None
self.nodes = None
if self.cTree:
if self.doDataPart:
dp_freeTree(self.cTree)
else:
freeTree(self.cTree)
self.cTree = None
#mysys.stdout.write('Tree.__del__() finished.\n')
# mysys.stdout.flush()
[docs]
def deleteCStuff(self):
"""Deletes c-pointers from nodes, self, and model, but not the data."""
# print 'Tree.deleteCStuff() here.'
for n in self.nodes:
if n.cNode:
# print ' about to free node %i, cNode %s' % (n.nodeNum,
# n.cNode)
pf.p4_freeNode(n.cNode)
n.cNode = 0
if self.cTree:
# print ' about to free cTree'
pf.p4_freeTree(self.cTree)
self.cTree = 0
# I need to delay deleting the cModel until after deleting the
# self.cStuff, because free-ing self.cStuff (eg nodes)
# requires the cModel.
if self.model and self.model.cModel:
# print ' about to free cModel'
pf.p4_freeModel(self.model.cModel)
self.model.cModel = 0
[docs]
def _allocCStuff(self, resetEmpiricalComps=True):
"""Allocate c-memory for self and its nodes."""
gm = ['Tree._allocCStuff()']
# Make sure the nodeNums go from zero to N-1
for i,n in enumerate(self.iterNodes()): # allow NO_ORDER nodes at the end
if self.nodes[i].nodeNum != i:
gm.append(
"Programming error: Problem with node number %i." % i)
gm.append("Nodes should be numbered consecutively from zero.")
raise P4Error(gm)
self.modelSanityCheck(resetEmpiricalComps=resetEmpiricalComps)
if not self.data.cData:
self.data._setCStuff()
if not self.model.cModel:
self.model.allocCStuff()
if var.doDataPart:
# print 'about to dp_newTree'
self.cTree = pf.dp_newTree(len(self.nodes), self.preOrder,
self.postOrder, self.data.cData, self.model.cModel)
self.doDataPart = 1
if not self.cTree:
gm.append("Unable to allocate a cTree")
raise P4Error(gm)
for n in self.iterNodes():
n.doDataPart = 1
# print 'about to dp_newNode (%i)' % n.nodeNum
cNode = pf.dp_newNode(
n.nodeNum, self.cTree, n.seqNum, n.isLeaf)
if not cNode:
gm.append("Unable to allocate a cNode.")
raise P4Error(gm)
n.cNode = cNode
else:
nLeaves = 0
for n in self.iterNodes():
if n.isLeaf:
nLeaves += 1
self.partLikes = numpy.zeros(self.model.nParts, dtype=numpy.double)
self.cTree = pf.p4_newTree(len(list(self.iterNodes())), nLeaves, self.preOrder,
self.postOrder, var._newtAndBrentPowellOptPassLimit, self.partLikes,
self.data.cData, self.model.cModel)
if not self.cTree:
gm.append("Unable to allocate a cTree")
raise P4Error(gm)
for i in range(len(self.nodes)):
n = self.nodes[i]
if n.nodeNum == var.NO_ORDER:
continue # seems a little mixed up.
if i in self.preOrder:
inTree = 1
else:
inTree = 0
# We include the inTree as a flag for whether the node
# is in the tree or not. If the inTree flag is 0,
# then the node is not actually part of the tree, and so
# clNeedsUpdating is turned off.
n.cNode = pf.p4_newNode(
n.nodeNum, self.cTree, n.seqNum, n.isLeaf, inTree)
if not n.cNode:
gm.append("Unable to allocate a cNode")
raise P4Error(gm)
[docs]
def setCStuff(self):
"""Transfer info about self to c-language stuff.
Transfer relationships among nodes, the root position, branch
lengths, model usage info (ie what model attributes apply to what
nodes), and pre- and post-order."""
#gm = ['Tree.setCStuff()']
# Set node relations, br.len, root, node modelNums, preOrder?,
# postOrder
# Set relations- parent, leftChild, sibling. Here's the code for
# pf.p4_setRelative(int theCNode, int relation, int relNum)
# parent- relation = 0, leftChild- relation = 1, sibling- relation
# = 2
for n in self.iterNodes():
if n.parent:
pf.p4_setNodeRelation(n.cNode, 0, n.parent.nodeNum)
else:
pf.p4_setNodeRelation(n.cNode, 0, -1) # "-1" gives NULL
if n.leftChild:
pf.p4_setNodeRelation(n.cNode, 1, n.leftChild.nodeNum)
else:
pf.p4_setNodeRelation(n.cNode, 1, -1)
if n.sibling:
pf.p4_setNodeRelation(n.cNode, 2, n.sibling.nodeNum)
else:
pf.p4_setNodeRelation(n.cNode, 2, -1)
# Root
pf.p4_setTreeRoot(self.cTree, self.root.cNode)
# br.lens
for n in self.iterNodesNoRoot():
#pf.p4_setBrLen(n.cNode, n.br.len, n.br.lenChanged)
pf.p4_setBrLen(n.cNode, n.br.len)
# Model usage info
if self.model.isHet:
for pNum in range(self.model.nParts):
if self.model.parts[pNum].isHet:
# print "setCStuff(). about to setCompNum"
for n in self.iterNodes():
pf.p4_setCompNum(n.cNode, pNum, n.parts[pNum].compNum)
if n != self.root:
pf.p4_setRMatrixNum(
n.cNode, pNum, n.br.parts[pNum].rMatrixNum)
pf.p4_setGdasrvNum(
n.cNode, pNum, n.br.parts[pNum].gdasrvNum)
# pre- and postOrder
if not self.preAndPostOrderAreValid:
self.setPreAndPostOrder()
[docs]
def _commonCStuff(self, resetEmpiricalComps=True):
"""Allocate and set c-stuff, and setPrams."""
if not self.data:
if self.name:
gm = ["Tree %s (_commonCStuff)" % self.name]
else:
gm = ["Tree (_commonCStuff)"]
gm.append(
"This tree has no data attached. Before doing an optimization, likelihood")
gm.append(
"calculation, or simulation, you need to do something like this:")
gm.append(" theTree.data = theData")
raise P4Error(gm)
#print("self.cTree = %s" % self.cTree)
if not self.cTree:
# This calls self.modelSanityCheck(), which calls
# self.setEmpiricalComps()
self._allocCStuff(resetEmpiricalComps=resetEmpiricalComps)
#print("About to self.model.setCStuff()")
self.model.setCStuff()
#print("About to self.setCStuff()")
self.setCStuff()
#print("about to p4_setPrams()...")
pf.p4_setPrams(self.cTree, -1) # "-1" means do all parts
#print("finished _commonCStuff()")
[docs]
def calcLogLike(self, verbose=1, resetEmpiricalComps=True):
"""Calculate the likelihood of the tree, without optimization."""
self._commonCStuff(resetEmpiricalComps=resetEmpiricalComps)
# print("about to p4_treeLogLike()...")
# second arg is getSiteLikes
self.logLike = pf.p4_treeLogLike(self.cTree, 0)
if verbose:
print("Tree.calcLogLike(). %f" % self.logLike)
[docs]
def optLogLike(self, verbose=1, method="BOBYQA", optBrLens=True):
"""Calculate the likelihood of the tree, with optimization.
There are different optimization methods-- choose one. I've
made 'BOBYQA' the default, as it is very fast and seems to be
working. It is from the nlopt library.
Other opt methods include ---
newtAndBrentPowell -- fairly fast, and works well. It was the
default. Perhaps use this in combination with BOBYQA, eg
t.optLogLike(method="BOBYQA")
t.optLogLike(method="newtAndBrentPowell")
The 'allBrentPowell' optimizer was the default several years
ago, as it seems to be the most robust, although it is slow.
It might be good for checking important calculations.
'newtAndBOBYQA' --- fast and seems to work well.
As suggested above, for difficult optimizations it may help to
repeat the call to optLogLike(), perhaps with a different
method.
Arg optBrLens (default True), can be turned off. This week,
this only works with method="BOBYQA".
"""
gm = ["Tree.optLogLike()"]
if verbose:
theStartTime = time.time()
if 0:
for n in self.iterNodesNoRoot():
if n.br.len < var.BRLEN_MIN:
gm.append("All branch lengths should be greater than or equal to var.BRLEN_MIN,")
gm.append(f" which at the moment is {var.BRLEN_MIN}")
gm.append(f"Got a branch length of {n.br.len:.8f} {n.br.len:g}")
gm.append("Either make the branch length bigger, or lower var.BRLEN_MIN.")
gm.append("You could, for example, t.stripBrLens() which makes all br lens default 0.1")
raise P4Error(gm)
if not optBrLens:
if method != "BOBYQA":
gm.append("Turning arg optBrLens off only works with BOBYQA")
raise P4Error(gm)
self._commonCStuff()
if method == "newtAndBrentPowell":
pf.p4_newtSetup(self.cTree)
pf.p4_newtAndBrentPowellOpt(self.cTree)
elif method == "allBrentPowell":
pf.p4_allBrentPowellOptimize(self.cTree)
elif method == "newtAndBOBYQA":
pf.p4_newtSetup(self.cTree)
pf.p4_newtAndBOBYQAOpt(self.cTree)
elif method == "BOBYQA":
if optBrLens:
pf.p4_allBOBYQAOptimize(self.cTree, 1)
else:
pf.p4_allBOBYQAOptimize(self.cTree, 0)
else:
gm.append('method should be one of "newtAndBrentPowell", "allBrentPowell", "newtAndBOBYQA", or "BOBYQA"')
raise P4Error(gm)
# Do a final like calc. (second arg is getSiteLikes)
self.logLike = pf.p4_treeLogLike(self.cTree, 0)
# get the brLens
brLens = pf.p4_getBrLens(self.cTree)
for n in self.iterNodesNoRoot():
n.br.len = brLens[n.nodeNum]
# get the other free prams
prams = pf.p4_getFreePrams(self.cTree)
self.model.restoreFreePrams(prams)
if verbose:
print("optLogLike = %f" % self.logLike)
theEndTime = time.time()
print("cpu time %s seconds." % (theEndTime - theStartTime))
[docs]
def optTest(self):
self._commonCStuff()
theStartTime = time.time()
doXfer = 0
for i in range(1):
if doXfer:
self.model.setCStuff()
self.setCStuff()
pf.p4_setPrams(self.cTree, -1)
self.logLike = pf.p4_treeLogLike(self.cTree, 0)
if doXfer:
# get the brLens
brLens = pf.p4_getBrLens(self.cTree)
for i in range(len(self.nodes)):
n = self.nodes[i]
if n != self.root:
n.br.len = brLens[i]
# get the other free prams
prams = pf.p4_getFreePrams(self.cTree)
self.model.restoreFreePrams(prams)
print("time %s seconds." % (time.time() - theStartTime))
[docs]
def simulate(self, calculatePatterns=True, resetSequences=True, resetNexusSetsConstantMask=True, refTree=None):
"""Simulate into the attached data.
The tree self needs to have a data and model attached.
Generation of random numbers uses the GSL random number
generator. The state is held in var.gsl_rng, which is None by
default. If you do a simulation using this method, it will
use ``var.gsl_rng`` if it exists, or make it if it does not exist
yet. When it makes it, it seeds the state based on the
current time. That should give you lots of variation in the
simulations.
If on the other hand you want to make simulations that are the
same you can reseed the randomizer with the same seed whenever
you do it, like this::
if not var.gsl_rng:
var.gsl_rng = pf.gsl_rng_get()
# unusually, set the seed with each simulation
mySeed = 23 # your chosen int seed
pf.gsl_rng_set(var.gsl_rng, mySeed)
The usual way to simulate does not use reference data. An unusual way to
simulate comes from (inspired by?) PhyloBayes, where the simulation is
conditional on the original data. It uses conditional likelihoods of
that reference data at the root. To turn that on, set refTree to the
tree+model+data that you would like to use. Calculate a likelihood with
that refTree before using it, so that conditional likelihoods are set.
The tree and model for refTree should be identical to the tree and model
for self.
Args:
calculatePatterns (bool): True by default. Whether to "compress" the
newly simulated data to facilitate a faster likelihood
calculation.
resetSequences (bool): True by default. whether to bring the
simulated sequences in C back into Python
resetNexusSetsConstantMask (bool): True by default. When
simulations are made, the constant mask in any associated nexus
sets will get out of sync. Setting this to True makes a new
mask and sets it.
refTree (Tree): None by default. If supplied, a tree+model+data
which has had its likelihood calculated, where the tree+model is
identical to self.
"""
if refTree:
from p4.tree import Tree
assert isinstance(refTree, Tree)
assert refTree.model
assert refTree.data
if not refTree.cTree:
refTree.calcLogLike(verbose=False)
assert refTree.model.cModel
assert refTree.data.cData
if not var.gsl_rng:
var.gsl_rng = pf.gsl_rng_get()
pf.gsl_rng_set(var.gsl_rng, int(time.time()))
self._commonCStuff()
if refTree:
assert refTree.data.cData != self.data.cData
assert refTree.data.nParts == self.data.nParts
assert refTree.data.nTax == self.data.nTax
for i in range(self.data.nTax):
assert refTree.data.taxNames[i] == self.data.taxNames[i]
assert len(refTree.data.alignments) == len(self.data.alignments)
assert refTree.logLike, "Do a likelihood calculation with the refTree before using it here."
# could have some more checks ...
# If there is a NexusSets object attached to any of the alignments
# in the Data, the constant sites mask at least will become out of sync, but we can't just
# delete the whole nexusSets object, as they define what the parts are.
# for a in self.data.alignments:
#
# if a.nexusSets:
# a.nexusSets = None
# Probably better to do something like this
# a.nexusSets.constant.mask = self.constantMask()
# at the end.
# print "About to pf.p4_simulate(self.cTree)"
if refTree:
pf.p4_simulate(self.cTree, refTree.cTree, var.gsl_rng)
else:
pf.p4_simulate(self.cTree, 0, var.gsl_rng)
if calculatePatterns:
for p in self.data.parts:
pf.makePatterns(p.cPart)
pf.setGlobalInvarSitesVec(p.cPart)
if resetSequences:
self.data.resetSequencesFromParts()
if resetNexusSetsConstantMask:
for a in self.data.alignments:
if a.nexusSets:
a.nexusSets.constant.mask = a.constantMask()
else:
if resetNexusSetsConstantMask:
gm = ['Tree.simulate().']
gm.append(
"resetSequences is not set, but resetNexusSetsConstantMask is set,")
gm.append("which is probably not going to work as you want.")
raise P4Error(gm)
[docs]
def ancestralStateDraw(self):
"""Make a draw from the inferred root character state distribution
This method works on a tree with an attached model and data.
Conditional on the tree, branch lengths, model, and data, this method
infers the ancestral character states of the root node. However, that
inference is probabilistic, a distribution, and this method takes a
single draw. It returns a string.
"""
gm = ['Tree.ancestralStateDraw().']
self._commonCStuff()
self.logLike = pf.p4_treeLogLike(self.cTree, 0)
draw = numpy.empty(4, dtype=numpy.int32)
ancSts = []
for pNum in range(self.data.nParts):
dp = self.data.parts[pNum]
ancStsPart = []
for seqPos in range(dp.nChar):
pf.p4_drawAncState(self.cTree, pNum, seqPos, draw)
if draw[1] >= 0: # gamma cat if it is a variable site, else -1
assert draw[2] == 0 # not invar
assert draw[0] >= 0 # char num
ancStsPart.append(dp.symbols[draw[0]])
elif draw[2]: # isInvar, zero if not
assert draw[0] == -1
assert draw[1] == -1
assert draw[3] >= 0 # invar char num
ancStsPart.append(dp.symbols[draw[3]])
else:
gm.append("Problem with returned draw. Got %s" % draw)
raise P4Error(gm)
assert len(ancStsPart) == dp.nChar
ancSts.append(''.join(ancStsPart))
return ''.join(ancSts)
[docs]
def getSiteLikes(self):
"""Likelihoods, not log likes. Placed in self.siteLikes, a list."""
self._commonCStuff()
# second arg is getSiteLikes
self.logLike = pf.p4_treeLogLike(self.cTree, 1)
self.siteLikes = []
for p in self.data.parts:
self.siteLikes += pf.getSiteLikes(p.cPart)
# def getSiteRates(self):
# """Get posterior mean site rate, and gamma category.
# This says two things --
# 1. The posterior mean site rate, calculated like PAML
# 2. Which GDASRV category contributes most to the likelihood.
# The posterior mean site rate calculation requires that there be
# only one gdasrv over the tree, which will usually be the case.
# For placement in categories, if its a tie score, then it is placed
# in the first one.
# The list of site rates, and the list of categories, both with one
# value for each site, are put into separate numpy arrays, returned
# as a list, ie [siteRatesArray, categoriesArray]
# There is one of these lists for each data partition, and the results as a
# whole are returned as a list. So if you only have one data
# partition, then you get a 1-item list, and that single item is a list with 2
# numpy arrays. Ie [[siteRatesArray, categoriesArray]]
# If nGammaCat for a partition is 1, it will give that partition an
# array of ones for the site rates and zeros for the categories.
# """
# self._commonCStuff()
# # second arg is getSiteLikes
# self.logLike = pf.p4_treeLogLike(self.cTree, 0)
# #self.winningGammaCats = []
# # for p in self.data.parts:
# # self.winningGammaCats += pf.getWinningGammaCats(p.cPart)
# results = []
# for partNum in range(len(self.data.parts)):
# if len(self.model.parts[partNum].gdasrvs) > 1:
# gm = ['Tree.getSiteRates()']
# gm.append("Part %i has %i gdasrvs. Maximum 1 allowed." % (
# partNum, len(self.model.parts[partNum].gdasrvs)))
# raise P4Error(gm)
# for partNum in range(len(self.data.parts)):
# p = self.data.parts[partNum]
# if self.model.parts[partNum].nGammaCat == 1:
# siteRates = numpy.ones(p.nChar, dtype=numpy.double)
# gammaCats = numpy.zeros(p.nChar, numpy.int32)
# elif self.model.parts[partNum].nGammaCat > 1:
# siteRates = numpy.zeros(p.nChar, dtype=numpy.double)
# gammaCats = numpy.zeros(p.nChar, numpy.int32)
# work = numpy.zeros(
# self.model.parts[partNum].nGammaCat, dtype=numpy.double)
# for charNum in range(p.nChar):
# gammaCats[charNum] = -1
# #pf.getWinningGammaCats(self.cTree, p.cPart, i, gammaCats, work)
# pf.getSiteRates(
# self.cTree, p.cPart, partNum, siteRates, gammaCats, work)
# # print siteRates
# # print gammaCats
# # print work
# if 0:
# counts = numpy.zeros(
# self.model.parts[partNum].nGammaCat, numpy.int32)
# for charNum in range(p.nChar):
# counts[winningGammaCats[charNum]] += 1
# print(counts)
# else:
# raise P4Error("This should not happen.")
# results.append([siteRates, gammaCats])
# return results