Source code for p4.mrp

# Matrix representation / parsimony.

from p4.tree import Tree
from p4.alignment import Alignment
from p4.p4exceptions import P4Error
import p4.func
from p4.nexussets import CharSet
from p4.node import Node
from p4.treepartitions import TreePartitions


"See also Alignment.mrpSlice()"


def mrp(trees, taxNames=None):
    """Code a list of trees with matrix representation.

    The input should be a list of p4 Tree objects.

    The argument 'taxNames' need not be supplied, but you can if you
    want to.  If it is supplied it fixes the order of the rows of the
    matrix; it must name exactly the taxa found on the (non-root)
    leaves of the input trees, with no name repeated.  If it is not
    supplied, the taxa are taken in the order in which they are first
    met in the trees.

    Every internal node (not the root, not a leaf) of every input tree
    becomes one character in the matrix.  For that character a taxon is
    coded '1' if it is a descendant of the node, '0' if it is elsewhere
    in the same tree, and '?' if it is not in that tree.

    This returns an alignment (dataType 'standard', symbols '01'), with
    a character set for each input tree that has at least one split.
    The character set for tree number i (zero-based) is named 'cs<i>'.

    As a side effect the input trees are decorated: each non-root node
    gets ``n.br.bitKey``, and each tree gets ``nSplits`` and
    ``taxBits``.

    A P4Error is raised if 'trees' is not a list of Tree objects, if a
    supplied 'taxNames' does not match the taxa in the trees or
    contains duplicates, or if no splits at all are found.

    For example, you might say::

        read('myTrees.phy')
        a = mrp(var.trees)
        a.writeNexus('a.nex')
    """

    gm = ['mrp()']
    if not isinstance(trees, list) or not all(isinstance(t, Tree) for t in trees):
        gm.append("The 'trees' arg should be a list of p4 tree objects.")
        raise P4Error(gm)

    # Collect taxon names in first-seen order.  The companion set keeps
    # the membership test cheap.
    myTaxNames = []
    myTaxNamesSet = set()
    for t in trees:
        for n in t.iterLeavesNoRoot():
            if n.name not in myTaxNamesSet:
                myTaxNamesSet.add(n.name)
                myTaxNames.append(n.name)

    if taxNames:
        suppliedTaxNamesSet = set(taxNames)
        if suppliedTaxNamesSet != myTaxNamesSet:
            print(suppliedTaxNamesSet)
            print(myTaxNamesSet)
            symDiff = myTaxNamesSet.symmetric_difference(suppliedTaxNamesSet)
            gm.append(
                "The taxNames list supplied does not represent the taxa in the input trees.")
            gm.append("The symmetric difference is:")
            gm.append(symDiff)
            raise P4Error(gm)
        if len(suppliedTaxNamesSet) != len(taxNames):
            # A repeated name would get two different bits, and the
            # rows for that taxon would be silently mis-coded.
            gm.append("The taxNames list supplied has duplicated names.")
            raise P4Error(gm)
    else:
        taxNames = myTaxNames

    # One bit per taxon; bit position is the position in taxNames.
    txBkDict = {tx: 1 << txNum for txNum, tx in enumerate(taxNames)}

    # Decorate trees with BitKeys, and count the number of splits.
    nSplits = 0
    for t in trees:
        tNSplits = 0
        # Post-order, so the children of a node are done before the
        # node itself.
        for n in t.iterPostOrder():
            if n == t.root:
                continue
            if n.isLeaf:
                # order comes from taxNames, not from the tree
                n.br.bitKey = txBkDict[n.name]
                continue
            nSplits += 1
            tNSplits += 1
            childrenNums = t.getChildrenNums(n)
            try:
                x = t.nodes[childrenNums[0]].br.bitKey
                for i in childrenNums[1:]:
                    x |= t.nodes[i].br.bitKey
            except AttributeError:
                print("t.preAndPostOrderAreValid = %s" % t.preAndPostOrderAreValid)
                print("n is nodeNum %i" % n.nodeNum)
                print("childrenNums = %s" % childrenNums)
                # Re-raise the original exception, keeping its message
                # and traceback.
                raise
            n.br.bitKey = x
        t.nSplits = tNSplits

        # The union of the bitKeys of the root's children is the set of
        # all the taxa in this tree.
        assert not t.root.isLeaf
        childrenNums = t.getChildrenNums(t.root)
        x = t.nodes[childrenNums[0]].br.bitKey
        for i in childrenNums[1:]:
            x |= t.nodes[i].br.bitKey
        t.taxBits = x

    if nSplits == 0:
        for t in trees:
            t.write()
        gm = ["mrp(). No splits were found in the input trees."]
        gm.append("That does not work.")
        raise P4Error(gm)

    a = p4.func.newEmptyAlignment(
        dataType='standard', symbols='01', taxNames=taxNames, length=nSplits)
    a.setNexusSets()
    # Work on lists of characters while filling in; joined back into
    # strings at the end.
    for s in a.sequences:
        s.sequence = list(s.sequence)

    siteNum = 0
    for tNum, t in enumerate(trees):
        if not t.nSplits:
            continue
        csName = 'cs%i' % tNum
        cs = CharSet(a.nexusSets)
        cs.nChar = nSplits
        cs.name = csName
        cs.num = tNum
        cs.lowName = csName
        cs.format = 'vector'
        cs.start = siteNum
        for n in t.iterPostOrder():
            if n == t.root or n.isLeaf:
                continue
            assert n.br.bitKey
            for txNum, tx in enumerate(taxNames):
                bk = txBkDict[tx]
                s = a.sequences[txNum].sequence
                if bk & n.br.bitKey:
                    s[siteNum] = '1'
                elif bk & t.taxBits:
                    s[siteNum] = '0'
                else:
                    s[siteNum] = '?'
            siteNum += 1
        # The mask is '1' over the columns that came from this tree,
        # and '0' everywhere else.
        cs.mask = ('0' * cs.start
                   + '1' * (siteNum - cs.start)
                   + '0' * (nSplits - siteNum))
        cs.standardize()
        a.nexusSets.charSets.append(cs)
        a.nexusSets.charSetsDict[cs.name] = cs

    for s in a.sequences:
        s.sequence = ''.join(s.sequence)
    return a
def reverseMrp(alignment):
    """Reconstruct trees from a matrix representation.

    This needs character sets, one for each tree.  For each character
    set, the taxa in the tree are those that are not '?' at the first
    site of the set, and each site in the set gives one partition, being
    the taxa coded '1' at that site.  It is asserted that every site in
    a character set has its '?' characters on the same taxa.

    The function returns a list of trees, one for each character set,
    in order, each tree named after its character set.

    You might say::

        read('a.nex')            # read the matrix representation in
        a = var.alignments[0]    # give the alignment a name
        a.setNexusSets()         # apply var.nexusSets to the alignment
        tt = reverseMrp(a)       # the function returns a list of tree objects
        for t in tt:
            t.write()
    """

    a = alignment
    assert a.nexusSets
    assert a.nexusSets.charSets

    taxonPositions = range(len(a.taxNames))
    rebuiltTrees = []

    for cs in a.nexusSets.charSets:
        # Skip over the leading '0's of the mask to the first site of
        # this character set.
        firstVPos = 0
        while cs.mask[firstVPos] == '0':
            firstVPos += 1

        # The taxa that are not '?' at the first site are the taxa of
        # this tree.
        firstSite = a.sequenceSlice(firstVPos)
        tNums = [tPos for tPos in taxonPositions if firstSite[tPos] != '?']
        thisTNames = [a.taxNames[tPos] for tPos in tNums]
        inThisTree = set(tNums)

        # All the sites of this character set, from the first one on.
        csSites = [vPos for vPos in range(firstVPos, len(cs.mask))
                   if cs.mask[vPos] == '1']

        # First pass: the '?' pattern must be the same at every site.
        for vPos in csSites:
            st = a.sequenceSlice(vPos)
            for tPos in taxonPositions:
                if st[tPos] == '?':
                    assert tPos not in inThisTree, "bad site %i, taxon %s" % (
                        vPos, a.taxNames[tPos])
                else:
                    assert tPos in inThisTree, "bad site %i, taxon %s" % (
                        vPos, a.taxNames[tPos])

        # Second pass: one partition per site.  Taxa are numbered by
        # their position among the taxa of this tree, not by their
        # position in the alignment.
        partitions = []
        for vPos in csSites:
            st = a.sequenceSlice(vPos)
            coded = [st[tPos] for tPos in taxonPositions if st[tPos] != '?']
            partitions.append(
                [txPos for txPos, state in enumerate(coded) if state == '1'])

        tp = TreePartitions()
        t = tp.makeTreeFromPartitions(partitions, taxNames=thisTNames)
        t.name = cs.name
        rebuiltTrees.append(t)

    return rebuiltTrees