# Source code for p4.mrp

# Matrix representation / parsimony.

from p4.tree import Tree
from p4.alignment import Alignment
from p4.p4exceptions import P4Error
import p4.func
from p4.nexussets import CharSet
from p4.node import Node
from p4.treepartitions import TreePartitions


"See also Alignment.mrpSlice()"



# [docs]
def _orBitKeys(tree, nodeNums):
    """Return the bitwise OR of ``br.bitKey`` over the nodes in nodeNums.

    The arg nodeNums is a non-empty sequence of indices into
    ``tree.nodes``.  An AttributeError propagates if one of those nodes
    has no ``br.bitKey``.
    """
    bits = tree.nodes[nodeNums[0]].br.bitKey
    for nodeNum in nodeNums[1:]:
        bits = bits | tree.nodes[nodeNum].br.bitKey
    return bits


def mrp(trees, taxNames=None):
    """Code a list of trees with matrix representation.

    The input should be a list of p4 Tree objects.

    The argument 'taxNames' need not be supplied, but you can if you
    want to.  If it is supplied it must contain exactly the leaf names
    found in the input trees (it is assumed to have no duplicates), and
    it then sets the order of the rows in the result.  If it is not
    supplied, the names are taken from the trees in order of first
    appearance.

    This returns an alignment of dataType 'standard' with symbols '01',
    with one site for each internal (non-root, non-leaf) node of each
    tree.  At each site a taxon gets

    - '1' if it is in the clade below that node,
    - '0' if it is elsewhere in that same tree, and
    - '?' if it is not in that tree at all.

    A character set named 'cs<N>' is made for each input tree that has
    at least one split, where N is the position of the tree in
    'trees'.  Trees with no splits get no character set.

    As a side effect, the input trees are decorated: ``br.bitKey`` is
    set on every non-root node, and ``nSplits`` and ``taxBits`` are set
    on each tree.

    A P4Error is raised if 'trees' is not a list of Tree objects, if a
    supplied 'taxNames' does not match the taxa in the trees, or if no
    splits at all are found in the trees.

    For example, you might say::

      read('myTrees.phy')
      a = mrp(var.trees)
      a.writeNexus('a.nex')
    """

    gm = ['mrp()']
    if not isinstance(trees, list) or not all(isinstance(t, Tree) for t in trees):
        gm.append("The 'trees' arg should be a list of p4 tree objects.")
        raise P4Error(gm)

    # Collect leaf names in order of first appearance.  The set is kept
    # alongside the list so that the membership test is not a list scan.
    myTaxNames = []
    myTaxNamesSet = set()
    for t in trees:
        for n in t.iterLeavesNoRoot():
            if n.name not in myTaxNamesSet:
                myTaxNamesSet.add(n.name)
                myTaxNames.append(n.name)

    if taxNames:
        suppliedTaxNamesSet = set(taxNames)
        if suppliedTaxNamesSet != myTaxNamesSet:
            print(suppliedTaxNamesSet)
            print(myTaxNamesSet)
            symDiff = myTaxNamesSet.symmetric_difference(suppliedTaxNamesSet)
            gm.append(
                "The taxNames list supplied does not represent the taxa in the input trees.")
            gm.append("The symmetric difference is:")
            gm.append(symDiff)
            raise P4Error(gm)
    else:
        taxNames = myTaxNames

    # One bit for each taxon.  The bit order comes from taxNames, not
    # from the trees.
    txBkDict = {tx: 1 << taxNum for taxNum, tx in enumerate(taxNames)}

    # Decorate trees with BitKeys, and count the number of splits.
    nSplits = 0
    for t in trees:
        tNSplits = 0
        for n in t.iterPostOrder():
            if n == t.root:
                continue
            if n.isLeaf:
                n.br.bitKey = txBkDict[n.name]
                continue
            tNSplits += 1
            childrenNums = t.getChildrenNums(n)
            try:
                # Post-order means the children should already have
                # their bitKeys.
                x = _orBitKeys(t, childrenNums)
            except AttributeError:
                print("t.preAndPostOrderAreValid = %s" % t.preAndPostOrderAreValid)
                # t.draw()
                print("n is nodeNum %i" % n.nodeNum)
                print("childrenNums = %s" % childrenNums)
                # Re-raise the original exception, so that its message
                # and traceback are not lost.
                raise
            n.br.bitKey = x
        nSplits += tNSplits

        t.nSplits = tNSplits
        assert not t.root.isLeaf
        # The bits of all the taxa that are in this tree.
        t.taxBits = _orBitKeys(t, t.getChildrenNums(t.root))

    if nSplits == 0:
        for t in trees:
            t.write()
        gm = ["mrp().  No splits were found in the input trees."]
        gm.append("That does not work.")
        raise P4Error(gm)

    a = p4.func.newEmptyAlignment(
        dataType='standard', symbols='01', taxNames=taxNames, length=nSplits)
    a.setNexusSets()
    # Work on lists of characters while filling; joined back to strings
    # at the end.
    for s in a.sequences:
        s.sequence = list(s.sequence)

    siteNum = 0
    for tNum, t in enumerate(trees):
        if not t.nSplits:
            continue
        csName = 'cs%i' % tNum
        cs = CharSet(a.nexusSets)
        cs.nChar = nSplits
        cs.name = csName
        cs.num = tNum
        cs.lowName = csName
        cs.format = 'vector'
        cs.start = siteNum
        taxBits = t.taxBits
        for n in t.iterPostOrder():
            if n != t.root and not n.isLeaf:
                splitBits = n.br.bitKey
                assert splitBits
                for taxNum, tx in enumerate(taxNames):
                    bk = txBkDict[tx]
                    if bk & splitBits:
                        state = '1'
                    elif bk & taxBits:
                        state = '0'
                    else:
                        state = '?'
                    a.sequences[taxNum].sequence[siteNum] = state
                siteNum += 1
        # The sites of this tree are the contiguous run from cs.start up
        # to, but not including, siteNum.
        cs.mask = ('0' * cs.start
                   + '1' * (siteNum - cs.start)
                   + '0' * (nSplits - siteNum))
        cs.standardize()
        a.nexusSets.charSets.append(cs)
        a.nexusSets.charSetsDict[cs.name] = cs

    for s in a.sequences:
        s.sequence = ''.join(s.sequence)
    return a




# [docs]
def reverseMrp(alignment):
    """Reconstruct trees from a matrix representation.

    This needs character sets, one for each tree.  The alignment must
    have nexusSets with at least one charSet, and each charSet mask
    must include at least one site.

    For each character set, the taxa of the tree are those that are
    not '?' at the first included site.  Every other included site must
    have '?' for exactly the same taxa (this is checked with asserts).
    Each included site then gives one partition, composed of the taxa
    that have a '1'; these partitions are handed to
    TreePartitions.makeTreeFromPartitions().  Each tree is named after
    its character set.

    The function returns a list of tree objects, one for each character
    set, in charSet order.  A P4Error is raised for a character set
    that includes no sites.

    You might say::

      read('a.nex')          # read the matrix representation in
      a = var.alignments[0]  # give the alignment a name
      a.setNexusSets()       # apply var.nexusSets to the alignment
      tt = reverseMrp(a)     # the function returns a list of tree objects
      for t in tt:
          t.write()

    """

    gm = ['reverseMrp()']
    a = alignment
    assert a.nexusSets
    assert a.nexusSets.charSets
    tRange = range(len(a.taxNames))
    tt = []
    for cs in a.nexusSets.charSets:
        mask = cs.mask

        # Find the first site that is not masked out.
        firstVPos = None
        for vPos, maskChar in enumerate(mask):
            if maskChar != '0':
                firstVPos = vPos
                break
        if firstVPos is None:
            gm.append("Character set '%s' does not include any sites." % cs.name)
            raise P4Error(gm)

        # Taxa that are not '?' at the first site are the taxa of this
        # tree.
        firstSite = a.sequenceSlice(firstVPos)
        tNums = set()
        thisTNames = []
        for tPos in tRange:
            if firstSite[tPos] != '?':
                thisTNames.append(a.taxNames[tPos])
                tNums.add(tPos)

        # A single pass over the included sites both checks that each
        # site has the same taxa as the first site, and collects the
        # partitions.
        partitions = []
        for vPos in range(firstVPos, len(mask)):
            if mask[vPos] != '1':
                continue
            st = a.sequenceSlice(vPos)
            txPos = 0  # index within the taxa of this tree
            aPart = []
            for tPos in tRange:
                if st[tPos] == '?':
                    assert tPos not in tNums, "bad site %i, taxon %s" % (
                        vPos, a.taxNames[tPos])
                    continue
                assert tPos in tNums, "bad site %i, taxon %s" % (
                    vPos, a.taxNames[tPos])
                if st[tPos] == '1':
                    aPart.append(txPos)
                txPos += 1
            partitions.append(aPart)

        tp = TreePartitions()
        t = tp.makeTreeFromPartitions(partitions, taxNames=thisTNames)
        t.name = cs.name
        tt.append(t)
    return tt