# Source code for p4.nexussets

import os
import sys
import string
import copy
from p4.var import var
from p4.nexustoken import nexusSkipPastNextSemiColon, safeNextTok
import p4.func
from p4.p4exceptions import P4Error


# [Examples from the paup manual,
# but note the bad charpartition subset names '1' and '2'.  P4 would not allow those names.]
# charset coding = 2-457 660-896;
# charset noncoding = 1 458-659 897-898;
# charpartition gfunc = 1:coding, 2:noncoding;

# Notes from MadSwofMad97.
# TaxSet taxset-name [({Standard | Vector})] = taxon-set;  # standard is default
# TaxPartition partition-name [([{[No]Tokens}]       # tokens is default
#                             [{standard|vector}])]  # standard is default
#                              = subset-name:taxon-set [, subset-name:taxon-set...];
# eg TaxSet outgroup=1-4;
#    TaxSet beetles=Omma-.;
#
# taxpartition populations=1:1-3, 2:4-6, 3:7 8;  # note bad taxpartition names 1, 2, 3
# taxpartition populations (vector notokens) = 11122233;
#

class CaseInsensitiveDict(dict):

    """A dictionary that is case insensitive, for Nexus.

    Keys are lower-cased when an item is stored and when it is looked
    up, so ``d['Pos1']``, ``d['pos1']`` and ``d.get('POS1')`` all refer
    to the same item.

    Looking up a missing key with ``d[key]`` does not raise a KeyError;
    it returns ``self.default`` instead.  Similarly ``d.get(key)``
    returns ``self.default`` unless an explicit fallback is given.
    """

    def __init__(self, default=None):
        dict.__init__(self)
        # Handed back by __getitem__() and get() when the key is absent.
        self.default = default

    @staticmethod
    def _lowKey(key):
        """Return the lower-cased key; raise P4Error if it is not a string."""
        if not isinstance(key, str):
            gm = ["CaseInsensitiveDict()"]
            # Wrap the key in a 1-tuple so that a tuple key is shown in
            # the message, rather than being taken as the format args.
            gm.append("The key must be a string.  Got '%s'" % (key,))
            raise P4Error(gm)
        return key.lower()

    def __setitem__(self, key, val):
        """Store val under the lower-cased key.  Non-string keys raise P4Error."""
        dict.__setitem__(self, self._lowKey(key), val)

    def __getitem__(self, key):
        """Return the item for key, ignoring case, or self.default if absent.

        Non-string keys raise P4Error.
        """
        try:
            return dict.__getitem__(self, self._lowKey(key))
        except KeyError:
            return self.default

    def get(self, key, *args):
        """Return the item for key, ignoring case.

        If the key is absent then the optional second arg is returned,
        or self.default if no second arg was given.  Unlike item
        access, a non-string key is not an error here; it is simply
        looked up as is.
        """
        if not args:
            args = (self.default,)
        # Items are stored under lower-cased keys, so the lookup key
        # needs to be lower-cased as well.
        if isinstance(key, str):
            key = key.lower()
        return dict.get(self, key, *args)


#########################################################################
# CLASS    NexusSets
#########################################################################


class NexusSets(object):

    """A container for Nexus CharSet, CharPartition, and TaxSet objects.

    When the first Nexus sets block is read, a NexusSets object is
    made and saved as ``var.nexusSets``.  ``CharSet``, ``TaxSet``, and
    ``CharPartition`` objects are placed in it, as they are
    read/created.  TaxPartition commands are not implemented.  Here is
    a simple nexus sets block that only has charsets::

        #nexus

        begin sets;
          charset pos1 = 1-.\\3;
          charset pos2 = 2-.\\3;
          charset pos3 = 3-.\\3;
        end;

    To get the third positions only, you could say::

        read('myAlignment.phy')
        a = var.alignments[0]
        read('mySets.nex')   # the sets block above
        b = a.subsetUsingCharSet('pos3')

    What happens above when the mySets.nex file is read is that a
    NexusSets object is created as ``var.nexusSets`` and populated
    with the three charsets as CharSet objects.  Then when you asked
    for a subset, a copy of that NexusSets object was made and applied
    to the alignment.

    Notice that the length of the alignment is not part of the
    information in the sets block, and so things remain undefined in
    ``var.nexusSets`` until the nexus sets are applied to a particular
    alignment.  One consequence of this somewhat awkward system is
    that the same charsets could then be applied to another alignment
    of a different size::

        read('myAlignment.phy')
        aA = var.alignments[0]
        read('anotherAlignment.nex')
        aB = var.alignments[1]
        read('mySets.nex')   # the sets block above
        bA = aA.subsetUsingCharSet('pos3')
        bB = aB.subsetUsingCharSet('pos3')

    In the above example, ``bA.nexusSets`` and ``bB.nexusSets`` are
    both derived from ``var.nexusSets`` but are independent of it, and
    different from each other.

    So when an Alignment (or Tree object) wants to use
    ``var.nexusSets``, it makes a copy of it, and attaches the copy as
    theAlignment.nexusSets or theTree.nexusSets.

    Here is another example, including a ``charPartition`` definition::

        begin sets;
          charset gene1 = 1-213;
          charset gene2 = 214-497;
          charPartition cpName = gene1:gene1, gene2:gene2;
        end;

    For an alignment, you can then set a **character partition** by ::

        a.setCharPartition(cpName)

    Do this *before* you make a Data object, to partition the alignment.

    You can also use charsets to extract subsets, eg via::

        b = a.subsetUsingCharSet(csName)

    Setting a charPartition or asking for a subset will trigger
    applying ``var.nexusSets`` to the alignment, but you can also do
    it explicitly, by::

        myTree.setNexusSets()

    NexusSets knows about predefined 'constant', 'gapped', and
    'remainder' charsets.  It does not know about 'missambig' or
    'uninf' charsets.

    NexusSets can either be in the default standard format or in
    vector format -- you can change them to vector format with the ::

        mySet.vectorize()

    method, and you can change them to standard format with the ::

        mySet.standardize()

    method.  For taxSets, you can use actual tax names (rather than
    numbers or ranges) by setting::

        myTaxSet.useTaxNames = True  # default None

    Each taxSet has a::

        taxSet.taxNames

    list, which might be handy.

    You can see the current state of a NexusSets object using ::

        myNexusSets.dump()

    It can also be written out as a nexus sets block.  If an Alignment
    object has a ``nexusSets`` attribute then if you ask the alignment
    to write itself to a nexus file then the Alignment.nexusSets is
    also written.  If you would rather it not be written, delete it
    first.  If you would rather it be written to a separate file, do
    that first and then delete it.

    One nice thing about taxsets is that :meth:`Tree.Tree.tv` and
    :meth:`Tree.Tree.btv` know about them and can display them.

    """

    def __init__(self):
        # For each kind of set we keep a list (in the order they were
        # made), a case-insensitive name -> object dict, and a list of
        # the lower-cased names.
        self.charSets = []
        self.charSetsDict = CaseInsensitiveDict()
        self.charSetLowNames = []

        self.taxSets = []
        self.taxSetsDict = CaseInsensitiveDict()
        self.taxSetLowNames = []

        self.charPartitions = []
        self.charPartitionsDict = CaseInsensitiveDict()
        self.charPartitionLowNames = []

        self.charPartition = None
        #self.alignment = None
        self.aligNChar = None
        self.taxNames = []
        self.nTax = None

        self.predefinedCharSetLowNames = ['constant', 'gapped']

        # The nexus format defines several "predefined" charSets.
        # For all datatypes:
        #   constant
        #   gapped
        #   missambig
        #   remainder
        #   uninf
        # I only have implemented 2-- constant and gapped.  The
        # 'remainder' charSet is handled by p4, but not as a CharSet
        # object, since its content depends on the context.

        # The predefined charsets go in charSetsDict, but not in the
        # charSets list or in charSetLowNames.
        cS = CharSet(self)
        cS.num = -1
        cS.name = 'constant'
        cS.lowName = 'constant'
        cS.format = 'vector'
        self.constant = cS
        self.charSetsDict['constant'] = self.constant

        cS = CharSet(self)
        cS.num = -1
        cS.name = 'gapped'
        cS.lowName = 'gapped'
        cS.format = 'vector'
        self.gapped = cS
        self.charSetsDict['gapped'] = self.gapped

    def _continueReadingFromNexusFile(self, flob):
        """Read the commands of a sets block from the open file flob.

        Commands charset, charpartition, and taxset are handed to their
        own readers.  A taxpartition command is reported as not
        implemented, and skipped.  Any other command raises P4Error.
        Reading stops at 'end' or 'endblock'.
        """
        gm = ['NexusSets._continueReadingFromNexusFile()']
        if hasattr(flob, 'name') and flob.name:
            gm.append("file name %s" % flob.name)
        # Presumably this gets us past the rest of the 'begin sets;'
        # statement -- the caller is not visible here.
        nexusSkipPastNextSemiColon(flob)
        commandName = safeNextTok(flob, gm[0])
        lowCommandName = commandName.lower()
        while lowCommandName not in [None, 'end', 'endblock']:
            if lowCommandName == 'charset':
                self._readCharSetCommand(flob)
            elif lowCommandName == 'charpartition':
                self._readCharPartitionCommand(flob)
            elif lowCommandName == 'taxset':
                self._readTaxSetCommand(flob)
            elif lowCommandName == 'taxpartition':
                print()
                print(gm[0])
                if len(gm) > 1:
                    print(gm[1])
                print(" Sorry-- taxpartition is not implemented.")
                nexusSkipPastNextSemiColon(flob)
            else:
                gm.append("Got unrecognized sets block command '%s'" %
                          commandName)
                raise P4Error(gm)
            commandName = safeNextTok(
                flob, 'NexusSets.continueReadingFromNexusFile()')
            lowCommandName = commandName.lower()

    def _readCharSetCommand(self, flob):
        """Read a charset command from flob, and add the new CharSet to self.

        Raises P4Error if the name is bad, duplicated, or predefined.
        """
        # We have just read 'charset'.  The next thing we expect is the
        # charset name.
        gm = ['NexusSets._readCharSetCommand()']
        if hasattr(flob, 'name') and flob.name:
            gm.append("file name %s" % flob.name)
        name = p4.func.nexusUnquoteName(
            safeNextTok(flob, 'NexusSets: _readCharSetCommand'))
        lowName = name.lower()
        if not p4.func.nexusCheckName(lowName):
            gm.append("Bad charSet name '%s'" % name)
            raise P4Error(gm)
        # Check for duped names
        if lowName in self.charSetLowNames:
            gm.append("Duplicated charSet name '%s'" % name)
            raise P4Error(gm)
        elif lowName in self.predefinedCharSetLowNames:
            gm.append(
                "You cannot use the name '%s' -- it is predefined." % name)
            raise P4Error(gm)
        cs = CharSet(self)
        cs.name = name
        cs.lowName = lowName
        cs.readTaxOrCharSetDefinition(flob)
        cs.num = len(self.charSets)
        self.charSets.append(cs)
        self.charSetsDict[name] = cs
        self.charSetLowNames.append(cs.lowName)

    def _readTaxSetCommand(self, flob):
        """Read a taxset command from flob, and add the new TaxSet to self.

        Raises P4Error if the name is bad or duplicated.
        """
        # We have just read 'taxset'.  The next thing we expect is the
        # taxset name.
        gm = ['NexusSets._readTaxSetCommand()']
        if hasattr(flob, 'name') and flob.name:
            gm.append("file name %s" % flob.name)
        name = p4.func.nexusUnquoteName(
            safeNextTok(flob, 'NexusSets: readTaxSetCommand'))
        lowName = name.lower()
        if not p4.func.nexusCheckName(lowName):
            gm.append("Bad taxSet name '%s'" % name)
            raise P4Error(gm)
        # Check for duped names
        if lowName in self.taxSetLowNames:
            gm.append("Duplicated taxSet name '%s'" % name)
            raise P4Error(gm)
        ts = TaxSet(self)
        ts.name = name
        ts.lowName = lowName
        ts.readTaxOrCharSetDefinition(flob)
        ts.num = len(self.taxSets)
        self.taxSets.append(ts)
        self.taxSetsDict[name] = ts
        self.taxSetLowNames.append(ts.lowName)

    def _readCharPartitionCommand(self, flob):
        """Read a charpartition command from flob, and add it to self.

        Raises P4Error if the name is bad or duplicated.
        """
        gm = ['NexusSets._readCharPartitionCommand()']
        if hasattr(flob, 'name') and flob.name:
            gm.append("file name %s" % flob.name)
        name = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
        lowName = name.lower()
        if not p4.func.nexusCheckName(lowName):
            gm.append("Bad charPartition name '%s'" % name)
            # A bad name is an error here, as it is for charsets and
            # taxsets.
            raise P4Error(gm)
        if lowName in self.charPartitionLowNames:
            gm.append("Duplicated charPartition name '%s'" % name)
            raise P4Error(gm)
        cp = CharPartition(self)
        cp.name = name
        cp.lowName = lowName
        cp._readCharPartitionDefinition(flob)
        self.charPartitions.append(cp)
        self.charPartitionsDict[name] = cp
        self.charPartitionLowNames.append(cp.lowName)

    def dump(self):
        """Print the current state of self, including all the sets in it."""
        print(" NexusSets dump")
        if self.constant:
            print(" Predefined char set 'constant'")
            self.constant.dump()
        if self.gapped:
            print(" Predefined char set 'gapped'")
            self.gapped.dump()
        print(" There are %i non-predefined char sets" % len(self.charSets))
        for cs in self.charSets:
            cs.dump()
        print(" There are %i tax sets" % len(self.taxSets))
        for ts in self.taxSets:
            ts.dump()
        print(" There are %i char partitions" % len(self.charPartitions))
        for cp in self.charPartitions:
            cp.dump()
        if self.charPartition:
            print(" self.charPartition.name is %s" %
                  p4.func.nexusFixNameIfQuotesAreNeeded(self.charPartition.name))
        else:
            print(" There is no self.charPartition")

    def write(self):
        """Write self in Nexus format to stdout."""
        self.writeNexusToOpenFile(sys.stdout)

    def writeNexus(self, fName=None):
        """Write self in Nexus format to stdout or a file.

        If fName is given then that file is (over)written, otherwise
        the output goes to stdout.  Either way the output starts with a
        '#nexus' line.
        """
        if fName:
            # Using 'with' so that the file gets closed even if the
            # writing raises an exception.
            with open(fName, 'w') as f:
                f.write('#nexus\n\n')
                self.writeNexusToOpenFile(f)
        else:
            sys.stdout.write('#nexus\n\n')
            self.writeNexusToOpenFile(sys.stdout)

    def writeNexusToOpenFile(self, flob):
        """This only writes non-trivial stuff.

        Ie if self has only constant and gapped charsets, then it does
        not write anything."""

        if self.charSets or self.charPartitions or self.taxSets:
            flob.write('begin sets;\n')
            for cs in self.charSets:
                cs.writeNexusToOpenFile(flob)
            for cp in self.charPartitions:
                cp.writeNexusToOpenFile(flob)
            for ts in self.taxSets:
                ts.writeNexusToOpenFile(flob)
            flob.write('end;\n\n')

    def newCharSet(self, name, mask=None):
        """Make a new CharSet with the given name, and add it to self.

        If a mask is given then the new charset is in vector format,
        with that mask.
        """
        cs = CharSet(self)
        cs.name = name
        cs.lowName = name.lower()
        cs.num = len(self.charSets)
        if mask:
            cs.format = 'vector'
            cs.mask = mask
        self.charSets.append(cs)
        self.charSetsDict[cs.name] = cs
        # Register the low name, as is done for charsets that are read
        # from a file, so that checks for duplicated names see it.
        self.charSetLowNames.append(cs.lowName)

    def dupeCharSet(self, existingCharSetName, newName):
        """Add a copy of an existing charset to self, under a new name.

        Raises P4Error if there is no charset with the existing name.
        """
        # Item access is case-insensitive, and gives the dict's default
        # (None) when the name is not found.
        theCS = self.charSetsDict[existingCharSetName]
        if not theCS:
            raise P4Error(
                "NexusSets.dupeCharSet() -- can't find char set '%s'" % existingCharSetName)

        cs = CharSet(self)
        cs.name = newName
        cs.lowName = newName.lower()
        cs.num = len(self.charSets)
        self.charSets.append(cs)
        self.charSetsDict[cs.name] = cs
        self.charSetLowNames.append(cs.lowName)

        cs.format = theCS.format
        cs.triplets = copy.deepcopy(theCS.triplets)  # its a list of lists
        cs.tokens = theCS.tokens[:]
        cs.mask = theCS.mask
        cs.aligNChar = theCS.aligNChar
class TaxOrCharSet(object):
    """Shared implementation for Nexus tax sets and char sets.

    A set is held in one of two formats: 'standard', where the definition
    is a list of tokens that is condensed into [first, second, step]
    triplets, or 'vector', where the definition is a mask string of '0'
    and '1' characters.  Subclasses set className to 'CharSet' or
    'TaxSet', and much of the logic here branches on that value.
    """

    def __init__(self, theNexusSets):
        self.nexusSets = theNexusSets  # the container that owns this set
        self.num = -1
        self.name = None
        self.lowName = None
        self._format = 'standard'  # or 'vector'; exposed via the format property
        self.triplets = []
        self.tokens = []
        self.mask = None
        self.className = 'TaxOrCharSet'
        self.lowTaxNames = []
        self.taxNames = []
        self.useTaxNames = None  # undecided

    def _getFormat(self):
        return self._format

    def _setFormat(self, newFormat):
        assert newFormat in ['standard', 'vector']
        self._format = newFormat

    format = property(_getFormat, _setFormat)

    def dump(self):
        """Print the attributes of this set to stdout."""
        print(" %s %i" % (self.className, self.num))
        print(" name: %s" % self.name)
        if hasattr(self, 'aligNChar'):
            print(" aligNChar: %s" % self.aligNChar)
        print(" format: %s" % self.format)
        print(" useTaxNames: %s" % self.useTaxNames)
        print(" triplets: ")
        for t in self.triplets:
            print(" %s" % t)
        if hasattr(self, 'numberTriplets'):
            print(" numberTriplets: ")
            for t in self.numberTriplets:
                print(" %s" % t)
        print(" tokens: %s" % self.tokens)
        print(" mask: %s" % self.mask)
        if self.mask:
            print(" mask 1s-count: %s" % self.mask.count('1'))

    def _rejectRangeAfter(self, gm, tokNum, subject):
        """Raise P4Error if self.tokens[tokNum] is a '-' or a backslash.

        subject is the start of the error sentence; it describes the item
        that may not be used as the start of a range.
        """
        if tokNum < len(self.tokens):
            follower = self.tokens[tokNum]
            if follower == '-' or follower == '\\':
                gm.append("%s '%s' definition" % (self.className, self.name))
                gm.append("%s may not be followed by a '%s'" % (subject, follower))
                raise P4Error(gm)

    def readTaxOrCharSetDefinition(self, flob):
        """Read the rest of a set definition from the token stream flob.

        Reading starts just after the set name.  An optional
        '(standard)' or '(vector)' modifier is accepted, then an '=',
        then the definition itself up to ';', 'end' or 'endblock'.

        For vector format the tokens are joined into self.mask, which
        must consist only of '0' and '1'.  For standard format the
        tokens are kept in self.tokens (with integer strings converted to
        ints) and condensed into self.triplets.

        Raises:
            P4Error: on any syntax problem, or when the definition is
                empty and var.allowEmptyCharSetsAndTaxSets is off.
        """
        gm = ['%s.readTaxSetDefinition()' % self.className]
        if hasattr(flob, 'name') and flob.name:
            gm.append("file name %s" % flob.name)

        tok = safeNextTok(flob, gm[0])
        lowTok = tok.lower()
        if lowTok == '=':
            pass
        elif lowTok == '(':
            tok = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
            lowTok = tok.lower()
            if lowTok == 'standard':
                pass
            elif lowTok == 'vector':
                self.format = 'vector'
            else:
                gm.append("Unexpected '%s'" % tok)
                gm.append("(I was expecting either 'standard' or")
                gm.append("'vector' following the parenthesis.)")
                raise P4Error(gm)
            tok = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
            if tok != ')':
                gm.append("Unexpected '%s'" % tok)
                gm.append(
                    "(I was expecting an unparentheis after '%s')" % self.format)
                raise P4Error(gm)
            tok = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
            if tok != '=':
                gm.append("Unexpected '%s'" % tok)
                gm.append("I was expecting an '=' after '(%s)'" % self.format)
                raise P4Error(gm)
        else:
            gm.append("Unexpected '%s'" % tok)
            raise P4Error(gm)

        # Now we are on the other side of the '='
        tok = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
        lowTok = tok.lower()
        while lowTok not in [None, ';', 'end', 'endblock']:
            self.tokens.append(tok)
            tok = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
            lowTok = tok.lower()

        if self.format == 'vector':
            self.mask = ''.join(self.tokens)
            self.tokens = []
            for ch in self.mask:
                if ch not in ('0', '1'):
                    gm.append("%s '%s', vector format" % (self.className, self.name))
                    gm.append("The vector must be all zeros or ones.")
                    raise P4Error(gm)

        # Do a once-over sanity check, and convert integer strings to ints.
        for tokNum, tok in enumerate(self.tokens):
            lowTok = tok.lower()
            if lowTok in ['.', 'all', '-', '\\']:
                pass
            elif self.className == 'CharSet' and lowTok in self.nexusSets.charSetLowNames:
                pass
            elif self.className == 'CharSet' and lowTok in self.nexusSets.predefinedCharSetLowNames:
                pass
            elif self.className == 'TaxSet' and lowTok in self.nexusSets.taxSetLowNames:
                pass
            else:
                try:
                    self.tokens[tokNum] = int(tok)
                except ValueError:
                    # A non-number in a tax set is left as it is (it may
                    # be a taxon name); in a char set it is an error.
                    if self.className == 'CharSet':
                        gm.append("I don't understand the token '%s'" % tok)
                        raise P4Error(gm)

        # Make a list of triplets representing eg 23-87\3
        # first item = 23, second item = 87, third = 3
        # Not all will exist for each part of the definition.
        nToks = len(self.tokens)
        tokNum = 0
        self.triplets = []
        while tokNum < nToks:
            tok = self.tokens[tokNum]
            if isinstance(tok, str):
                lowTok = tok.lower()
            else:
                lowTok = None

            # className is 'CharSet' (capital C); comparing against
            # 'charSet' meant this branch was never taken for char sets.
            if self.className == 'TaxSet' and lowTok in self.nexusSets.taxSetLowNames or \
                    self.className == 'CharSet' and lowTok in self.nexusSets.charSetLowNames:
                self.triplets.append([tok, None, None])
                tokNum += 1
                self._rejectRangeAfter(gm, tokNum, "An existing tax or char set")
            elif tok == 'all':
                self.triplets.append([tok, None, None])
                tokNum += 1
                self._rejectRangeAfter(gm, tokNum, "Tax or char set 'all'")
            elif tok == '-':
                gm.append("%s '%s' definition" % (self.className, self.name))
                gm.append("Out of place '-'")
                raise P4Error(gm)
            elif tok == '\\':
                gm.append("%s '%s' definition" % (self.className, self.name))
                gm.append("Out of place '\\'")
                raise P4Error(gm)
            elif tok == '.':
                self.triplets.append([tok, None, None])
                tokNum += 1
                self._rejectRangeAfter(gm, tokNum, "Tax or char set '.'")
            else:
                # A number or some other name, possibly the start of a
                # range, which may in turn carry a step value.
                aTriplet = [tok, None, None]
                tokNum += 1
                if tokNum < nToks and self.tokens[tokNum] == '-':
                    tokNum += 1
                    if tokNum < nToks:
                        second = self.tokens[tokNum]  # maybe '.'
                        if isinstance(second, int) and isinstance(tok, int) \
                                and second <= tok:
                            gm.append("%s '%s' definition" % (self.className, self.name))
                            gm.append("If a range is defined by two numbers,")
                            gm.append("the second number of a range must be bigger than")
                            gm.append("the first.")
                            raise P4Error(gm)
                        aTriplet[1] = second
                        tokNum += 1
                        if tokNum < nToks and self.tokens[tokNum] == '\\':
                            tokNum += 1
                            if tokNum < nToks:
                                step = self.tokens[tokNum]
                                if not isinstance(step, int):
                                    gm.append("%s '%s' definition" % (self.className, self.name))
                                    gm.append("Step value of a range must be a number")
                                    gm.append("(Got '%s')" % step)
                                    raise P4Error(gm)
                                aTriplet[2] = step
                                tokNum += 1
                self.triplets.append(aTriplet)

        if not self.triplets and not self.mask:
            if not var.allowEmptyCharSetsAndTaxSets:
                gm.append("%s '%s' definition" % (self.className, self.name))
                gm.append("Got no definition (no triplets or mask)")
                gm.append("(Allow this by turning var.allowEmptyCharSetsAndTaxSets on)")
                raise P4Error(gm)

    def setMask(self):
        """Set self.mask.

        For vector format the mask must already exist.  For standard
        format the mask is built from the triplets: self.triplets for a
        CharSet (mask length self.aligNChar) or self.numberTriplets for a
        TaxSet (mask length self.nexusSets.nTax).

        Raises:
            P4Error: when a vector set has no mask, when a standard set
                has no triplets (unless var.allowEmptyCharSetsAndTaxSets
                is on), or when a triplet item is unknown or out of range.
        """
        gm = ["%s.setMask() name='%s'" % (self.className, self.name)]
        if self.format == 'vector':
            if not self.mask:
                gm.append("vector format, but no mask?")
                raise P4Error(gm)
        elif self.format == 'standard':
            if not len(self.triplets):
                if not var.allowEmptyCharSetsAndTaxSets:
                    gm.append(
                        "standard format, but we have no triplets? - no definition?")
                    gm.append("(Allow this by turning var.allowEmptyCharSetsAndTaxSets on.)")
                    raise P4Error(gm)

            if self.className == 'CharSet':
                thisMaskLen = self.aligNChar
                existingSetNames = self.nexusSets.charSetLowNames
                existingSets = self.nexusSets.charSets
                theTriplets = self.triplets
            elif self.className == 'TaxSet':
                thisMaskLen = self.nexusSets.nTax
                existingSetNames = self.nexusSets.taxSetLowNames
                existingSets = self.nexusSets.taxSets
                theTriplets = self.numberTriplets

            mask = ['0'] * thisMaskLen
            for aTriplet in theTriplets:
                first = aTriplet[0]
                second = aTriplet[1]
                third = aTriplet[2]
                lowFirst = None
                if isinstance(first, str):
                    lowFirst = first.lower()

                if first and not second:
                    # Its a single, or an existing set, not a range.
                    if lowFirst:
                        # 'all' and '.' are handled first; previously both
                        # ended up in the "I don't know" error below.
                        if lowFirst == 'all':
                            for i in range(thisMaskLen):
                                mask[i] = '1'
                        elif lowFirst == '.':
                            mask[-1] = '1'
                        elif lowFirst in existingSetNames:
                            for aSet in existingSets:
                                if lowFirst == aSet.lowName:
                                    if not aSet.mask:
                                        aSet.setMask()
                                    for j in range(thisMaskLen):
                                        if aSet.mask[j] == '1':
                                            mask[j] = '1'
                        # Maybe its a predefined charset --- constant or gapped
                        elif self.className == 'CharSet' and \
                                lowFirst in self.nexusSets.predefinedCharSetLowNames:
                            aSet = None
                            if lowFirst == 'constant':
                                aSet = self.nexusSets.constant
                            elif lowFirst == 'gapped':
                                aSet = self.nexusSets.gapped
                            assert aSet
                            for j in range(thisMaskLen):
                                if aSet.mask[j] == '1':
                                    mask[j] = '1'
                        else:
                            gm.append("I don't know '%s'" % first)
                            raise P4Error(gm)
                    elif isinstance(first, int):
                        if first > 0 and first <= thisMaskLen:
                            mask[first - 1] = '1'
                        else:
                            # This will have been checked before.
                            gm.append("Component '%s' is out of range of mask len (%s)" % (
                                first, thisMaskLen))
                            raise P4Error(gm)
                elif first and second:
                    # Its a range; positions are one-based and inclusive.
                    start = int(first)
                    if second == '.':
                        fin = len(mask)
                    else:
                        fin = int(second)
                    if third:
                        bystep = int(third)
                        for spot in range(start - 1, fin, bystep):
                            mask[spot] = '1'
                    else:
                        for spot in range(start - 1, fin):
                            mask[spot] = '1'
            self.mask = ''.join(mask)

    def invertMask(self):
        """Change zeros to ones, and non-zeros to zero."""
        gm = ['%s.invertMask()' % self.className]
        if not self.mask:
            self.dump()
            gm.append("The charset has no mask")
            raise P4Error(gm)
        self.mask = ''.join('1' if ch == '0' else '0' for ch in self.mask)

    def write(self):
        """Write self in Nexus format to stdout."""
        self.writeNexusToOpenFile(sys.stdout)

    def writeNexus(self):
        """Write self in Nexus format to stdout."""
        self.writeNexusToOpenFile(sys.stdout)

    def writeNexusToOpenFile(self, flob):
        """Write the definition of this set as one Nexus command to flob.

        Standard-format sets are written from self.taxNames when
        self.useTaxNames is set, otherwise from self.tokens.  Vector
        format sets are written as their mask.
        """
        if self.className == 'CharSet':
            theSetName = 'charSet'
        else:
            theSetName = 'taxSet'

        if self.format == 'standard':
            flob.write(' %s %s =' % (theSetName, self.name))
            if self.useTaxNames:
                for tN in self.taxNames:
                    flob.write(" %s" % p4.func.nexusFixNameIfQuotesAreNeeded(tN))
            else:
                previousTok = None
                for theTok in self.tokens:
                    if isinstance(theTok, str) and theTok not in ['-', '\\']:
                        tok = p4.func.nexusFixNameIfQuotesAreNeeded(theTok)
                    else:
                        tok = theTok
                    # Tokens are separated by a space, except on either
                    # side of a '-' so that ranges read like 2-10.
                    if previousTok is not None and (tok == '-' or previousTok == '-'):
                        flob.write('%s' % tok)
                    else:
                        flob.write(' %s' % tok)
                    previousTok = tok
            flob.write(';\n')
        elif self.format == 'vector':
            flob.write(' %s %s (vector) = ' % (theSetName, self.name))
            flob.write('%s;\n' % self.mask)

    def vectorize(self):
        """Switch to vector format, making the mask first if needed."""
        if self.format == 'vector':
            return
        if not self.mask:
            self.setMask()
        self.format = 'vector'

    def standardize(self):
        """Switch to standard format, rebuilding triplets and tokens from the mask.

        Each run of consecutive '1's in the mask becomes one triplet:
        [pos, None, None] for a single position or [start, end, None] for
        a longer run, using one-based positions.
        """
        if self.format == 'standard':
            return
        self.triplets = []
        self.tokens = []
        thisTriplet = []
        for mPos in range(len(self.mask)):
            if self.mask[mPos] == '0':
                if thisTriplet:
                    # A zero finishes the current run.  mPos is the
                    # one-based position of the last '1' of that run.
                    if thisTriplet[0] == mPos:
                        thisTriplet.append(None)
                    else:
                        thisTriplet.append(mPos)
                    thisTriplet.append(None)
                    self.triplets.append(thisTriplet)
                    thisTriplet = []
            elif not thisTriplet:
                thisTriplet.append(mPos + 1)  # start a new run
        if thisTriplet:
            # The mask ended while a run was still open.
            if thisTriplet[0] == len(self.mask):
                thisTriplet.append(None)
            else:
                thisTriplet.append(mPos + 1)
            thisTriplet.append(None)
            self.triplets.append(thisTriplet)

        for triplet in self.triplets:
            self.tokens.append(triplet[0])
            if triplet[1] is not None:
                self.tokens.append('-')
                self.tokens.append(triplet[1])
        self.format = 'standard'


class CharSet(TaxOrCharSet):
    """A Nexus char set; its mask length is self.aligNChar."""

    def __init__(self, theNexusSets):
        TaxOrCharSet.__init__(self, theNexusSets)
        self.className = 'CharSet'
        self.aligNChar = None

    def getNChar(self):
        """Return the number of '1's in the mask, after (re)making the mask."""
        self.setMask()
        return self.mask.count('1')

    def setAligNChar(self, aligNChar):
        """Set self.aligNChar and check the definition against it.

        For standard format, single numbers must lie in 1..aligNChar and
        the items of a range must be parseable as ints (a '.' is allowed
        as the end of a range).  For vector format an existing mask must
        have length aligNChar.

        Raises:
            P4Error: if any of those checks fail, or the format is bad.
        """
        gm = ['CharSet.setAligNChar()']
        self.aligNChar = aligNChar
        if self.format == 'standard':
            for aTriplet in self.triplets:
                first = aTriplet[0]
                second = aTriplet[1]
                third = aTriplet[2]
                if first and not second:  # its a single
                    if isinstance(first, int):
                        if not (first > 0 and first <= self.aligNChar):
                            gm.append("Charset '%s' definition" % self.name)
                            gm.append(
                                "Charset definition element '%s' is out of range" % first)
                            gm.append("(aligNChar = %i)" % self.aligNChar)
                            raise P4Error(gm)
                elif first and second:  # its a range
                    # The int() calls only check that the items parse.
                    try:
                        int(first)
                    except ValueError:
                        gm.append("Charset '%s' definition" % self.name)
                        gm.append(
                            "Can't parse definition element '%s'" % first)
                        raise P4Error(gm)
                    if second != '.':
                        try:
                            int(second)
                        except ValueError:
                            gm.append("Charset '%s' definition" % self.name)
                            gm.append(
                                "Can't parse definition element '%s'" % second)
                            raise P4Error(gm)
                    if third:
                        try:
                            int(third)
                        except ValueError:
                            gm.append("Charset '%s' definition" % self.name)
                            gm.append(
                                "Can't parse definition element '%s'" % third)
                            raise P4Error(gm)
        elif self.format == 'vector':
            if self.mask:
                if len(self.mask) != self.aligNChar:
                    gm.append("len(self.mask) is %i, but aligNChar is %i" % (
                        len(self.mask), self.aligNChar))
                    raise P4Error(gm)
        else:
            gm.append("bad format %s" % self.format)
            raise P4Error(gm)


class TaxSet(TaxOrCharSet):
    """A Nexus tax set; its mask length is self.nexusSets.nTax."""

    def __init__(self, theNexusSets):
        TaxOrCharSet.__init__(self, theNexusSets)
        self.className = 'TaxSet'
        self.numberTriplets = []

    def setNumberTriplets(self):
        """Make self.numberTriplets, a numeric version of self.triplets.

        In the first two items of each triplet a '.' becomes
        self.nexusSets.nTax and a taxon name becomes its one-based index
        in self.nexusSets.taxNames (compared case-insensitively).  Names
        of existing tax sets are kept as strings.

        Raises:
            P4Error: if a string item is neither a tax set nor a taxon
                name, or if the second number of a range is not bigger
                than the first.
        """
        gm = ['TaxSet.setNumberTriplets()']
        if not self.nexusSets.lowTaxNames:
            self.nexusSets.lowTaxNames = [
                txName.lower() for txName in self.nexusSets.taxNames]
        self.numberTriplets = []
        for tr in self.triplets:
            numTr = []
            for itemNum in range(2):
                trItem = tr[itemNum]
                if trItem is None:
                    numTr.append(trItem)
                elif isinstance(trItem, int):
                    numTr.append(trItem)
                elif trItem == '.':
                    numTr.append(self.nexusSets.nTax)
                else:
                    assert isinstance(trItem, str)
                    lowTrItem = trItem.lower()
                    if lowTrItem in self.nexusSets.taxSetLowNames:
                        numTr.append(trItem)
                    else:
                        if lowTrItem not in self.nexusSets.lowTaxNames:
                            gm.append("Triplet %s" % tr)
                            gm.append(
                                "'%s' is a string, but not in the taxNames." % trItem)
                            raise P4Error(gm)
                        theIndx = self.nexusSets.lowTaxNames.index(lowTrItem)
                        numTr.append(theIndx + 1)  # one-based
            trItem = tr[2]
            if trItem is None:
                numTr.append(None)
            else:
                assert isinstance(trItem, int)
                numTr.append(trItem)
            assert len(numTr) == 3

            first = numTr[0]
            # first might be a pre-existing taxSet name
            if not isinstance(first, str):
                second = numTr[1]
                assert isinstance(first, int) and first != 0
                if isinstance(second, int):
                    assert second != 0
                    if second <= first:
                        gm.append("Triplet %s" % tr)
                        gm.append("Triplet expressed as numbers. %s" % numTr)
                        gm.append(
                            "This appears to be a range, but the second number")
                        gm.append("is not bigger than the first.")
                        raise P4Error(gm)
                    assert second <= self.nexusSets.nTax
                assert first <= self.nexusSets.nTax
            self.numberTriplets.append(numTr)


class CharPartitionSubset(object):
    """One named subset of a CharPartition."""

    def __init__(self):
        self.name = None
        self.lowName = None
        self.tokens = []
        self.mask = None
        self.triplets = []

    def dump(self):
        """Print the attributes of this subset to stdout."""
        print(" -- CharPartitionSubset")
        print(" name: %s" % p4.func.nexusFixNameIfQuotesAreNeeded(self.name))
        print(" triplets: ")
        for t in self.triplets:
            print(" %s" % t)
        print(" tokens: %s" % self.tokens)
        print(" mask: %s" % self.mask)

    def writeNexusToOpenFile(self, flob):
        """Write 'name: definition' for this subset to flob."""
        flob.write('%s:' % self.name)
        previousTok = None
        for tok in self.tokens:
            # Tokens are either ints or strings.  A space goes before the
            # first token and between two tokens of the same type; tokens
            # of differing type are written without a space between them.
            if previousTok is None or type(tok) == type(previousTok):
                flob.write(' %s' % tok)
            else:
                flob.write('%s' % tok)
            previousTok = tok


class CharPartition(object):
    """A Nexus char partition: a named list of CharPartitionSubset objects."""

    def __init__(self, theNexusSets):
        self.nexusSets = theNexusSets  # the container that owns this partition
        self.name = None
        self.lowName = None
        self.tokens = []
        self.subsets = []

    def _readCharPartitionDefinition(self, flob):
        """Read a charpartition definition from the token stream flob.

        Reading starts just after the partition name.  Optional
        parenthesised modifiers are checked (only 'tokens' and 'standard'
        are accepted), the tokens after the '=' are collected in
        self.tokens, and these are divided into CharPartitionSubset
        objects appended to self.subsets.

        Raises:
            P4Error: on any syntax problem, bad or duplicated subset
                name, or token that is not understood.
        """
        gm = ['CharPartition._readCharPartitionDefinition()']
        if hasattr(flob, 'name') and flob.name:
            gm.append("file name %s" % flob.name)

        tok = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
        lowTok = tok.lower()
        while lowTok != '=':
            if lowTok == '(':
                tok = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
                lowTok = tok.lower()
                while lowTok != ')':
                    if lowTok in ['notokens', 'vector']:
                        gm.append("Got charpartition modifier: '%s'" % tok)
                        gm.append("It is not implemented.")
                        gm.append(
                            "Only 'tokens' and 'standard' are implemented.")
                        raise P4Error(gm)
                    elif lowTok in ['tokens', 'standard']:
                        pass
                    else:
                        gm.append("Got charpartition modifier: '%s'" % tok)
                        gm.append("This is not understood.")
                        gm.append(
                            "(Only 'tokens' and 'standard' are implemented.)")
                        raise P4Error(gm)
                    tok = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
                    lowTok = tok.lower()
                # Step past the ')'.  Without this the ')' itself was
                # reported as an unexpected token.
                tok = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
                lowTok = tok.lower()
            else:
                gm.append("Got unexpected token: '%s'" % tok)
                gm.append(
                    "I was expecting either an '=' or something in parentheses.")
                raise P4Error(gm)

        # Now we are on the other side of the '='
        tok = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
        lowTok = tok.lower()
        while lowTok not in [None, ';', 'end', 'endblock']:
            self.tokens.append(tok)
            tok = p4.func.nexusUnquoteName(safeNextTok(flob, gm[0]))
            lowTok = tok.lower()

        # Divide into CharPartitionSubset instances, which look like
        # name : definition and are separated by commas.
        nToks = len(self.tokens)
        i = 0
        while i < nToks:
            aSubset = CharPartitionSubset()
            aSubset.name = self.tokens[i]
            if not p4.func.nexusCheckName(aSubset.name):
                gm.append("CharPartition '%s' definition:" % self.name)
                gm.append("Bad subset name (%s, I think)" % aSubset.name)
                raise P4Error(gm)
            aSubset.lowName = aSubset.name.lower()
            i += 1
            if i >= nToks or self.tokens[i] != ':':
                gm.append("CharPartition '%s' definition:" % self.name)
                gm.append(
                    "Subset name (%s) should be followed by a colon" % aSubset.name)
                raise P4Error(gm)
            i += 1
            if i >= nToks:
                gm.append("CharPartition '%s' definition:" % self.name)
                gm.append(
                    "Subset name (%s) and colon should be followed" % aSubset.name)
                gm.append(
                    "by a subset definition (charSet or charSet definition)")
                raise P4Error(gm)
            while i < nToks and self.tokens[i] != ',':
                aSubset.tokens.append(self.tokens[i])
                i += 1
            i += 1  # step over the comma
            self.subsets.append(aSubset)

        # Do a once-over sanity check, check for duplicated names,
        # and convert integer strings to ints.
        existingPartNames = []
        for aSubset in self.subsets:
            if aSubset.lowName in existingPartNames:
                gm.append("CharPartition '%s' definition:" % self.name)
                gm.append("Duplicated subset name (%s, I think)" % aSubset.name)
                raise P4Error(gm)
            existingPartNames.append(aSubset.lowName)
            for i, tok in enumerate(aSubset.tokens):
                lowTok = tok.lower()
                if lowTok in ['.', 'all', '-', '\\', 'remainder']:
                    pass
                elif lowTok in self.nexusSets.charSetLowNames:
                    pass
                elif lowTok in self.nexusSets.predefinedCharSetLowNames:
                    pass
                else:
                    try:
                        aSubset.tokens[i] = int(tok)
                    except ValueError:
                        gm.append("CharPartition '%s' definition:" % self.name)
                        gm.append("Can't understand '%s' in subset '%s' definition" % (
                            tok, aSubset.name))
                        gm.append(
                            "(If you are using read('whatever'), and there are backslashes,")
                        gm.append(
                            "are you using raw strings, ie read(r'whatever')?)")
                        raise P4Error(gm)

    def _rejectRangeAfter(self, gm, aSubset, i, subject):
        """Raise P4Error if aSubset.tokens[i] is a '-' or a backslash.

        subject is the start of the error sentence; it describes the item
        that may not be used as the start of a range.
        """
        if i < len(aSubset.tokens):
            follower = aSubset.tokens[i]
            if follower == '-' or follower == '\\':
                gm.append("CharPartition '%s' definition" % self.name)
                gm.append("Subset '%s' definition" % aSubset.name)
                gm.append("%s may not be followed by a '%s'" % (subject, follower))
                raise P4Error(gm)

    def _makeSubsetTriplets(self, aSubset, gm):
        """Condense aSubset.tokens into aSubset.triplets."""
        # Triplets represent eg 23-87\3
        # first item = 23, second item = 87, third = 3
        # Not all will exist for each part of the char definition.
        toks = aSubset.tokens
        nToks = len(toks)
        aSubset.triplets = []
        i = 0
        while i < nToks:
            tok = toks[i]
            if isinstance(tok, str):
                lowTok = tok.lower()
            else:
                lowTok = None

            if lowTok and lowTok in self.nexusSets.charSetLowNames or \
                    lowTok in self.nexusSets.predefinedCharSetLowNames:
                aSubset.triplets.append([lowTok, None, None])
                i += 1
                self._rejectRangeAfter(gm, aSubset, i, "An existing char set")
            elif lowTok in ['all', 'remainder']:
                aSubset.triplets.append([lowTok, None, None])
                i += 1
                if lowTok == 'remainder' and i < nToks:
                    gm.append("CharPartition '%s' definition" % self.name)
                    gm.append("Subset '%s' definition" % aSubset.name)
                    gm.append(
                        "Char set 'remainder' must be the last one in the charPartition definition")
                    raise P4Error(gm)
                self._rejectRangeAfter(gm, aSubset, i, "Char set '%s'" % lowTok)
            elif tok == '-':
                gm.append("CharPartition '%s' definition" % self.name)
                gm.append("Subset '%s' definition" % aSubset.name)
                gm.append("Out of place '-'")
                raise P4Error(gm)
            elif tok == '\\':
                gm.append("CharPartition '%s' definition" % self.name)
                gm.append("Subset '%s' definition" % aSubset.name)
                gm.append("Out of place '\\'")
                raise P4Error(gm)
            elif tok == '.':
                aSubset.triplets.append([tok, None, None])
                i += 1
                self._rejectRangeAfter(gm, aSubset, i, "Char set '.'")
            elif isinstance(tok, int):
                # A number, possibly the start of a range, which may in
                # turn carry a step value.
                aTriplet = [tok, None, None]
                i += 1
                if i < nToks and toks[i] == '-':
                    i += 1
                    if i < nToks:
                        second = toks[i]
                        if second == '.':
                            aTriplet[1] = second
                        elif isinstance(second, int):
                            if second > aTriplet[0]:
                                aTriplet[1] = second
                            else:
                                gm.append(
                                    "CharPartition '%s' definition" % self.name)
                                gm.append(
                                    "Subset '%s' definition" % aSubset.name)
                                gm.append(
                                    "Second number of a character range must be bigger than")
                                gm.append("the first.")
                                raise P4Error(gm)
                        else:
                            gm.append(
                                "CharPartition '%s' definition" % self.name)
                            gm.append(
                                "Subset '%s' definition" % aSubset.name)
                            gm.append(
                                "Second item of a character range must be either a")
                            gm.append(
                                "number or a '.'. I got '%s'" % second)
                            raise P4Error(gm)
                        i += 1
                        if i < nToks and toks[i] == '\\':
                            i += 1
                            if i < nToks:
                                step = toks[i]
                                if not isinstance(step, int):
                                    gm.append(
                                        "CharPartition '%s' definition" % self.name)
                                    gm.append(
                                        "Subset '%s' definition" % aSubset.name)
                                    gm.append(
                                        "Step value of a range must be a number")
                                    gm.append("(Got '%s')" % step)
                                    raise P4Error(gm)
                                aTriplet[2] = step
                                i += 1
                aSubset.triplets.append(aTriplet)
            else:
                gm.append("CharPartition '%s' definition" % self.name)
                gm.append("Subset '%s' definition" % aSubset.name)
                gm.append("token '%s' is not understood." % tok)
                raise P4Error(gm)

    def _makeSubsetMask(self, aSubset, gm):
        """Turn aSubset.triplets into aSubset.mask, a string of '0' and '1'."""
        nChar = self.nexusSets.aligNChar
        aSubset.mask = ['0'] * nChar
        for aTriplet in aSubset.triplets:
            first = aTriplet[0]
            second = aTriplet[1]
            third = aTriplet[2]
            lowFirst = None
            if isinstance(first, str):
                lowFirst = first.lower()

            if first and not second:  # its a single
                if lowFirst == 'all':
                    for i in range(nChar):
                        aSubset.mask[i] = '1'
                elif lowFirst in self.nexusSets.predefinedCharSetLowNames:
                    theCS = None
                    if lowFirst == 'constant':
                        theCS = self.nexusSets.constant
                    elif lowFirst == 'gapped':
                        theCS = self.nexusSets.gapped
                    assert theCS
                    assert theCS.mask
                    for j in range(nChar):
                        if theCS.mask[j] == '1':
                            aSubset.mask[j] = '1'
                elif lowFirst in self.nexusSets.charSetLowNames:
                    theCS = None
                    for cs in self.nexusSets.charSets:
                        if lowFirst == cs.lowName:
                            theCS = cs
                            break
                    assert theCS
                    assert theCS.mask
                    for j in range(nChar):
                        if theCS.mask[j] == '1':
                            aSubset.mask[j] = '1'
                # Its legit to use this as a single char.
                elif first == '.':
                    aSubset.mask[-1] = '1'
                elif isinstance(first, int):
                    if first > 0 and first <= nChar:
                        aSubset.mask[first - 1] = '1'
                    else:
                        gm.append("CharPartition '%s' definition" % self.name)
                        gm.append("Subset '%s' definition" % aSubset.name)
                        gm.append("Charset definition element '%s' is out of range" % first)
                        gm.append("(aligNChar = %i)" % nChar)
                        raise P4Error(gm)
                elif lowFirst == 'remainder':
                    # Everything, minus whatever the subsets before the
                    # last one already cover.
                    for i in range(nChar):
                        aSubset.mask[i] = '1'
                    for ss in self.subsets[:-1]:
                        if ss.mask:
                            for j in range(nChar):
                                if ss.mask[j] == '1':
                                    aSubset.mask[j] = '0'
                        else:
                            gm.append("CharPartition '%s' definition" % self.name)
                            gm.append("Subset '%s' definition" % aSubset.name)
                            gm.append("When implementing 'remainder' charset")
                            # Report the subset by name, not by object repr.
                            gm.append("Found that subset '%s' had no mask" % ss.name)
                            raise P4Error(gm)
                else:
                    gm.append("CharPartition '%s' definition" % self.name)
                    gm.append("Subset '%s' definition" % aSubset.name)
                    gm.append("Charset definition element '%s' is not understood" % first)
                    raise P4Error(gm)

            elif first and second:  # its a range
                try:
                    start = int(first)
                except ValueError:
                    gm.append("CharPartition '%s' definition" % self.name)
                    gm.append("Subset '%s' definition" % aSubset.name)
                    gm.append("Can't parse definition element '%s'" % first)
                    raise P4Error(gm)
                if second == '.':
                    fin = len(aSubset.mask)
                else:
                    try:
                        fin = int(second)
                    except ValueError:
                        gm.append(
                            "CharPartition '%s' definition" % self.name)
                        gm.append("Subset '%s' definition" % aSubset.name)
                        gm.append("Can't parse definition element '%s'" % second)
                        raise P4Error(gm)
                if third:
                    try:
                        bystep = int(third)
                    except ValueError:
                        gm.append("CharPartition '%s' definition" % self.name)
                        gm.append("Subset '%s' definition" % aSubset.name)
                        gm.append("Can't parse definition element '%s'" % third)
                        # This raise was missing, so a bad step carried on
                        # with bystep unset.
                        raise P4Error(gm)
                    for spot in range(start - 1, fin, bystep):
                        aSubset.mask[spot] = '1'
                else:
                    for spot in range(start - 1, fin):
                        aSubset.mask[spot] = '1'

        aSubset.mask = ''.join(aSubset.mask)
        if aSubset.mask.count('1') == 0:
            gm.append("The mask for charPartitionSubset '%s' is empty." % aSubset.name)
            raise P4Error(gm)

    def setSubsetMasks(self):
        """Make charParititionSubset.mask's appropriate to the Alignment.

        This is called by theAlignment.setCharPartition().

        For each subset in turn the tokens are condensed into triplets and
        the triplets are turned into a mask of length
        self.nexusSets.aligNChar.

        Raises:
            P4Error: if a subset definition is not understood, is out of
                range, or gives an empty mask.
        """
        gm = ['CharPartition.setSubsetMasks()']
        assert self.nexusSets.aligNChar
        for aSubset in self.subsets:
            self._makeSubsetTriplets(aSubset, gm)
            self._makeSubsetMask(aSubset, gm)

    def checkForOverlaps(self):
        """Raise P4Error if any position is in more than one subset.

        The subset masks must have been set already.
        """
        gm = ['CharParitition.checkForOverlaps()']
        unspanned = 0
        for i in range(self.nexusSets.aligNChar):
            nHits = 0
            for aSubset in self.subsets:
                if aSubset.mask[i] == '1':
                    nHits += 1
            if nHits > 1:
                gm.append("Char partition '%s'" % self.name)
                gm.append("The problem is that there are overlapping subsets in this")
                gm.append("charpartition. The same position is in more than one subset.")
                gm.append("Zero-based position %i, one-based position %i." % (i, i + 1))
                raise P4Error(gm)
            if nHits < 1:
                unspanned = 1
        if unspanned:
            # NOTE(review): this advisory is assembled but never printed
            # or raised, so an unspanned partition passes silently.
            gm.append("Char partition '%s'" % self.name)
            gm.append("You should be aware that this partition does not span")
            gm.append("the entire sequence. Hopefully that is intentional.")

    def dump(self):
        """Print the name, tokens and subsets of this partition to stdout."""
        print(" CharPartition: name: %s" % p4.func.nexusFixNameIfQuotesAreNeeded(self.name))
        print(" tokens: %s" % self.tokens)
        print(" number of subsets: %s" % len(self.subsets))
        for aSubset in self.subsets:
            aSubset.dump()

    def writeNexusToOpenFile(self, flob):
        """Write this partition as one Nexus charPartition command to flob."""
        flob.write(' charPartition %s = ' % self.name)
        for aSubset in self.subsets[:-1]:
            aSubset.writeNexusToOpenFile(flob)
            flob.write(', ')
        self.subsets[-1].writeNexusToOpenFile(flob)
        flob.write(';\n')

    def mask(self):
        """Return a mask string with a '1' wherever any subset has a '1'.

        The subset masks are (re)made first, via setSubsetMasks().

        Raises:
            P4Error: if self.nexusSets.aligNChar is not set.
        """
        if not self.nexusSets.aligNChar:
            # This used to read self.theNexusSets, which does not exist,
            # and so failed with an AttributeError.
            gm = ['CharPartition.mask()']
            gm.append("self.nexusSets.aligNChar is not set, so no mask can be made.")
            raise P4Error(gm)
        self.setSubsetMasks()
        m = ['0'] * self.nexusSets.aligNChar
        for i in range(self.nexusSets.aligNChar):
            for aSubset in self.subsets:
                if aSubset.mask[i] == '1':
                    m[i] = '1'
        return ''.join(m)