Source code for p4.alignment

from p4.sequence import Sequence
from p4.sequencelist import SequenceList
from p4.nexussets import NexusSets
from p4.p4exceptions import P4Error
import string
import copy
import os
import math
import string
import re
import sys
from p4.nexussets import CharSet
import subprocess
from p4.distancematrix import DistanceMatrix
from p4.var import var
from p4.part import Part
import numpy
import numpy.linalg

cListPat = re.compile('(\d+)-?(.+)?')
cList2Pat = re.compile('(.+)\\\\(\d+)')
cListAllPat = re.compile('all\\\\?(\d+)?')

longMessage1 = """
 You may want to do the alignment method 

   checkForDuplicateSequences(removeDupes=True, makeDict=True)

 which keeps the first of the duplicate sequences, and removes the
 subsequent duplicates from the alignment.  If makeDict is set (it is
 set by default), it renames the first to p4Dupe1 (or p4Dupe2, ...),
 and saves a dictionary with all the names of the dupes in a file (by
 default 'p4DupeSeqRenameDict.py', but you can change that file name)
 to facilitate programmatic restoration of names to any output that
 you may get (generally a tree).

 To restore the taxa to your resulting tree you can use the Tree
 method restoreDupeTaxa().

 (You are probably seeing this message because the variable
 var.doCheckForDuplicateSequences is set.  If you do not want to
 automatically check for duplicate sequences when you read in an
 alignment, then turn that variable off.)

"""


class ExcludeDelete(object):

    """A class for Alignment exclude/include and delete/restore.

       - Exclude         for character sites.   Works.
       - Include         for character sites.   No Workee!
       - Delete/restore  for taxa.              No Workee!

    In Nexus-speak, when you exclude character sites, the remaining
    sites are not renumbered.  So if we have an alignment of one short
    sequence, as::

        123456789    1-based char numbers
        acgtacgta    the sequence

    and I exclude chars 4, 5, and 6, I get::

        123789    1-based char numbers
        acggta    the sequence

    Now what if I want to exclude sites 2 and 8 from the original as
    well?  It would not work if I re-numbered the characters; I need
    access to the original numbers.  This class provides that access.

    It involves making a copy of the original Alignment.  This is only
    done if needed-- ie it is not done routinely-- it is only done if
    a call to exclude or delete is made.  A mask (list) is used; that
    mask starts out as all 1's.  If a character is to be excluded, the
    mask is set to zero for that position.  So in the above example,
    we could do this::

        a=alignment()
        a.excludeCharSet('e4_5_6')
        a.excludeCharSet('e2_8')
        print(a.excludeDelete.mask)      # [1, 0, 1, 0, 0, 0, 1, 0, 1]
        print(a.sequences[0].sequence)   # agga
        print(a.excludeDelete.sequences[0].sequence)   # acgtacgta

    The Alignment interface stuff is only partly written.  We have::

        Alignment.excludeCharSet()   # works

    but we do not have an ``include()`` yet, and we do not have either
    ``deleteTaxaSet()`` or ``restore()`` yet.

    """

    def __init__(self, alignment):
        self.alignment = alignment
        self.length = alignment.length
        self.sequences = copy.deepcopy(alignment.sequences)
        #self.taxNames = copy.deepcopy(alignment.taxNames)
        self.mask = [1] * self.length
        self.excludedCharSets = []

    def dump(self):
        print("\n  ExcludeDelete dump.")
        print("        length = %i" % self.length)
        print("        mask = ", end=' ')
        upper = 30
        if self.length < upper:
            upper = self.length
        for i in range(upper):
            print("%1i" % self.mask[i], end=' ')
        if upper == self.length:
            print('')
        else:
            print(" ...")
        print("        %i excludedCharSets:" % len(self.excludedCharSets))
        for cs in self.excludedCharSets:
            print("            %s" % cs.name)

    def _resetMask(self):
        self.mask = [1] * self.length
        for cs in self.excludedCharSets:
            if not cs.mask:
                cs.setMask()
            for i in range(cs.aligNChar):
                if cs.mask[i] == '1':
                    self.mask[i] = 0

    def _resetSequences(self):
        for sNum in range(len(self.alignment.sequences)):
            s = []
            for i in range(len(self.mask)):
                if self.mask[i]:
                    s.append(self.sequences[sNum].sequence[i])
            self.alignment.sequences[sNum].sequence = ''.join(s)
        self.alignment.length = len(self.alignment.sequences[0].sequence)


[docs]class Alignment(SequenceList): """A SequenceList where all the sequences are the same length and dataType. From SequenceList, this inherits * ``sequences``, a list of :class:`~p4.sequence.Sequence` objects * ``fName``, the file name, by default None * ``sequenceForNameDict``, a dictionary which allows you to get a Sequence given its name. This week it is not made by default --- you need to make it explicitly with the inherited method :meth:`~p4.sequencelist.SequenceList.makeSequenceForNameDict`. * and the other methods from :class:`~p4.sequencelist.SequenceList` Alignment objects additionally have the ``length``, which has a synonym ``nChar``, and is also accessible via ``len(self)``. Also: * ``nTax``, the number of sequences * ``taxNames``, a list of the names of the sequences * ``dataType``, a NEXUS-centric string --- 'dna', 'protein', or 'standard' * ``symbols``, the character symbols, not including equates or ambiguities * ``equates``, a dictionary of NEXUS-style ambiguities, eg ``'r':['a','g']`` for DNA * ``dim``, the dimension, ie the number of symbols * ``nexusSets``, a :class:`~p4.nexussets.NexusSets` object, usually made from a copy of ``var.nexusSets``, but made specific to self **Various checks are made when alignments are read in from files** .. currentmodule:: p4.alignment .. autosummary:: :nosignatures: p4.sequencelist.SequenceList.checkNamesForDupes ~Alignment.checkForAllGapColumns ~Alignment.checkForBlankSequences ~Alignment.checkForDuplicateSequences ~Alignment.checkLengthsAndTypes The checks above are under the control of some variables in :py:class:`~p4.var.Var`, always available to you as the instance ``var``. * ``var.doCheckForDuplicateSequenceNames`` * ``var.doRepairDupedTaxonNames`` * ``var.doCheckForAllGapColumns`` * ``var.doCheckForBlankSequences`` * ``var.doCheckForDuplicateSequences`` **Writing** .. autosummary:: :nosignatures: ~Alignment.writeNexus ~Alignment.writePhylip p4.sequencelist.SequenceList.writeFasta ~Alignment.writeMolphy **Copy self** .. autosummary:: :nosignatures: ~Alignment.dupe ~Alignment.noGapsOrAmbiguitiesCopy ~Alignment.bootstrap p4.data.Data.bootstrap ~Alignment.sequenceSlice **Masks are strings that are nChar long, usually just 0s and 1s** .. autosummary:: :nosignatures: ~Alignment.constantMask ~Alignment.simpleConstantMask ~Alignment.gappedMask ~Alignment.getAllGapsMask ~Alignment.getEnoughCharsMask ~Alignment.getLikelihoodTopologyInformativeSitesMask ~Alignment.getMaskForAutapomorphies ~Alignment.getMaskForCharDiversity You can also make masks with :func:`p4.func.maskFromNexusCharacterList`. You can combine masks bitwise with the Alignment methods :meth:`~Alignment.andMasks` and :meth:`~Alignment.orMasks` **CharSets and CharPartitions** A NEXUS-style charSet or charPartition is usually defined using a NEXUS sets block, as explained in :class:`p4.nexussets.NexusSets`. A NEXUS sets block might be something like:: begin sets; charset pos1 = 1 - .\3; charset pos2 = 2 - .\3; charset pos3 = 3 - .\3; end; or :: begin sets; charset gene1 = 1 - 103; charset gene2 = 104 - 397; charpartition by_gene = gene1:gene1, gene2:gene2; end; However, you can also make new charSets from an Alignment.nexusSets with a mask using :meth:`p4.nexussets.NexusSets.newCharSet` CharPartitions must cover the entire length of self, without overlap of the character partition subsets. Whenever you do something like :meth:`Alignment.subsetUsingCharSet` that requires ``self.nexusSets``, if ``self.nexusSets`` does not exist yet then :meth:`Alignment.setNexusSets` is automatically called to make it --- so usually you do not need to do :meth:`Alignment.setNexusSets`. .. autosummary:: :nosignatures: Alignment.excludeCharSet Alignment.setCharPartition Alignment.setGBlocksCharSet Alignment.setNexusSets p4.nexussets.NexusSets.newCharSet p4.nexussets.NexusSets.dupeCharSet **Extracting subsets** .. autosummary:: :nosignatures: Alignment.subsetUsingCharSet Alignment.subsetUsingMask **Recoding the data into groups** .. autosummary:: :nosignatures: Alignment.recodeDayhoff Alignment.recodeProteinIntoGroups Alignment.recodeRY **Translating DNA to protein** .. autosummary:: :nosignatures: Alignment.translate Alignment.checkTranslation """ from p4.alignment_manip import simpleConstantMask, constantMask,gappedMask,getLikelihoodTopologyInformativeSitesMask,getMaskForAutapomorphies,getMaskForCharDiversity,getCharDiversityDistribution,orMasks,andMasks,sequenceSlice,bluntEndLigate, concatenate, constantSitesProportion, constantSitesCount, noGapsOrAmbiguitiesCopy, hasGapsOrAmbiguities, bootstrap, compositionEuclideanDistanceMatrix, covarionStats, pDistances, recodeDayhoff, recodeProteinIntoGroups, recodeRY, checkTranslation, translate, excludeCharSet, dupe, putGaps, setGBlocksCharSet, meanNCharsPerSite, getAllGapsMask, getEnoughCharsMask, simpleCharCounts, getSimpleBigF, matchedPairsTests, symtestAsInIQTreeNaserKhdour, testOverallFromAbabnehEtAl2006, getMinmaxChiSqGroups, getKosiolAISGroups, mrpSlice from p4.alignment_readwrite import readOpenPhylipFile, _readPhylipSequential, _readPhylipInterleaved, _readPhylipSequentialStrict, _readPhylipInterleavedStrict, _readOpenClustalwFile,writeNexus,writePhylip,writeMolphy,_readOpenGdeFile from p4.alignment_part import _initParts,initDataParts,resetSequencesFromParts,resetPartsContentFromSequences # from p4.alignment_logdet import logDet def __init__(self): SequenceList.__init__(self) # Inherited from SequenceList: #self.sequences = [] #self.fName = None #self.sequenceForNameDict = None self.length = 0 #: Nexus-centric data type. One of dna, rna, protein, or standard. self.dataType = None #self.sequences = [] #self.fName = None #self.sequenceForNameDict = None #: Lowercase string, eg 'acgt'. The order is all-important. self.symbols = None #: The number of symbols, eg 4 for DNA. self.dim = None #: A dictionary of NEXUS-style equates, eg r=[a,g] in DNA self.equates = {} # A hash #: A :class:`~p4.nexussets.NexusSets` object, perhaps copied from #: var.nexusSets and made specific to self. You can do a #: :meth:`p4.nexussets.NexusSets.dump` on it to see what is in #: there. self.nexusSets = None #: A list of Part objects, encapsulating data partitions in #: :class:`Data` objects. There would be one or more parts in #: an Alignment. self.parts = [] self.excludeDelete = None # An ExcludeDelete object @property def nTax(self): """Return the number of sequences""" return len(self.sequences) @property def nChar(self): """Return the length of the alignment""" return self.length @property def nEquates(self): """Return the number of equates""" return len(self.equates) def _getTaxNames(self): theTaxNames = [] for s in self.sequences: theTaxNames.append(s.name) return theTaxNames def _setTaxNames(self, theArg=None): gm = ["Alignment._setTaxNames()"] gm.append("Attempt to set Alignment taxNames.") gm.append("However, it is a property, so don't do that.") raise P4Error(gm) def _delTaxNames(self): gm = ["Alignment._delTaxNames()"] gm.append("Caught an attempt to delete self.taxNames, but") gm.append("self.taxNames is a property, so you shouldn't delete it.") raise P4Error(gm) taxNames = property(_getTaxNames, _setTaxNames, _delTaxNames) """A list of the names of the Sequences. """ def _getRows(self): return self.sequences def _getColumns(self): columns = [] for pos in range(self.nChar): column = [] for seqNum in range(0, self.nTax): column.append(self.sequences[seqNum].sequence[pos]) columns.append(column) return columns # Here I over-ride __bool__(). If the self.length len is zero, and I don't # have __bool__() redefined as below, then "assert self" will raise an # AssertionError, basing that response on the result of len(self). Having # __bool__() redefined here makes "assert self" work, even with no sequence # length. Previously, python 2 only, I had to over-ride __nonzero__() for # the same reason --- but that does not work with Python 3. # Checked July 2020, this is still needed. See similar in Sequence class. def __bool__(self): return True def __len__(self): return self.length
[docs] def checkLengthsAndTypes(self): """Last checks after reading an Alignment. Make sure the sequence lengths and dataType are all the same. Set self.length, self.dataType, self.symbols, self.dim, and self.equates """ gm = ["Alignment.checkLengthsAndTypes()"] if self.fName: gm.append("fName = %s" % self.fName) if len(self.sequences) > 1: len0 = len(self.sequences[0].sequence) type0 = self.sequences[0].dataType for i in range(len(self.sequences))[1:]: if len0 != len(self.sequences[i].sequence): gm.append("(Zero-based) sequence %i (%s) length (%i)," % (i, self.sequences[i].name, len(self.sequences[i].sequence))) gm.append( "is not the same as the first sequence length (%i)." % len0) raise P4Error(gm) if type0 != self.sequences[i].dataType: gm.append("Type of (zero-based) sequence %i (%s)," % (i, self.sequences[i].dataType)) gm.append( "is not the same as the first sequence type (%s)." % type0) raise P4Error(gm) self.length = len0 self.dataType = type0 elif len(self.sequences) == 1: self.length = len(self.sequences[0].sequence) self.dataType = self.sequences[0].dataType elif len(self.sequences) == 0: print(gm[0]) print(" The alignment has no sequences!") if self.dataType == 'dna': self.symbols = 'acgt' self.dim = 4 if not self.equates: self.equates = {'n': 'acgt', 'm': 'ac', 'k': 'gt', # 'x': 'acgt', 'h': 'act', 'y': 'ct', 'v': 'acg', 'w': 'at', 'd': 'agt', 'b': 'cgt', 'r': 'ag', 's': 'cg'} elif self.dataType == 'protein': self.symbols = 'arndcqeghilkmfpstwyv' self.dim = 20 if not self.equates: self.equates = { 'b': 'dn', 'x': 'arndcqeghilkmfpstwyv', 'z': 'eq'} elif self.dataType == 'standard': if not self.symbols: gm.append("symbols are missing.") raise P4Error(gm) if not self.dim: self.dim = len(self.symbols) if not self.equates: self.equates = {} elif self.dataType == 'rna': self.symbols = 'acgu' self.dim = 4 if not self.equates: self.equates = {'n': 'acgu', 'm': 'ac', 'k': 'gu', # 'x': 'acgu', 'h': 'acu', 'y': 'cu', 'v': 'acg', 'w': 'au', 'd': 'agu', 'b': 'cgu', 'r': 'ag', 's': 'cg'} else: gm.append("unknown dataType %s." % self.dataType) raise P4Error(gm)
[docs] def composition(self, sequenceNumberList=None): """Returns a list of compositions. This returns a list of floats, the composition of the sequence(s) in the sequenceNumberList. If the sequenceNumberList=None (the default), then the overall composition is given, which is the mean of the individual sequences (the sequence comps are not weighted by the sequence length, ie the non-gap sequence length). For DNA, the order is acgt. For protein, the order is arndcqeghilkmfpstwyv. For standard, its the order in symbols. Gaps and questionmarks are ignored. Equates are handled properly, iterating to the final comp. """ gm = ['Alignment.composition(sequenceNumberList=%s).' % sequenceNumberList] dbug = 0 # symbolFreq and equateFreq are hashes for the raw counts symbolFreq = {} for symb in self.symbols: symbolFreq[symb] = 0.0 if self.equates: equateFreq = {} for equate in self.equates: equateFreq[equate] = 0.0 hasEquates = 0 if dbug: print("Alignment.composition() sequenceNumberList = %s" % sequenceNumberList) if sequenceNumberList: if not isinstance(sequenceNumberList, list): gm.append("The sequenceNumberList should be a list, ok?") raise P4Error(gm) if len(sequenceNumberList) == 0: gm.append("The sequenceNumberList should have something in it, ok?") raise P4Error(gm) else: sequenceNumberList = range(len(self.sequences)) result = [0.0] * self.dim grandNSites = 0 import math epsilon = 1.0e-12 maxIterations = 1000 for i in sequenceNumberList: if not isinstance(i, int): gm.append("The sequenceNumberList should be integers, ok?") raise P4Error(gm) if i < 0 or i > len(self.sequences) - 1: gm.append("Item '%i' in sequenceNumberList is out of range" % i) raise P4Error(gm) seq = self.sequences[i] nGapsMissings = 0 for j in seq.sequence: if j == '-' or j == '?': nGapsMissings += 1 nSites = self.length - nGapsMissings grandNSites = grandNSites + nSites for symb in self.symbols: #symbolFreq[symb] = symbolFreq[symb] + float(seq.sequence.count(symb)) symbolFreq[symb] = float(seq.sequence.count(symb)) if self.equates: for equate in self.equates: # equateFreq[equate] = equateFreq[equate] + float(seq.sequence.count(equate)) equateFreq[equate] = float(seq.sequence.count(equate)) if equateFreq[equate]: hasEquates = 1 if dbug: print("symbolFreq = ", symbolFreq) print("equateFreq = ", equateFreq) initComp = 1.0 / self.dim comp = {} symbSum = {} for symb in self.symbols: comp[symb] = initComp symbSum[symb] = 0.0 for dummy in range(maxIterations): for symb in self.symbols: symbSum[symb] = symbolFreq[symb] if hasEquates: for equate in self.equates: if equateFreq[equate]: factor = 0.0 for symb in self.equates[equate]: factor = factor + comp[symb] for symb in self.equates[equate]: symbSum[symb] = symbSum[ symb] + (equateFreq[equate] * (comp[symb] / factor)) factor = 0.0 for symb in self.symbols: factor = factor + symbSum[symb] if not factor: gm.append('(Zero-based) sequence %i. Empty?' % i) gm.append( 'Perhaps exclude it by specifying arg sequenceNumberList.') gm.append('(Or use the Data/Part composition.)') raise P4Error(gm) diff = 0.0 for symb in self.symbols: oldComp = comp[symb] comp[symb] = symbSum[symb] / factor diff = diff + math.fabs(comp[symb] - oldComp) if 0: print("diff=%8.5f %3i " % (diff, i), end=' ') for symb in self.symbols: print("%s: %.6f " % (symb, comp[symb]), end=' ') print('') if diff < epsilon: # print("did %i iterations" % dummy) break for j in range(len(self.symbols)): result[j] = result[j] + (comp[self.symbols[j]] * nSites) for j in range(len(self.symbols)): result[j] = result[j] / grandNSites return result
# def subsetUsingCharPartition(self, charPartitionName, inverse=0): # """Return a subset of self based on a character partition. # A charpartition has one or more subsets, which together need # not span the length of self. This method would of course only # be useful if the charpartition does not span the entire # alignment-- otherwise you would get the whole alignment. This # method makes a new mask (using the CharPartition.mask() # method) which has a 1 wherever any subset of the charPartition # has a 1, and a zero otherwise. Returns an alignment. """ ## gm = ['Alignment.subsetUsingCharPartition()'] # if not self.nexusSets: # self.setNexusSets() # if not len(self.nexusSets.charPartitions): ## gm.append("This alignment has no charPartitions") ## raise P4Error(gm) ## theCP = None ## lowName = charPartitionName.lower() # for cp in self.nexusSets.charPartitions: # if cp.lowName == lowName: ## theCP = cp # break # if theCP == None: ## gm.append("This alignment has no charPartition named '%s'" % charPartitionName) ## raise P4Error(gm) # Get the mask ## m = theCP.mask(self.nexusSets, self) # print("The mask is: %s" % m) # if not inverse: ## a = self.subsetUsingMask(m, theMaskChar='1') # else: ## a = self.subsetUsingMask(m, theMaskChar='0') # return a # def subsetUsingCharPartitionSubset(self, charPartitionName, charPartitionSubsetName, inverse=0): # """Return a subset of self based on a charPartition subset. # A charPartition has one or more subsets, each of which can # have a mask. This method uses that mask to subset self. # Returns an alignment. """ ## gm = 'Alignment.subsetUsingCharPartitionSubset(charPartitionName=\'%s\'' % charPartitionName ## gm += ', charPartitionSubsetName=\'%s\', inverse=%s)' % (charPartitionSubsetName, inverse) ## gm = [gm] # if not self.nexusSets: # self.setNexusSets() # if not len(self.nexusSets.charPartitions): ## gm.append("This alignment has no charPartitions") ## raise P4Error(gm) # Find the charPartition ## theCP = None ## lowName = charPartitionName.lower() # for cp in self.nexusSets.charPartitions: # if cp.lowName == lowName: ## theCP = cp # break # if theCP == None: ## gm.append("This alignment has no charPartition named '%s'" % charPartitionName) ## raise P4Error(gm) # Find the charPartitionSubset ## theCPsubset = None ## lowName = charPartitionSubsetName.lower() # for cps in theCP.subsets: # if cps.lowName == lowName: ## theCPsubset = cps # break # if theCPsubset == None: # gm.append("The charPartition '%s' has no charPartitionSubset named '%s'" % \ # (charPartitionName, charPartitionSubsetName)) ## raise P4Error(gm) # Prepare the mask ## theCP.setSubsetMasks(self.nexusSets, self) # if not inverse: ## a = self.subsetUsingMask(theCPsubset.mask, theMaskChar='1') # else: ## a = self.subsetUsingMask(theCPsubset.mask, theMaskChar='0') # return a
[docs] def subsetUsingCharSet(self, charSetName, inverse=0): """Return a subset of self based on a charSet. A charset has a mask, composed of zeros and ones, which is used to subset self. Returns an alignment. For example:: read("myAlignment.nex") a = var.alignments[0] read('myNexusSets.nex') # with a charset named 'foo' b = a.subsetUsingCharSet('foo') b.writeNexus('myFooSubset.nex') """ gm = ['Alignment.subsetUsingCharSet(charSetName=\'%s\', inverse=%s)' % ( charSetName, inverse)] if not self.nexusSets: self.setNexusSets() theCS = None lowName = charSetName.lower() if lowName not in self.nexusSets.predefinedCharSetLowNames and lowName not in self.nexusSets.charSetLowNames: gm.append("This alignment has no charset named '%s'" % charSetName) raise P4Error(gm) if lowName in self.nexusSets.predefinedCharSetLowNames: if lowName == 'constant': theCS = self.nexusSets.constant elif lowName == 'gapped': theCS = self.nexusSets.gapped else: for cs in self.nexusSets.charSets: if cs.lowName == lowName: theCS = cs break if theCS == None: gm.append( "This should not happen -- alignment has no charset named '%s'" % charSetName) raise P4Error(gm) assert theCS.aligNChar assert theCS.mask # prepare the mask # if not theCS.mask: # theCS.setMask() # print("The mask is: %s" % theCS.mask) if len(theCS.mask) != self.length: gm.append("The length of the mask is %i, the length of the alignment is %i" % ( len(theCS.mask), self.length)) raise P4Error(gm) if not inverse: a = self.subsetUsingMask(theCS.mask, theMaskChar='1') else: a = self.subsetUsingMask(theCS.mask, theMaskChar='0') return a
[docs] def subsetUsingMask(self, theMask, theMaskChar='1', inverse=0): """Returns a subset of self based on a mask. This makes a copy of the alignment based on theMask. Arg theMask is a string, the same length as the alignment. The character theMaskChar determines which positions are included in the copy. If the character in theMask is theMaskChar, that position is included in the copy. Otherwise, no. If inverse is set, then theMaskChar determines which positions are excluded, and all other positions are included. """ gm = ['Alignment.subsetUsingMask()'] # Make sure theMask is the right length, depending on whether # characters have been excluded or not. if self.excludeDelete: if len(theMask) != self.excludeDelete.length: gm.append("The mask length (%i) does not" % len(theMask)) gm.append( "equal the (pre-exclude charSets) alignment length (%i)" % self.excludeDelete.length) raise P4Error(gm) else: if len(theMask) != self.length: gm.append("The mask length (%i) does not" % len(theMask)) gm.append("equal the alignment length (%i)" % self.length) raise P4Error(gm) if not isinstance(theMaskChar, str) or len(theMaskChar) != 1: gm.append("theMaskChar needs to be a single-character string") raise P4Error(gm) # first, make a copy a = copy.deepcopy(self) a.excludeDelete = None if len(a.parts): for i in a.parts: i.cPart = None a.parts = [] a.nexusSets = None theMask2 = [0] * len(theMask) if not inverse: for i in range(len(theMask2)): if theMask[i] == theMaskChar: theMask2[i] = 1 else: for i in range(len(theMask2)): if theMask[i] != theMaskChar: theMask2[i] = 1 if self.excludeDelete: for i in range(self.excludeDelete.length): if self.excludeDelete.mask[i] == 0: theMask2[i] = 0 # From now on, inverse or not does not apply to theMask2. # Nor does theMatchChar apply-- its just 1's and zeros. # If it is 0, exclude it. If it is 1, include it. a.length = theMask2.count(1) if a.length == 0: if not var.allowEmptyCharSetsAndTaxSets: gm.append("The mask has a length of zero.") gm.append("(Allow by turning var.allowEmptyCharSetsAndTaxSets on.)") raise P4Error(gm) # make a 2D array the same size as the sequences, filled. newList = [] for i in range(len(self.sequences)): one = ['a'] * a.length newList.append(one) # print("newList = ", newList) # fill the array with slices from the sequences k = 0 # print("self.length = %i, a.length = %i" % (self.length, a.length)) if self.excludeDelete: for i in range(len(theMask2)): if theMask2[i]: for j in range(len(self.excludeDelete.sequences)): newList[j][k] = self.excludeDelete.sequences[ j].sequence[i] k = k + 1 else: for i in range(len(theMask2)): if theMask2[i]: for j in range(len(self.sequences)): newList[j][k] = self.sequences[j].sequence[i] k = k + 1 # replace the sequences for i in range(len(self.sequences)): a.sequences[i].sequence = ''.join(newList[i]) a.checkLengthsAndTypes() if 0: from nexussets import NexusSets a.nexusSets = NexusSets() a.nexusSets.aligNChar = a.length a.nexusSets.setPredefinedCharSets(a) return a
[docs] def checkForDuplicateSequences(self, removeDupes=False, makeDict=True, dictFileName='p4DupeSeqRenameDict.py', dupeBaseName='p4Dupe'): """Like it says, with verbose output. If the p4 variable var.doCheckForDuplicateSequences is set, this method, with removeDupes=False, is called automatically every time an alignment is read in. If you would rather it not do that, turn var.doCheckForDuplicateSequences off. When these automatic checks are done, if any dupe sequences are found then a verbose warning is issued, inviting you to run it again with removeDupes turned on. If removeDupes is set, the duplicate sequences after the first are removed. This option is not set by default. If both removeDupes and makeDict are set, then it will rename the first sequence to p4Dupe1 (or p4Dupe2, and so on --- the dupeBaseName is 'p4Dupe' by default, but it can be set as an arg) and make a dictionary to hold the other names, and write that dictionary to a file (by default p4DupeSeqRenameDict.py). The option makeDict is set by default, but it won't happen unless removeDupes is also set, and there are dupes to be removed. """ #gm = ['Alignment.checkForDuplicateSequences()'] dupeNumPairs = [] dupes = [] firsts = {} doneDupeSeqNums = set([]) if removeDupes and makeDict: if os.path.isfile(dictFileName): gm = ['Alignment.checkForDuplicateSequences()'] gm.append("file '%s' already exists" % dictFileName) raise P4Error(gm) theRange = range(self.length) for i in range(len(self.sequences))[:-1]: if i not in doneDupeSeqNums: si = self.sequences[i].sequence for j in range(len(self.sequences))[i + 1:]: if j not in doneDupeSeqNums: sj = self.sequences[j].sequence # print("trying", i, j) isSame = True for k in theRange: if si[k] != sj[k]: isSame = False break # xfasta, with RNA structure line. if hasattr(self.sequences[i], 'parens'): sip = self.sequences[i].parens sjp = self.sequences[j].parens p_isSame = True for k in theRange: if sip[k] != sjp[k]: p_isSame = False break if isSame and not p_isSame: print("(One-based) Sequences %i and %i are the same," % (i + 1, j + 1), end=' ') print("but the structures differ.") # else: # print('ok') if isSame: dupeNumPairs.append([i, j]) doneDupeSeqNums.add(i) doneDupeSeqNums.add(j) # print(dupeNumPairs) if dupeNumPairs and not removeDupes: print() print("=" * 50) if self.fName: print(" Alignment from file '%s'" % self.fName) print(" This alignment has duplicate sequences!") print(" Sequence numbers below are 1-based.") for dp in dupeNumPairs: i = dp[0] j = dp[1] print(" sequence %i (%s) is the same as sequence %i (%s)." % ( i + 1, self.sequences[i].name, j + 1, self.sequences[j].name)) print(longMessage1) print("=" * 50) print() if dupeNumPairs and removeDupes: if makeDict: myDict = {} newNameCounter = 1 dpNum = 0 # dupe pair index while 1: dp = None try: dp = dupeNumPairs[dpNum] except IndexError: break i = dp[0] j = dp[1] iName = self.sequences[i].name jName = self.sequences[j].name newIName = "%s%i" % (dupeBaseName, newNameCounter) myDict[newIName] = [iName, jName] self.sequences[i].name = newIName newNameCounter += 1 # Get the other j's for the same i. while 1: dpNum += 1 dp = None try: dp = dupeNumPairs[dpNum] except IndexError: break if dp[0] == i: myDict[newIName].append(self.sequences[dp[1]].name) else: break f = open(dictFileName, 'w') f.write("p4DupeSeqRenameDict = %s\n" % myDict) f.close() # Remove the dupe sequences. toRemove = [] for dp in dupeNumPairs: j = dp[1] # remove the second, not the first toRemove.append(self.sequences[j]) #self.sequences[dp[0]].name += "_%s" % self.sequences[j].name for s in toRemove: self.sequences.remove(s) if self.nexusSets and self.nexusSets.taxSets: print() print("-" * 50) print("There are tax sets, possibly affected by dupe removal.") print("So I am removing them.") print("-" * 50) self.nexusSets.taxSets = []
[docs] def checkForBlankSequences(self, removeBlanks=False, includeN=True, listSeqNumsOfBlanks=False): """Like it says, with verbose output. If the p4 variable var.doCheckForBlankSequences is set, this method, with removeBlanks=False, is called automatically every time an alignment is read in. If you would rather it not do that, turn var.doCheckForBlankSequences off. When these automatic checks are done, if any blank sequences are found then a verbose P4Error is raised, inviting you to run it again with removeBlanks turned on. Sometimes you just want to know what sequences are blank; get the sequence numbers by turning arg listSeqNumsOfBlanks on. That returns a list of sequence numbers, without removing blanks or raising a P4Error. If removeBlanks is set, the blank sequences are removed. This option is not set by default. If this method removes sequences, it returns the number of blank sequences removed. Blank sequences are defined as sequences wholly composed of '?' and '-'. If includeN is turned on (which it is by default) then 'n' is included for DNA, and 'x' for protein. If that is done, that means, for DNA, that it is blank if it is wholly composed of '?', '-', and 'n'. """ gm = ['Alignment.checkForBlankSequences()'] if self.fName: gm.append("fName = %s" % self.fName) bChars = ['-', '?'] if includeN: if self.dataType == 'dna': bChars.append('n') elif self.dataType == 'protein': bChars.append('x') blankSeqs = [] seqNums = [] for seqNum in range(len(self.sequences)): seqObj = self.sequences[seqNum] isBlank = True for c in seqObj.sequence: if c not in bChars: isBlank = False break if isBlank: blankSeqs.append(seqObj) seqNums.append(seqNum) # print("=" * 50) # seqObj.write() if listSeqNumsOfBlanks: return seqNums if blankSeqs and not removeBlanks: gm.append("This alignment has %i blank sequences," % len(blankSeqs)) gm.append("wholly composed of %s." % bChars) gm.append("To remove them, re-run this method, with arg") gm.append("removeBlanks turned on.") gm.append( "To prevent checking, turn var.doCheckForBlankSequences off.") raise P4Error(gm) if blankSeqs and removeBlanks: for s in blankSeqs: self.sequences.remove(s) if self.nexusSets and self.nexusSets.taxSets: print() print("-" * 50) print("There are tax sets, possibly affected by blank sequence removal.") print("So I am removing them.") print("-" * 50) self.nexusSets.taxSets = [] return len(blankSeqs) return 0
[docs] def checkForAllGapColumns(self, returnMask=False): """Check for alignment columns that are made of only gap or ? chars. By default, p4 does this on new alignments that are read. It is under the control of var.doCheckForAllGapColumns. If there are all gap columns then a verbose output is printed and a P4Error is raised, unless ``returnMask`` is set, in which case no output is printed, no P4Error is raised, but the mask is returned. """ gm = ['Alignment.checkForAllGapColumns()'] if self.fName: gm.append("fName = %s" % self.fName) allGapPositions = [] firstSeqSequence = self.sequences[0].sequence if '-' not in firstSeqSequence and '?' not in firstSeqSequence: return for pos in range(self.nChar): if firstSeqSequence[pos] == '-' or firstSeqSequence[pos] == '?': allGaps = True for seqNum in range(1, self.nTax): c = self.sequences[seqNum].sequence[pos] if c == '-' or c == '?': pass else: allGaps = False break if allGaps: allGapPositions.append(pos) if allGapPositions: if returnMask: m = ['0'] * self.nChar for i in allGapPositions: m[i] = '1' return ''.join(m) else: gm.append("The following %i positions were composed solely of '-' or '?'" % len( allGapPositions)) gm.append("Zero-based numbering - %s" % allGapPositions) for i in range(len(allGapPositions)): allGapPositions[i] = allGapPositions[i] + 1 gm.append("One-based numbering - %s" % allGapPositions) nxsString = ' '.join(["%i" % i for i in allGapPositions]) gm.append('nexus charSet %s' % nxsString) gm.append("(To turn off auto-checking for all-gap columns,") gm.append("turn var.doCheckForAllGapColumns off.)") raise P4Error(gm)
[docs] def dump(self): """Print rubbish about self.""" print("\nAlignment dump:") if self.fName: print(" File name '%s'" % self.fName) if self.length: print(" Length is %s" % self.length) # if hasattr(self, 'nTax'): # print(" nTax is %i" % self.nTax) # if hasattr(self, 'nChar'): # print(" nChar is %i" % self.nChar) if hasattr(self, 'dataType'): print(" dataType is '%s'" % self.dataType) if hasattr(self, 'symbols'): print(" symbols are '%s'" % self.symbols) if self.equates: print(" equates") theKeys = list(self.equates.keys()) theKeys.sort() for k in theKeys: print("%20s %-30s" % (k, self.equates[k])) if self.nexusSets: if self.nexusSets.charSets: if len(self.nexusSets.charSets) == 1: print(" There is 1 charSet") else: print(" There are %i charSets" % len(self.nexusSets.charSets)) for cp in self.nexusSets.charSets: print(" %s" % cp.name) if self.nexusSets.charPartitions: if len(self.nexusSets.charPartitions) == 1: print(" There is 1 charPartition") else: print(" There are %i charPartitions" % len(self.nexusSets.charPartitions)) for cp in self.nexusSets.charPartitions: print(" %s" % cp.name) if self.nexusSets.charPartition: print(" The current charPartition is %s" % self.nexusSets.charPartition.name) else: print(" There is no current charPartition.") if self.excludeDelete: self.excludeDelete.dump() if len(self.parts): if len(self.parts) == 1: print(" There is %i part" % len(self.parts)) else: print(" There are %i parts" % len(self.parts)) for p in self.parts: print(" %s, length %i" % (p.name, p.nChar)) # SequenceList.dump(self) print(" There are %i sequences" % len(self.sequences)) upper = len(self.sequences) if upper > 5: upper = 5 for i in range(upper): print(" %4i %s" % (i, self.sequences[i].name)) if len(self.sequences) > upper: print(" <... and more ...>")
[docs] def setNexusSets(self): """Set self.nexusSets from var.nexusSets. A deepcopy is made of var.nexusSets, and then attached to self. Sometimes other Nexus-set related methods trigger this. If var.nexusSets does not yet exist, a new blank one is made. """ gm = ["Alignment.setNexusSets()"] if not var.nexusSets: var.nexusSets = NexusSets() self.nexusSets = copy.deepcopy(var.nexusSets) self.nexusSets.taxNames = self.taxNames self.nexusSets.nTax = self.nTax self.nexusSets.aligNChar = self.nChar self.nexusSets.constant.setAligNChar(self.nChar) self.nexusSets.gapped.setAligNChar(self.nChar) self.nexusSets.constant.mask = self.constantMask() self.nexusSets.gapped.mask = self.gappedMask() if self.nexusSets.charSets: for cs in self.nexusSets.charSets: cs.setAligNChar(self.nChar) cs.setMask() if self.nexusSets.taxSets: # print("%s. There are %i taxSets." % (gm[0], len(self.nexusSets.taxSets))) # Check that no taxSet name is a taxName lowSelfTaxNames = [txName.lower() for txName in self.taxNames] for ts in self.nexusSets.taxSets: if ts.lowName in lowSelfTaxNames: gm.append( "Can't have taxSet names that are the same (case-insensitive) as a tax name") gm.append( "Lowercased taxSet name '%s' is the same as a lowcased taxName." % ts.name) raise P4Error(gm) self.nexusSets.lowTaxNames = lowSelfTaxNames # If it is standard format, # convert triplets to numberTriplets, and then mask for ts in self.nexusSets.taxSets: if ts.format == 'standard': ts.setNumberTriplets() ts.setMask() # print(ts.mask) elif ts.format == 'vector': assert ts.mask if len(ts.mask) != self.nTax: gm.append("taxSet %s" % ts.name) gm.append( "It is vector format, but the length is wrong.") gm.append( "taxSet mask is length %i, but self nTax is %i" % (len(ts.mask), self.nTax)) raise P4Error(gm) else: gm.append("taxSet %s" % ts.name) gm.append("unknown format %s" % ts.format) raise P4Error(gm) # Now set ts.taxNames from the mask. Note self.taxNames is a property ts.taxNames = [self.taxNames[i] for i,c in enumerate(ts.mask) if c == '1'] if self.nexusSets.charPartitions: pass
[docs] def setCharPartition(self, charPartitionName): """Partition self into Parts based on charPartitionName. You need to do this before you ask for a Data object. You can also un-partition an already partitioned alignment by feeding this charPartitionName = None.""" gm = ["Alignment.setCharPartition('%s')" % charPartitionName] if not self.nexusSets: if not charPartitionName: print(gm[0]) print("setNexusSets() has not been done -- self has no nexusSets") print("yet we are doing setCharPartition, with no partition!") print("Its not an error, but are we a little confused?") return self.setNexusSets() self.nexusSets.charPartition = None self.parts = [] if not charPartitionName: return for cp in self.nexusSets.charPartitions: if cp.name == charPartitionName: self.nexusSets.charPartition = cp if not self.nexusSets.charPartition: gm.append( "Could not find a CharPartition with the name '%s'" % charPartitionName) raise P4Error(gm) self.nexusSets.charPartition.setSubsetMasks() self.nexusSets.charPartition.checkForOverlaps() if 0: print(gm[0]) self.nexusSets.charPartition.dump() self.dump()
[docs] def changeDataTypeTo(self, newDataType, newSymbols=None, newEquates=None): """Coerce the alignment to be a new datatype. This would be good for pathological cases where eg DNA with lots of ambigs is mistaken for protein. It is not sufficient to simply change the dataType -- we must change the symbols and equates as well. And the dataType for all the sequences in self. If you are changing to 'standard' dataType, then you need to specify the symbols and the equates. The symbols is a string, and the equates is a dictionary (see eg yourAlignment.equates for a DNA alignment to see the format). """ gm = ['Alignment.changeDataTypeTo(%s, newSymbols=%s)' % ( newDataType, newSymbols)] if newDataType not in ['dna', 'protein', 'standard']: gm.append( "newDataType must be one of 'dna', 'protein', 'standard'") raise P4Error(gm) if newDataType == self.dataType: gm.append("Self is already dataType %s" % self.dataType) raise P4Error(gm) if newDataType == 'standard': validChars = newSymbols + '-?' + ''.join(newEquates.keys()) # print("standard datatype: got validChars '%s'" % validChars) for s in self.sequences: if newDataType == 'dna': for c in s.sequence: if c not in var.validDnaChars: gm.append( "Sequence %s, char %s not a valid DNA character." % (s.name, c)) raise P4Error(gm) s.dataType = newDataType elif newDataType == 'protein': for c in s.sequence: if c not in var.validProteinChars: gm.append( "Sequence %s, char %s not a valid protein character." % (s.name, c)) raise P4Error(gm) s.dataType = newDataType if newDataType == 'standard': for c in s.sequence: if c not in validChars: gm.append( "Sequence %s, char '%s' not in valid chars '%s'." % (s.name, c, validChars)) raise P4Error(gm) s.dataType = newDataType self.dataType = newDataType if newDataType == 'dna': self.symbols = 'acgt' self.dim = 4 self.equates = {'n': 'acgt', 'm': 'ac', 'k': 'gt', # 'x': 'acgt', 'h': 'act', 'y': 'ct', 'v': 'acg', 'w': 'at', 'd': 'agt', 'b': 'cgt', 'r': 'ag', 's': 'cg'} elif newDataType == 'protein': self.symbols = 'arndcqeghilkmfpstwyv' self.dim = 20 self.equates = {'b': 'dn', 'x': 'arndcqeghilkmfpstwyv', 'z': 'eq'} elif newDataType == 'standard': self.symbols = newSymbols self.dim = len(newSymbols) self.equates = newEquates # Is this needed? Probably not. self.checkLengthsAndTypes()