Source code for p4.sequence

import sys
import re
import string
import os
import io
import p4.func
import copy
from p4.var import var
from p4.p4exceptions import P4Error
from subprocess import Popen, PIPE

[docs]class Sequence(object):

    """A container for a single molecular sequence.

    Data attributes

    - **sequence** a string, the molecular sequence
    - **name**     a string, the name
    - **dataType** either 'dna', or 'protein', or None, meaning 'standard'
    """

    def __init__(self):
        self.name = None
        self.comment = None
        self.sequence = ''
        self.dataType = None

    def _getNChar(self):
        if self.sequence:
            return len(self.sequence)
        else:
            return 0

    nChar = property(_getNChar)
    """(property) return the length of the sequence, or zero"""

    # See the comment in alignment.py, for the same redefinition of __bool__().
    def __bool__(self):
        return True

    def __len__(self):
        if self.sequence:
            return len(self.sequence)
        else:
            return 0

[docs]    def dump(self):
        """Print rubbish about self."""
        print('%15s: %s' % ('name', self.name))
        if self.comment:
            print('%15s: %s' % ('comment', self.comment))
        # if self.dataType == 'dna':
           # if self.transl_table:
           #    print "%15s: %s' % ('transl_table", self.transl_table)
        if self.sequence:
            print('%15s: %s' % ('sequence', self.sequence[:25]), end=' ')
            if len(self.sequence) > 25:
                print("...")
            else:
                print('')

[docs]    def dupe(self):
        """Return a duplicate of self."""

        return copy.deepcopy(self)

[docs]    def reverseComplement(self):
        """Convert self.sequence, a DNA sequence, to its reverse complement.

        Ambigs are handled correctly.  I think.
        """

        assert self.dataType == 'dna'
        self.sequence = list(self.sequence)
        self.sequence.reverse()
        # {'b': 'cgt', 'd': 'agt', 'h': 'act', 'k': 'gt', 'm': 'ac',
        #  'n': 'acgt', 's': 'cg', 'r': 'ag', 'w': 'at', 'v': 'acg', 'y': 'ct'}  # 'x': 'acgt',
        for i in range(len(self.sequence)):
            c = self.sequence[i]
            if c == 'a':
                self.sequence[i] = 't'
            elif c == 't':
                self.sequence[i] = 'a'
            elif c == 'c':
                self.sequence[i] = 'g'
            elif c == 'g':
                self.sequence[i] = 'c'
            elif c == '-':
                pass
            elif c == 'n':
                pass
            # elif c == 'x':
            #    pass
            elif c == 'r':
                self.sequence[i] = 'y'
            elif c == 'y':
                self.sequence[i] = 'r'

            elif c == 'b':
                self.sequence[i] = 'v'
            elif c == 'd':
                self.sequence[i] = 'h'
            elif c == 'h':
                self.sequence[i] = 'd'
            elif c == 'k':
                self.sequence[i] = 'm'
            elif c == 'm':
                self.sequence[i] = 'k'
            elif c == 's':
                pass
                #self.sequence[i] = 's'
            elif c == 'w':
                pass
                #self.sequence[i] = 'w'
            elif c == 'v':
                self.sequence[i] = 'b'
            else:
                gm = ["Sequence.reverseComplement()"]
                if c in string.ascii_uppercase:
                    gm.append("Got uppercase '%s' How did that happen? -- can only handle lowercase." % c)
                else:
                    gm.append("Sequence.reverseComplement().  Got char '%s' What is it?" % c)
                raise P4Error(gm)

        self.sequence = ''.join(self.sequence)

[docs]    def writeFastaToOpenFile(self, flob, width=60, doComment=True, writeExtraNewline=True):
        flob.write('>%s' % self.name)
        if doComment and self.comment:
            flob.write(' %s' % self.comment)
        flob.write('\n')
        left = len(self.sequence)
        pos = 0
        if width > 0:
            while left >= width:
                if var.writeFastaUppercase:
                    flob.write('%s\n' % self.sequence[pos: pos + width].upper())
                else:
                    flob.write('%s\n' % self.sequence[pos: pos + width])
                pos = pos + width
                left = left - width
        if left > 0:
            if var.writeFastaUppercase:
                flob.write('%s\n' % self.sequence[pos:].upper())
            else:
                flob.write('%s\n' % self.sequence[pos:])
        if writeExtraNewline:
            flob.write('\n')

[docs]    def write(self):
        self.writeFastaToOpenFile(sys.stdout)

[docs]    def writeFasta(self, fName=None, width=60, doComment=True, writeExtraNewline=True):
        isFlob = False
        if not fName or fName == sys.stdout:
            f = sys.stdout
            isFlob = True
        elif hasattr(fName, 'write'):  # an open file-like object
            f = fName
            isFlob = True
        else:
            f = open(fName, 'w')
        self.writeFastaToOpenFile(f, width=width, doComment=doComment, writeExtraNewline=writeExtraNewline)
        if not isFlob:
            f.close()

[docs]    def translate(self, transl_table=1, checkStarts=False, nnn_is_gap=False):
        """Returns a protein Sequence from self, a DNA sequence.

        Self is translated using
        :meth:`p4.geneticcode.GeneticCode.translate`, so it handles
        ambiguities.  At the moment, we can only do translations where the
        frame of the codon is 123, ie the first sequence position is the
        first position of the codon.  The default transl_table is the
        standard (or so-called universal) genetic code, but you can change
        it.

        Other available translation tables, this week::

            if transl_table == 1: # standard
            elif transl_table == 2: # vertebrate mito
            elif transl_table == 4: # Mold, Protozoan,
                                    # and Coelenterate Mitochondrial Code
                                    # and the Mycoplasma/Spiroplasma Code
            elif transl_table == 5: # invertebrate mito
            elif transl_table == 9: # echinoderm mito

            and now 6, 10, 11, 12, 13, 14, 21.

        (These are found in :class:`p4.geneticcode.GeneticCode`)

        See also :meth:`p4.alignment.Alignment.checkTranslation` and
        :meth:`p4.alignment.Alignment.translate`.

        If the arg *checkStarts* is turned on (by default it is not turned
        on) then this method checks whether the first codon is a start
        codon, and if it is then it uses it.

        Arg *nnn_is_gap* is for odd sequences where there are long
        stretches of 'nnn' codons, which probably should be gaps.
        Probably best to correct those elsewise.

        """

        gm = ['Sequence.translate()']
        if self.dataType != 'dna':
            gm.append("Self should be a DNA Sequence")
            raise P4Error(gm)

        if self.nChar % 3 != 0:
            gm.append("The length of self should be a multiple of 3")
            raise P4Error(gm)
        nTriplets = int(self.nChar / 3)

        from p4.geneticcode import GeneticCode
        gc = GeneticCode(transl_table)

        prSeq = Sequence()
        prSeq.dataType = 'protein'
        prSeq.name = self.name
        prSeq.sequence = ['-'] * nTriplets

        dnaSeq = self.sequence
        protSeq = prSeq.sequence
        for j in range(nTriplets):
            theCodon = dnaSeq[(j * 3):(j * 3) + 3]
            # print theCodon
            if theCodon == '---':
                protSeq[j] = '-'
            elif theCodon.count('-'):
                print("    seq %s, position %4i, dnaSeq %4i, codon '%s' is incomplete" % (self.name, j, (j * 3), theCodon))
            elif theCodon == 'nnn':
                if nnn_is_gap:
                    print("    seq %s, position %4i, dnaSeq %4i, codon '%s' translating to a gap ('-')" % (
                        self.name, j, (j * 3), theCodon))
                    protSeq[j] = '-'
                else:
                    protSeq[j] = 'x'
            else:
                protSeq[j] = gc.translate(theCodon)
                if checkStarts and j == 0:
                    if theCodon in gc.startList:
                        sys.stderr.write("    Seq %s. The first codon, '%s', is a start codon -- making it m\n" % (
                            self.name, theCodon))
                        protSeq[j] = 'm'
                    else:
                        sys.stderr.write("    Seq %s. The first codon, '%s', is not a start codon\n" % (
                            self.name, theCodon))

        # Get rid of stop translation '*'
        if prSeq.sequence[-1] == '*':
            prSeq.sequence.pop()

        prSeq.sequence = ''.join(prSeq.sequence)
        return prSeq

[docs]    def checkTranslation(self, theProteinSequence, transl_table=1, checkStarts=False):
        """Check that self translates to theProteinSequence

        Self should be a DNA sequence.  It is translated using
        :meth:`p4.geneticcode.GeneticCode.translate` (so it should handle
        ambiguities) and compared against theProteinSequence.  The
        theProteinSequence name and gap pattern should
        be the same as in the DNA sequence.  The default transl_table is
        the standard (or so-called universal) genetic code.

        Other available translation tables, this week::

            if transl_table == 1:   # standard
            elif transl_table == 2: # vertebrate mito
            elif transl_table == 4: # Mold, Protozoan,
                                    # and Coelenterate Mitochondrial Code
                                    # and the Mycoplasma/Spiroplasma Code
            elif transl_table == 5: # invertebrate mito
            elif transl_table == 9: # echinoderm mito

            # and now 6, 10, 11, 12, 13, 14, 21.

        (These are found in :class:`~p4.geneticcode.GeneticCode`)

        See also :meth:`p4.alignment.Alignment.translate` and 
        :meth:`p4.alignment.Alignment.checkTranslation`

        If the arg *checkStarts* is turned on (by default it is not turned
        on) then this method checks whether the first codon is a start
        codon.

        """

        gm = ['Sequence.checkTranslation()']
        if self.dataType != 'dna':
            gm.append("Self should be a DNA sequence.")
            raise P4Error(gm)
        if not theProteinSequence or \
                not isinstance(theProteinSequence, p4.sequencelist.Sequence) or \
                theProteinSequence.dataType != 'protein':
            gm.append("Something wrong with theProteinSequence")
            raise P4Error(gm)

        if self.name != theProteinSequence.name:
            gm.append("The sequence names of self and theProteinSequence are not the same")
            raise P4Error(gm)

        if self.nChar != (3 * theProteinSequence.nChar):
            gm.append("The length of the DNA sequence should be 3 times that of theProteinSequence")
            gm.append("DNA sequence (self):  %i" % self.nChar)
            gm.append("Protein sequence:     %i  ( * 3 = %i)" %
                      (theProteinSequence.nChar, (3 * theProteinSequence.nChar)))
            raise P4Error(gm)

        gc = p4.geneticcode.GeneticCode(transl_table)

        pLen = theProteinSequence.nChar
        crimes = 0
        for j in range(pLen):
            theCodon = self.sequence[(3 * j) + 0] + \
                self.sequence[(3 * j) + 1] + \
                self.sequence[(3 * j) + 2]
            if theCodon == '---':
                if theProteinSequence.sequence[j] != '-':
                    print("    position %4i, codon '---' is '%s', should be '-'" % (j, theProteinSequence.sequence[j]))
                    crimes += 1
            elif theCodon.count('-'):
                print("    position %4i, codon '%s' is incomplete" % (j, theCodon))
                crimes += 1
            # elif theCodon in gc.code:
            #     if gc.code[theCodon] != theProteinSequence.sequence[j]:
            #         print "    position %4i, codon '%s' is '%s', should be '%s'" % (
            #             j, theCodon, theProteinSequence.sequence[j], gc.code[theCodon])
            #         crimes += 1
            # else:
            #     print("    position %4i, codon '%s' is not a known codon" % (j, theCodon))
            #     crimes += 1
            else:
                tr = gc.translate(theCodon)
                if tr != theProteinSequence.sequence[j]:
                    print("    position %4i, codon '%s' is '%s', should be '%s'" % (
                        j, theCodon, theProteinSequence.sequence[j], gc.code[theCodon]))
                    crimes += 1

                # If arg checkStarts is turned on -- Is the first
                # codon a start?  -- if not, it is not a crime
                if checkStarts and j == 0:
                    if theCodon in gc.startList:
                        print("    Seq %i (%s). The first codon, '%s', is a start codon" % (i, self.name, theCodon))
                    else:
                        print("    Seq %i (%s). The first codon, '%s', is not a start codon" % (i, self.name, theCodon))
            if crimes > 6:
                break
        if crimes > 6:
            print("    ... and possibly others, skipped.")
p4

Navigation

Related Topics

Source code for p4.sequence