Source code for p4.geneticcode

import string
import sys


[docs]class GeneticCode:

    """A container for NCBI translation tables.

    See the ncbi translation tables, which this week are at
    http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c

    (If they move, poke around the 'taxonomy browser' area.)

    This week we have

    - **1** standard
    - **2** vertebrate mito
    - **3** yeast mito
    - **4** Mold, Protozoan
    - **5** invertbrate mito
    - **6** The Ciliate, Dasycladacean and Hexamita Nuclear Code
    - **9** echinoderm and flatworm mito
    - **10**  Euplotid Nuclear Code
    - **11**  Bacterial and Plant Plastid Code
    -  **12** Alternative Yeast Nuclear Code
    -  **13** Ascidian Mitochondrial Code
    -  **14** Alternative Flatworm Mitochondrial Code
    -  **21** Trematode Mitochondrial Code
    -  **24** Pterobranchia mito

    If more transl_tables are needed, you should be able to just drop
    them in, with a little tweaking.

    This provides

    - **code**  A dictionary.  So you can ask for eg myGC.code['ggg']
    - **codonsForAA**  Another dictionary, where you can ask for eg myGC.codonsForAA['v']
    - **startList**  A list of start codons

    **Methods**
     .. autosummary::

        GeneticCode.translate
        GeneticCode.wise2Table

    Wikipedia says: The joint nomenclature committee of the
    IUPAC/IUBMB has officially recommended the three-letter symbol Sec
    and the one-letter symbol U for selenocysteine.  The UGA codon is
    made to encode selenocysteine by the presence of a SECIS element
    (SElenoCysteine Insertion Sequence) in the mRNA.

    """

    def __init__(self, transl_table=1):
        self.transl_table = transl_table
        self.code = {}
        self.codonsForAA = {}
        self.startList = []

        if transl_table == 1:  # standard
            AAs = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
            Starts = '---M---------------M---------------M----------------------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        elif transl_table == 2:  # vertebrate mito
            AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG'
            Starts = '--------------------------------MMMM---------------M------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        # 3. The Yeast Mitochondrial Code (transl_table=3)
        elif transl_table == 3:
            AAs = 'FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
            Starts = '----------------------------------MM----------------------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        elif transl_table == 4:  # Mold, Protozoan,
                                # and Coelenterate Mitochondrial Code and the
                                # Mycoplasma/Spiroplasma Code
            AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
            Starts = '--MM---------------M------------MMMM---------------M------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        elif transl_table == 5:  # invertebrate mito
            AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG'
            Starts = '---M----------------------------MMMM---------------M------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        # The Ciliate, Dasycladacean and Hexamita Nuclear Code (transl_table=6)
        elif transl_table == 6:
            AAs = 'FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
            Starts = '-----------------------------------M----------------------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        # tables 7 and 8 have been deleted from NCBI.

        elif transl_table == 9:  # echinoderm and flatworm mito
            AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG'
            Starts = '-----------------------------------M----------------------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        elif transl_table == 10:  # The Euplotid Nuclear Code (transl_table=10)
            AAs = 'FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
            Starts = '-----------------------------------M----------------------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        # The Bacterial and Plant Plastid Code (transl_table=11)
        elif transl_table == 11:
            AAs = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
            Starts = '---M---------------M------------MMMM---------------M------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        # The Alternative Yeast Nuclear Code (transl_table=12)
        elif transl_table == 12:
            AAs = 'FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
            Starts = '-------------------M---------------M----------------------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        # The Ascidian Mitochondrial Code (transl_table=13)
        elif transl_table == 13:
            AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG'
            Starts = '---M------------------------------MM---------------M------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        # The Alternative Flatworm Mitochondrial Code (transl_table=14)
        elif transl_table == 14:
            AAs = 'FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG'
            Starts = '-----------------------------------M----------------------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        # Trematode Mitochondrial Code (transl_table=21)
        elif transl_table == 21:
            AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG'
            Starts = '-----------------------------------M---------------M------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        # Pterobranchia mitochondrial code (transl_table=24)
        elif transl_table == 24:
            AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG'
            Starts = '---M---------------M---------------M---------------M------------'
            Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
            Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
            Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'

        else:
            print("GeneticCode: I don't know transl_table %i.  Get it from NCBI and add it!" % transl_table)
            sys.exit()

        for i in range(64):
            theCodon = Base1[i] + Base2[i] + Base3[i]
            theCodon = theCodon.lower()
            theAA = AAs[i].lower()
            self.code[theCodon] = theAA
            if theAA in self.codonsForAA:
                self.codonsForAA[theAA].append(theCodon)
            else:
                self.codonsForAA[theAA] = [theCodon]
            if Starts[i] == 'M':
                self.startList.append(theCodon)

        if 1:
            self.codonsForAA['b'] = self.codonsForAA[
                'n'] + self.codonsForAA['d']
            self.codonsForAA['z'] = self.codonsForAA[
                'q'] + self.codonsForAA['e']

        if 1:
            self.codonsForAA['u'] = ['tga']  # selenocysteine

        if 0:
            k = self.code.keys()
            k.sort()
            for aKey in k:
                print("%20s  %-30s" % (aKey, self.code[aKey]))

[docs]    def wise2Table(self):
        """Output in a form suitable to replace ``codon.table`` in ``genewise`` in Wise2.

        See the `Wise2 web site. <http://www.ebi.ac.uk/Tools/Wise2/doc_wise2.html>`_

        By default, ``genewise`` from the Wise2 package uses the
        standard genetic code, defined in the file ``codon.table``.
        However, you can supply your own table, and you can use this
        method to make a suitable file.  Due to lazy programming, this
        method prints to stdout, so you will need to put the output in
        a file yourself.  Put that file, suitably named (eg
        ``codon.table5`` or whatever) in the ``wisecfg`` directory
        (where the original ``codon.table`` resides), which might be
        ``/usr/local/src/wise2.2.0/wisecfg`` or some such location.

        Then, when you call ``genewise`` you can use the ``-codon``
        option to set the codon table file that you want to use, eg::

            genewise -genes -cdna -trans -pep -pretty -silent -codon codon.table5 guideFileName dnaFileName

        """

        print("! this is a codon table")
        print("! by p4, for ncbi %i" % self.transl_table)
        for first in "tcag":
            for second in "tcag":
                for third in "tcag":
                    lcod = "%s%s%s" % (first, second, third)
                    ret = self.code[lcod]
                    if ret == '*':
                        ret = 'X'
                    print(lcod.upper(), ret.upper())

[docs]    def translate(self, theCodon, verbose=1):
        """Translate a codon, handling ambiguities.

        This method will translate a codon, depending of course on the
        transl_table (which is specific to *self*), correctly handling
        ambiguities appropriate to the transl_table.  It does not give
        info about whether the codon is potentially a start codon.

        This method is used by the methods
        :meth:`Alignment.Alignment.translate` and
        :meth:`Alignment.Alignment.checkTranslation`.

        A translation like that from codon ``ggg`` to amino acid ``g``
        is direct and easy.  However, this method will also translate
        ambiguous codons ``ggy`` or ``ggs`` (and so on) to ``g``,
        unequivocally, because the four codons that start with ``gg``
        all code for ``g`` (which appears to be true for all
        translation tables, but that is not a requirement here).

        If all possible disambiguations of an ambiguous codon code for
        a particular amino acid then this method will return that
        amino acid; otherwise the translation is ambiguous and this
        method returns ``x``. 

        So for example the codon ``tgr`` will translate to ``x`` using
        transl_table = 1 (standard) because ``tga`` is a stop codon
        and ``tgg`` codes for ``w``.  However, using transl_table = 2
        (vertebrate mito), codon ``tgr`` will translate to ``w``
        because both ``tga`` and ``tgg`` code for ``w``::

            >>> gc = GeneticCode(transl_table=1)
            >>> gc.translate('tgr')
                #   codon 'tgr' translates to ['*', 'w'] -- ambiguous -- returning 'x'
            >>> gc = GeneticCode(transl_table=2)
            >>> gc.translate('tgr')
                #   codon 'tgr' translates to 'w'

        Exceptions to the latter rule are codons that ambiguously code
        for either ``d`` or ``n``, which return ambiguous amino acid
        ``b``, and codons that ambigously code for either ``q`` or
        ``e``, which return ambiguous amino acid ``z``.  See the
        example below.

        If arg *verbose* is 0, it does not speak (except for errors,
        of course).  If its 1, it speaks for ambiguous translations.
        If its 2, it speaks for all translations.  The default is 1::

            gc = GeneticCode(transl_table=1)

            for cdn in 'gga ggy ray ggn aam sar ccm'.split():
                gc.translate(cdn, verbose=2)

        prints::

            codon 'gga' translates to 'g'
            codon 'ggy' translates to 'g'
            codon 'ray' translates to ['d', 'n'] -- ambiguous aa 'b'
            codon 'ggn' translates to 'g'
            codon 'aam' translates to ['k', 'n'] -- ambiguous -- returning 'x'
            codon 'sar' translates to ['q', 'e'] -- ambiguous aa 'z'
            codon 'ccm' translates to 'p'

        """

        gm = ["GeneticCode.translate(), for codon '%s'" % theCodon]
        # print theCodon
        assert len(theCodon) == 3
        if verbose not in [0, 1, 2]:
            gm.append("Arg verbose should be one of 0, 1, or 2.")
            raise P4Error(gm)

        dnaEquateKeys = ['b', 'd', 'h', 'k', 'm', 'n', 's', 'r', 'w', 'v', 'y']
        equates = {'b': 'cgt', 'd': 'agt', 'h': 'act', 'k': 'gt', 'm': 'ac',
                   'n': 'acgt', 's': 'cg', 'r': 'ag', 'w': 'at', 'v': 'acg', 'y': 'ct'}
        # {'x': 'arndcqeghilkmfpstwyv', 'b': 'dn', 'z': 'eq'}

        # If its in self.code, its easy...
        ret = self.code.get(theCodon)
        if ret:
            if verbose >= 2:
                print("    codon '%s' translates to '%s'" % (theCodon, ret))
            return ret

        # Ok, so not easy.  Check for valid characters.
        if theCodon[0] not in 'acgtbdhkmnsrwvy':
            gm.append(
                "The first position of codon '%s' is not a lowercase DNA character." % theCodon)
            raise P4Error(gm)
        if theCodon[1] not in 'acgtbdhkmnsrwvy':
            gm.append(
                "The second position of codon '%s' is not a lowercase DNA character." % theCodon)
            raise P4Error(gm)
        if theCodon[2] not in 'acgtbdhkmnsrwvy':
            gm.append(
                "The third position of codon '%s' is not a lowercase DNA character." % theCodon)
            raise P4Error(gm)

        # Expand the ambiguities.  Eg 'gcy' becomes ['gcc', 'gct']
        expanded = []
        expanded2 = []
        dnaEquateKeys = 'bdhkmnsrwvy'
        equates = {'b': 'cgt', 'd': 'agt', 'h': 'act', 'k': 'gt', 'm': 'ac',
                   'n': 'acgt', 's': 'cg', 'r': 'ag', 'w': 'at', 'v': 'acg', 'y': 'ct'}
        bSet = set(['d', 'n'])
        zSet = set(['e', 'q'])

        if theCodon[0] in dnaEquateKeys:
            c = theCodon[0]
            vv = equates[c]
            for v in vv:
                expanded.append("%s%s%s" % (v, theCodon[1], theCodon[2]))
        else:
            expanded = [theCodon]
        if theCodon[1] in dnaEquateKeys:
            c = theCodon[1]
            vv = equates[c]
            for cd in expanded:
                for v in vv:
                    expanded2.append("%s%s%s" % (cd[0], v, cd[2]))
            expanded = expanded2
            expanded2 = []
        if theCodon[2] in dnaEquateKeys:
            c = theCodon[2]
            vv = equates[c]
            for cd in expanded:
                for v in vv:
                    expanded2.append("%s%s%s" % (cd[0], cd[1], v))
            expanded = expanded2

        # print expanded
        translations = []
        for cd in expanded:
            tr = self.code.get(cd)
            if not tr:
                gm.append("Could not translate expanded codon '%s'" % cd)
                raise P4Error(gm)
            translations.append(tr)
        # print translations
        if not translations:
            gm.append("Did not get any translations from expanded codons.")
            raise P4Error(gm)
        tSet = set(translations)
        if len(tSet) == 1:
            # Easy again
            if verbose >= 1:
                print("    codon '%s' translates to '%s'" % (theCodon, translations[0]))
            return translations[0]
        else:
            tList = list(tSet)

            if len(tSet) == 2:
                if tSet == bSet:
                    if verbose >= 1:
                        print("    codon '%s' translates to %s -- ambiguous aa 'b'" % (theCodon, tList))
                    return 'b'
                elif tSet == zSet:
                    if verbose >= 1:
                        print("    codon '%s' translates to %s -- ambiguous aa 'z'" % (theCodon, tList))
                    return 'z'
                else:
                    if verbose >= 1:
                        print("    codon '%s' translates to %s -- ambiguous -- returning 'x'" % (theCodon, tList))
                    return 'x'
            else:  # more than 2
                if verbose:
                    print("    codon '%s' translates to %s -- ambiguous -- returning 'x'" % (theCodon, tList))
                return 'x'
p4

Navigation

Related Topics

Source code for p4.geneticcode