import string
import sys
[docs]
class GeneticCode:
"""A container for NCBI translation tables.
See the ncbi translation tables, which this week are at
http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c
(If they move, poke around the 'taxonomy browser' area.)
This week we have
- **1** standard
- **2** vertebrate mito
- **3** yeast mito
- **4** Mold, Protozoan
- **5** invertbrate mito
- **6** The Ciliate, Dasycladacean and Hexamita Nuclear Code
- **9** echinoderm and flatworm mito
- **10** Euplotid Nuclear Code
- **11** Bacterial and Plant Plastid Code
- **12** Alternative Yeast Nuclear Code
- **13** Ascidian Mitochondrial Code
- **14** Alternative Flatworm Mitochondrial Code
- **21** Trematode Mitochondrial Code
- **24** Pterobranchia mito
If more transl_tables are needed, you should be able to just drop
them in, with a little tweaking.
This provides
- **code** A dictionary. So you can ask for eg myGC.code['ggg']
- **codonsForAA** Another dictionary, where you can ask for eg myGC.codonsForAA['v']
- **startList** A list of start codons
**Methods**
.. autosummary::
GeneticCode.translate
GeneticCode.wise2Table
Wikipedia says: The joint nomenclature committee of the
IUPAC/IUBMB has officially recommended the three-letter symbol Sec
and the one-letter symbol U for selenocysteine. The UGA codon is
made to encode selenocysteine by the presence of a SECIS element
(SElenoCysteine Insertion Sequence) in the mRNA.
"""
def __init__(self, transl_table=1):
self.transl_table = transl_table
self.code = {}
self.codonsForAA = {}
self.startList = []
if transl_table == 1: # standard
AAs = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
Starts = '---M---------------M---------------M----------------------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
elif transl_table == 2: # vertebrate mito
AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG'
Starts = '--------------------------------MMMM---------------M------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
# 3. The Yeast Mitochondrial Code (transl_table=3)
elif transl_table == 3:
AAs = 'FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
Starts = '----------------------------------MM----------------------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
elif transl_table == 4: # Mold, Protozoan,
# and Coelenterate Mitochondrial Code and the
# Mycoplasma/Spiroplasma Code
AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
Starts = '--MM---------------M------------MMMM---------------M------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
elif transl_table == 5: # invertebrate mito
AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG'
Starts = '---M----------------------------MMMM---------------M------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
# The Ciliate, Dasycladacean and Hexamita Nuclear Code (transl_table=6)
elif transl_table == 6:
AAs = 'FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
Starts = '-----------------------------------M----------------------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
# tables 7 and 8 have been deleted from NCBI.
elif transl_table == 9: # echinoderm and flatworm mito
AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG'
Starts = '-----------------------------------M----------------------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
elif transl_table == 10: # The Euplotid Nuclear Code (transl_table=10)
AAs = 'FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
Starts = '-----------------------------------M----------------------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
# The Bacterial and Plant Plastid Code (transl_table=11)
elif transl_table == 11:
AAs = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
Starts = '---M---------------M------------MMMM---------------M------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
# The Alternative Yeast Nuclear Code (transl_table=12)
elif transl_table == 12:
AAs = 'FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'
Starts = '-------------------M---------------M----------------------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
# The Ascidian Mitochondrial Code (transl_table=13)
elif transl_table == 13:
AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG'
Starts = '---M------------------------------MM---------------M------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
# The Alternative Flatworm Mitochondrial Code (transl_table=14)
elif transl_table == 14:
AAs = 'FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG'
Starts = '-----------------------------------M----------------------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
# Trematode Mitochondrial Code (transl_table=21)
elif transl_table == 21:
AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG'
Starts = '-----------------------------------M---------------M------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
# Pterobranchia mitochondrial code (transl_table=24)
elif transl_table == 24:
AAs = 'FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSSKVVVVAAAADDEEGGGG'
Starts = '---M---------------M---------------M---------------M------------'
Base1 = 'TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG'
Base2 = 'TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG'
Base3 = 'TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG'
else:
print("GeneticCode: I don't know transl_table %i. Get it from NCBI and add it!" % transl_table)
sys.exit()
for i in range(64):
theCodon = Base1[i] + Base2[i] + Base3[i]
theCodon = theCodon.lower()
theAA = AAs[i].lower()
self.code[theCodon] = theAA
if theAA in self.codonsForAA:
self.codonsForAA[theAA].append(theCodon)
else:
self.codonsForAA[theAA] = [theCodon]
if Starts[i] == 'M':
self.startList.append(theCodon)
if 1:
self.codonsForAA['b'] = self.codonsForAA[
'n'] + self.codonsForAA['d']
self.codonsForAA['z'] = self.codonsForAA[
'q'] + self.codonsForAA['e']
if 1:
self.codonsForAA['u'] = ['tga'] # selenocysteine
if 0:
k = self.code.keys()
k.sort()
for aKey in k:
print("%20s %-30s" % (aKey, self.code[aKey]))
[docs]
def wise2Table(self):
"""Output in a form suitable to replace ``codon.table`` in ``genewise`` in Wise2.
See the `Wise2 web site. <http://www.ebi.ac.uk/Tools/Wise2/doc_wise2.html>`_
By default, ``genewise`` from the Wise2 package uses the
standard genetic code, defined in the file ``codon.table``.
However, you can supply your own table, and you can use this
method to make a suitable file. Due to lazy programming, this
method prints to stdout, so you will need to put the output in
a file yourself. Put that file, suitably named (eg
``codon.table5`` or whatever) in the ``wisecfg`` directory
(where the original ``codon.table`` resides), which might be
``/usr/local/src/wise2.2.0/wisecfg`` or some such location.
Then, when you call ``genewise`` you can use the ``-codon``
option to set the codon table file that you want to use, eg::
genewise -genes -cdna -trans -pep -pretty -silent -codon codon.table5 guideFileName dnaFileName
"""
print("! this is a codon table")
print("! by p4, for ncbi %i" % self.transl_table)
for first in "tcag":
for second in "tcag":
for third in "tcag":
lcod = "%s%s%s" % (first, second, third)
ret = self.code[lcod]
if ret == '*':
ret = 'X'
print(lcod.upper(), ret.upper())
[docs]
def translate(self, theCodon, verbose=1):
"""Translate a codon, handling ambiguities.
This method will translate a codon, depending of course on the
transl_table (which is specific to *self*), correctly handling
ambiguities appropriate to the transl_table. It does not give
info about whether the codon is potentially a start codon.
This method is used by the methods
:meth:`Alignment.Alignment.translate` and
:meth:`Alignment.Alignment.checkTranslation`.
A translation like that from codon ``ggg`` to amino acid ``g``
is direct and easy. However, this method will also translate
ambiguous codons ``ggy`` or ``ggs`` (and so on) to ``g``,
unequivocally, because the four codons that start with ``gg``
all code for ``g`` (which appears to be true for all
translation tables, but that is not a requirement here).
If all possible disambiguations of an ambiguous codon code for
a particular amino acid then this method will return that
amino acid; otherwise the translation is ambiguous and this
method returns ``x``.
So for example the codon ``tgr`` will translate to ``x`` using
transl_table = 1 (standard) because ``tga`` is a stop codon
and ``tgg`` codes for ``w``. However, using transl_table = 2
(vertebrate mito), codon ``tgr`` will translate to ``w``
because both ``tga`` and ``tgg`` code for ``w``::
>>> gc = GeneticCode(transl_table=1)
>>> gc.translate('tgr')
# codon 'tgr' translates to ['*', 'w'] -- ambiguous -- returning 'x'
>>> gc = GeneticCode(transl_table=2)
>>> gc.translate('tgr')
# codon 'tgr' translates to 'w'
Exceptions to the latter rule are codons that ambiguously code
for either ``d`` or ``n``, which return ambiguous amino acid
``b``, and codons that ambigously code for either ``q`` or
``e``, which return ambiguous amino acid ``z``. See the
example below.
If arg *verbose* is 0, it does not speak (except for errors,
of course). If its 1, it speaks for ambiguous translations.
If its 2, it speaks for all translations. The default is 1::
gc = GeneticCode(transl_table=1)
for cdn in 'gga ggy ray ggn aam sar ccm'.split():
gc.translate(cdn, verbose=2)
prints::
codon 'gga' translates to 'g'
codon 'ggy' translates to 'g'
codon 'ray' translates to ['d', 'n'] -- ambiguous aa 'b'
codon 'ggn' translates to 'g'
codon 'aam' translates to ['k', 'n'] -- ambiguous -- returning 'x'
codon 'sar' translates to ['q', 'e'] -- ambiguous aa 'z'
codon 'ccm' translates to 'p'
"""
gm = ["GeneticCode.translate(), for codon '%s'" % theCodon]
# print theCodon
assert len(theCodon) == 3
if verbose not in [0, 1, 2]:
gm.append("Arg verbose should be one of 0, 1, or 2.")
raise P4Error(gm)
dnaEquateKeys = ['b', 'd', 'h', 'k', 'm', 'n', 's', 'r', 'w', 'v', 'y']
equates = {'b': 'cgt', 'd': 'agt', 'h': 'act', 'k': 'gt', 'm': 'ac',
'n': 'acgt', 's': 'cg', 'r': 'ag', 'w': 'at', 'v': 'acg', 'y': 'ct'}
# {'x': 'arndcqeghilkmfpstwyv', 'b': 'dn', 'z': 'eq'}
# If its in self.code, its easy...
ret = self.code.get(theCodon)
if ret:
if verbose >= 2:
print(" codon '%s' translates to '%s'" % (theCodon, ret))
return ret
# Ok, so not easy. Check for valid characters.
if theCodon[0] not in 'acgtbdhkmnsrwvy':
gm.append(
"The first position of codon '%s' is not a lowercase DNA character." % theCodon)
raise P4Error(gm)
if theCodon[1] not in 'acgtbdhkmnsrwvy':
gm.append(
"The second position of codon '%s' is not a lowercase DNA character." % theCodon)
raise P4Error(gm)
if theCodon[2] not in 'acgtbdhkmnsrwvy':
gm.append(
"The third position of codon '%s' is not a lowercase DNA character." % theCodon)
raise P4Error(gm)
# Expand the ambiguities. Eg 'gcy' becomes ['gcc', 'gct']
expanded = []
expanded2 = []
dnaEquateKeys = 'bdhkmnsrwvy'
equates = {'b': 'cgt', 'd': 'agt', 'h': 'act', 'k': 'gt', 'm': 'ac',
'n': 'acgt', 's': 'cg', 'r': 'ag', 'w': 'at', 'v': 'acg', 'y': 'ct'}
bSet = set(['d', 'n'])
zSet = set(['e', 'q'])
if theCodon[0] in dnaEquateKeys:
c = theCodon[0]
vv = equates[c]
for v in vv:
expanded.append("%s%s%s" % (v, theCodon[1], theCodon[2]))
else:
expanded = [theCodon]
if theCodon[1] in dnaEquateKeys:
c = theCodon[1]
vv = equates[c]
for cd in expanded:
for v in vv:
expanded2.append("%s%s%s" % (cd[0], v, cd[2]))
expanded = expanded2
expanded2 = []
if theCodon[2] in dnaEquateKeys:
c = theCodon[2]
vv = equates[c]
for cd in expanded:
for v in vv:
expanded2.append("%s%s%s" % (cd[0], cd[1], v))
expanded = expanded2
# print expanded
translations = []
for cd in expanded:
tr = self.code.get(cd)
if not tr:
gm.append("Could not translate expanded codon '%s'" % cd)
raise P4Error(gm)
translations.append(tr)
# print translations
if not translations:
gm.append("Did not get any translations from expanded codons.")
raise P4Error(gm)
tSet = set(translations)
if len(tSet) == 1:
# Easy again
if verbose >= 1:
print(" codon '%s' translates to '%s'" % (theCodon, translations[0]))
return translations[0]
else:
tList = list(tSet)
if len(tSet) == 2:
if tSet == bSet:
if verbose >= 1:
print(" codon '%s' translates to %s -- ambiguous aa 'b'" % (theCodon, tList))
return 'b'
elif tSet == zSet:
if verbose >= 1:
print(" codon '%s' translates to %s -- ambiguous aa 'z'" % (theCodon, tList))
return 'z'
else:
if verbose >= 1:
print(" codon '%s' translates to %s -- ambiguous -- returning 'x'" % (theCodon, tList))
return 'x'
else: # more than 2
if verbose:
print(" codon '%s' translates to %s -- ambiguous -- returning 'x'" % (theCodon, tList))
return 'x'