Zientzilaria

Covering the bioinformatics niche and much more

GenBank Parsing: Both Scripts

| Comments

Both scripts to parse GenBank scripts are below and included in the repository. First the one that extracts amino acid sequences, and then the latest to extract the DNA.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#! /usr/bin/env python

'''
input is a GenBank file. The script searches for gene annotations, extract all lines
from the file and then parses these lines in order to extract protein sequences
Ribosomal genes and other non-coding genes are not extracted - plan to do it later
output is a fasta formatted file
'''

import sys
import fasta

class Protein:
    '''
    class that stores protein information
    '''
    def __init__(self, gi, id, sequence):
        self.gi = gi
        self.id = id
        self.sequence = sequence

def parse_entry(gene_data):
    '''
    parses the entry received from the main function
    in order to extract information as protein id
    gi, etc
    '''
    prot_id = ''
    sequence = ''
    gi_id = ''
    gene_data = gene_data.splitlines()
    for line in gene_data:
        if line.find('/product') >=0:
            prot_id = line[line.find('=') + 2:-1]
        elif line.find('protein_id') >= 0:
            prot_id += '\t' + line[line.find('=') + 2: -1]
        elif line.find('GI:') >= 0:
            gi_id = 'gi' + line[line.find('GI:')+3:-1]
        elif line.find('/translation') >= 0:
            sequence = line[line.find('=') + 2:]
            temp = gene_data.index(line)
            for i in range(temp+1, len(gene_data)):
                if gene_data[i].find('sig_peptide') >= 0:
                    break
                else:
                    sequence += gene_data[i].strip()

    return Protein(gi_id, prot_id, sequence)

#only input is a genbank file
gbfile = open(sys.argv[1])

proteins = []
index = 0
entry = ''
for line in gbfile:
    if line.find('  gene ') >= 0:
        if index >= 1:
            #parses the CDS and appends to a list
            proteins.append(parse_entry(entry))
            entry = ''
        index += 1
        entry += line
    elif line.find('ORIGIN') >= 0:
        #found the DNA sequence, we can stop now
        break
    else:
        entry += line

#parses the last entry after leaving the loop
proteins.append(parse_entry(entry))

#output
for i in proteins:
    if len(i.gi) > 2:
        print i.gi, i.id
        output = open(i.gi + '.fasta', 'w')
        output.write('>' + i.gi + '\t' + i.id + '\n')
        i.sequence = i.sequence.replace('\"', '')
        output.write(fasta.format_output(i.sequence, 80))
        print i.id
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#! /usr/bin/env python

'''
script that extracts sequences from a GenBank file. Script reads the gene CDS from the file and
builds a list of start and end positions, and if gene is complement outputs the 5'3' sequence and
its reverse complement
only input is a GenBank file
outputs a fasta file with the GI ID as its name.
'''

import sys
import fasta

class CDSinfo:
    '''
    CDSinf class to store all the information from CDS
    '''
    def __init__(self, gi_id, id, start, end, complement):
        self.gi_id = gi_id
        self.id = id
        self.start = start
        self.end = end
        self.complement = complement

def parse_entry(gene_data):
    '''
    each CDS entry is obtained in the main function and a string of lines with
    information is passed to parse_entry to be parsed and have information extracted
    '''

    gene_data = gene_data.splitlines() #changes a string to list, splitting at line ends
    start, end = 0, 0
    gi_id = ''
    id = ''
    complement = False
    for line in gene_data: #searches for regions annotated as CDS
        if line.find('  CDS  ') >=0:
            temp = line.split()
            #checks for complement sequence, if true remove extra characters
            if temp[1].find('complement') >= 0:
                complement = True
                temp[1] = temp[1].replace('complement(', '')
                temp[1] = temp[1].replace(')', '')
            temp2 = temp[1].split('..')
            start = temp2[0]
            end = temp2[1]
        #checks for GI IDs
        elif line.find('GI:') >= 0:
            gi_id = 'gi' + line[line.find('GI:')+3:-1]
        #get the gene name/function
        elif line.find('/product') >=0:
            id = line[line.find('=') + 2:-1]
        #and adds the protein id
        elif line.find('protein_id') >= 0:
            id += '\t' + line[line.find('=') + 2: -1]

    return CDSinfo(gi_id, id, start, end, complement)

#only input is the genbank file with annotation and sequence
gbfile = open(sys.argv[1])

index = 0
entry = ''
sequence = []
is_seq = False

genes = []
for line in gbfile:
    #reads the genbank file and whenever finds a gene annotation
    #concatenate the lines up to the next gene
    if line.find('  gene ') >= 0:
        #if an entry is complete, send it to parse
        if index >= 1:
            #appends to a list of CDSinfo objects
            genes.append(parse_entry(entry))
            entry = ''
        index += 1
        entry += line
    elif line.find('ORIGIN') >= 0:
        #found sequence start, set the flag on and parses the last entry
        is_seq = True
        genes.append(parse_entry(entry))
    elif is_seq == True:
        #if flag is true keep going, usually sequences are store at the end of the file
        line = line.split()
        sequence.append(line)
    else:
        #this is an entry so append line
        entry += line

str_seq = ''
#make the sequence a string
for i in sequence:
    str_seq += ''.join(i[1:]).upper()

for i in genes:
    if len(i.gi_id) > 2:
        print i.id, i.start, i.end
        output = open(i.gi_id + '.DNA.fasta', 'w')
        output.write('>' + i.gi_id + '\t' + i.id + '\n')
        # if this is a complement, print both 5'-3' and reverse complement sequences
        if i.complement == True:
            output.write(fasta.format_output(fasta.invert(str_seq[int(i.start)-1:int(i.end)]), 80) + '\n')
        else:
            if not i.start.find('join') >= 0:
                output.write(fasta.format_output(str_seq[int(i.start)-1:int(i.end)], 80))

Checking the DNA extract script carefully, we notice that we have an extra function in the fasta module, invert. Update: A small change to the script is that split('\n') can be replaced by splitlines() which is a built-in method for strings. This hasn’t been added yet to the repository version of the file and an explanation on its use will be added shortly.