Question

How can I extract information from tRNA encoding genes only (Python)?

0

Entering edit mode

3.1 years ago

amirah.rafique • 0

Hi! I'm trying to calculate the GC3 content of tRNA encoding genes only. I have the gbff annotated genomes and the tabular versions, an example of which is here: https://www.ncbi.nlm.nih.gov/genome/browse/#!/proteins/665/300274%7CBacillus%20subtilis%20subsp.%20subtilis%20str.%20168/

I have already written a script that can extracts each gene of the whole genome, but how would I modify this to just the tRNA encoding genes using the table? I'm new to Python so any explanation would be great!

My function that pulls out the individual genes is below:

def extractinfo(file_path):

with io.open(file_path, mode="r", encoding="utf-8") as file:
    # skips over the first line
    file.readline()
    # make an empty list
    SSL_list = list()
    # read through the whole file line by line
    for line in file.readlines():
        # separates by comma, pulls out specific headings as a list
        SSL = line.split(",")
        # convert the number string to an integer
        start = int(SSL[2])
        stop = int(SSL[3])
        # replace double quotes with nothing (use single quotes to differentiate)
        locustag = SSL[7].replace('"', "")
        ID = SSL[1].replace('"', "")
        # adds start, stop and locus tags to the end of the empty list every time it loops
        SSL_list.append((start, stop, locustag, ID))
    return SSL_list

def extract_seq(file_path):
with open(file_path) as file:
    genome_sequence = str()
    # True runs forever
    while True:
        # skips lines, if the line starts with origin, it stops, leaving us a line after
        line = file.readline()
        if line.startswith("ORIGIN"):
            break
    for line in file.readlines():
        # replace spaces with nothing
        line = line.replace(" ", "")
        # starts at 0, continues until the end of the line (genome sequence)
        for i in range(0, len(line)):
            # checks if chararacters are actg
            if line[i] in "actg":
                genome_sequence += (line[i])
return genome_sequence
end_file = "/Users/"

def extract_genes(SSL_list, genome_sequence):
for start, stop, locustag, ID in SSL_list:
    # extracts start:stop gene from the sequence
    gene_substring = genome_sequence[start:stop]
    # store in file
    with open(end_file + "/" + ID + "+" + locustag + ".txt", "w") as file:
        file.write(gene_substring)
work_dir = "/Users/"
for path in glob.glob(os.path.join(work_dir, "*.csv")):
SSL_list = extractinfo(path)
txt_file = path.replace(".csv", ".gbff")
sequences = extract_seq(txt_file)
extract_genes(SSL_list, sequences)
print(path)

Python gc3 script • 518 views

ADD COMMENT • link 3.1 years ago by amirah.rafique • 0