Entering edit mode
4.6 years ago
amirah.rafique
•
0
Hi! I'm trying to calculate the GC3 content of tRNA encoding genes only. I have the gbff annotated genomes and the tabular versions, an example of which is here: https://www.ncbi.nlm.nih.gov/genome/browse/#!/proteins/665/300274%7CBacillus%20subtilis%20subsp.%20subtilis%20str.%20168/
I have already written a script that can extracts each gene of the whole genome, but how would I modify this to just the tRNA encoding genes using the table? I'm new to Python so any explanation would be great!
My function that pulls out the individual genes is below:
def extractinfo(file_path):
with io.open(file_path, mode="r", encoding="utf-8") as file:
# skips over the first line
file.readline()
# make an empty list
SSL_list = list()
# read through the whole file line by line
for line in file.readlines():
# separates by comma, pulls out specific headings as a list
SSL = line.split(",")
# convert the number string to an integer
start = int(SSL[2])
stop = int(SSL[3])
# replace double quotes with nothing (use single quotes to differentiate)
locustag = SSL[7].replace('"', "")
ID = SSL[1].replace('"', "")
# adds start, stop and locus tags to the end of the empty list every time it loops
SSL_list.append((start, stop, locustag, ID))
return SSL_list
def extract_seq(file_path):
with open(file_path) as file:
genome_sequence = str()
# True runs forever
while True:
# skips lines, if the line starts with origin, it stops, leaving us a line after
line = file.readline()
if line.startswith("ORIGIN"):
break
for line in file.readlines():
# replace spaces with nothing
line = line.replace(" ", "")
# starts at 0, continues until the end of the line (genome sequence)
for i in range(0, len(line)):
# checks if chararacters are actg
if line[i] in "actg":
genome_sequence += (line[i])
return genome_sequence
end_file = "/Users/"
def extract_genes(SSL_list, genome_sequence):
for start, stop, locustag, ID in SSL_list:
# extracts start:stop gene from the sequence
gene_substring = genome_sequence[start:stop]
# store in file
with open(end_file + "/" + ID + "+" + locustag + ".txt", "w") as file:
file.write(gene_substring)
work_dir = "/Users/"
for path in glob.glob(os.path.join(work_dir, "*.csv")):
SSL_list = extractinfo(path)
txt_file = path.replace(".csv", ".gbff")
sequences = extract_seq(txt_file)
extract_genes(SSL_list, sequences)
print(path)