Skip to content
Snippets Groups Projects
Commit b6f640d4 authored by Marcel Huntemann's avatar Marcel Huntemann
Browse files

If Prodigal and GeneMark predict the same gene, but it gets shortened in two...

If Prodigal and GeneMark predict the same gene, but it gets shortened in two different placed, only the shortened GeneMark gene is kept.
parent bf707ec3
Branches
Tags v5.0.25
No related merge requests found
# Changelog
This document provides info about every changed aspect of the IMG Annotation Pipeline, since switching to the new production version that unifies isolate and metagenome processing (v5.0.0).
## [5.0.25] - 2021-09-10
### Changed
- If Prodigal and GeneMark predict the same gene, but it gets shortened in two different placed, only the shortened GeneMark gene is kept.
## [5.0.24] - 2021-08-20
### Changed
- The filtering script for the LAST results now makes an addional run over the output to check which subjects are actually needed in the MD5 lookup.
......
5.0.24
5.0.25
......@@ -26,6 +26,7 @@ allowed_start_codons = frozenset(["ATG", "GTG", "TTG"])
"""Parse the CDS gene IDs out of the given gff file."""
gene_to_start_type = {}
shortened_genes = {}
genes_to_ignore = set()
fr = open(args.gff, "r")
for line in fr:
if not line or line[0] == "#" or line == "\n":
......@@ -37,13 +38,26 @@ for line in fr:
gene_id = fields[-1].split(";", 1)[0].split("=", 1)[1]
if "shortened" in fields[-1]:
gene_id_parts = gene_id.rsplit("_", 2)
shortened_info = fields[-1].split("shortened=", 1)[1].split(";", 1)[0]
shortened_info = fields[-1].split("shortened=", 1)[1].split(";", 1)[0].rstrip()
pos, coord = shortened_info.split()[1:]
if pos == "start":
gene_id_parts[1] = coord
else:
gene_id_parts[2] = coord
original_gene_id = "_".join(gene_id_parts)
# If a gene that got predicted by both methods gets shortened in a different
# place (start, end) for each method, keep only the GeneMark partial gene.
if original_gene_id in shortened_genes:
if "Prodigal" in fields[1]:
# print(f"Ignoring shortened Prodigal gene {gene_id}, since GeneMark shortened the same " +
# f"original gene {original_gene_id} as well.", file = sys.stderr)
# print('Adding to ignore: ' + gene_id)
genes_to_ignore.add(gene_id)
continue
else:
prodigal_gene_id = shortened_genes[original_gene_id].split("\t")[0]
# print('Adding to ignore: ' + prodigal_gene_id)
genes_to_ignore.add(prodigal_gene_id)
shortened_genes[original_gene_id] = gene_id + "\t" + shortened_info
# print("Found shortened info '" + shortened_info + "' for gene " +
# gene_id + " (original: " + original_gene_id + ")",
......@@ -159,8 +173,9 @@ if suffix == "fna":
if fields[2] == "CDS":
attributes = fields[-1].rstrip().split(";")
gene_id = attributes[0][3:].strip()
attributes.append("start_type=" +
gene_to_start_type[gene_id])
if gene_id in genes_to_ignore:
continue
attributes.append(f"start_type={gene_to_start_type[gene_id]}")
fields[-1] = ";".join(attributes)
line = "\t".join(fields) + "\n"
fw.write(line)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment