If Prodigal and GeneMark predict the same gene, but it gets shortened in two...

If Prodigal and GeneMark predict the same gene, but it gets shortened in two different placed, only the shortened GeneMark gene is kept.

If Prodigal and GeneMark predict the same gene, but it gets shortened in two...
b6f640d4 · Marcel Huntemann · bf707ec3 · b6f640d4 · b6f640d4 · b6f640d4
Commit b6f640d4 authored 3 years ago by Marcel Huntemann
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
 # Changelog
 This document provides info about every changed aspect of the IMG Annotation Pipeline, since switching to the new production version that unifies isolate and metagenome processing (v5.0.0).

+## [5.0.25] - 2021-09-10
+### Changed
+- If Prodigal and GeneMark predict the same gene, but it gets shortened in two different placed, only the shortened GeneMark gene is kept.
+
 ## [5.0.24] - 2021-08-20
 ### Changed
 - The filtering script for the LAST results now makes an addional run over the output to check which subjects are actually needed in the MD5 lookup.

--- a/VERSION
+++ b/VERSION
-5.0.24
+5.0.25
--- a/bin/structural_annotation/fasta_files_merger.py
+++ b/bin/structural_annotation/fasta_files_merger.py
@@ -26,6 +26,7 @@ allowed_start_codons = frozenset(["ATG", "GTG", "TTG"])
 """Parse the CDS gene IDs out of the given gff file."""
 gene_to_start_type = {}
 shortened_genes = {}
+genes_to_ignore = set()
 fr = open(args.gff, "r")
 for line in fr:
    if not line or line[0] == "#" or line == "\n":
@@ -37,13 +38,26 @@ for line in fr:
    gene_id = fields[-1].split(";", 1)[0].split("=", 1)[1]
    if "shortened" in fields[-1]:
        gene_id_parts = gene_id.rsplit("_", 2)
-        shortened_info = fields[-1].split("shortened=", 1)[1].split(";", 1)[0]
+        shortened_info = fields[-1].split("shortened=", 1)[1].split(";", 1)[0].rstrip()
        pos, coord = shortened_info.split()[1:]
        if pos == "start":
            gene_id_parts[1] = coord
        else:
            gene_id_parts[2] = coord
        original_gene_id = "_".join(gene_id_parts)
+        # If a gene that got predicted by both methods gets shortened in a different
+        # place (start, end) for each method, keep only the GeneMark partial gene.
+        if original_gene_id in shortened_genes:
+            if "Prodigal" in fields[1]:
+                # print(f"Ignoring shortened Prodigal gene {gene_id}, since GeneMark shortened the same " +
+                #       f"original gene {original_gene_id} as well.", file = sys.stderr)
+                # print('Adding to ignore: ' + gene_id)
+                genes_to_ignore.add(gene_id)
+                continue
+            else:
+                prodigal_gene_id = shortened_genes[original_gene_id].split("\t")[0]
+                # print('Adding to ignore: ' + prodigal_gene_id)
+                genes_to_ignore.add(prodigal_gene_id)
        shortened_genes[original_gene_id] = gene_id + "\t" + shortened_info
 #        print("Found shortened info '" + shortened_info + "' for gene " +
 #              gene_id + " (original: " + original_gene_id + ")",
@@ -159,8 +173,9 @@ if suffix == "fna":
        if fields[2] == "CDS":
            attributes = fields[-1].rstrip().split(";")
            gene_id = attributes[0][3:].strip()
-            attributes.append("start_type=" +
-                                    gene_to_start_type[gene_id])
+            if gene_id in genes_to_ignore:
+                continue
+            attributes.append(f"start_type={gene_to_start_type[gene_id]}")
            fields[-1] = ";".join(attributes)
            line = "\t".join(fields) + "\n"
        fw.write(line)