feat(IPVC-2276): skip accessions from gff that are not in the tx_info file, log missing acs to file (#18)

sptaylor · web-flow · commit 49e08e9b510d · 2024-04-05T12:07:06.000-07:00
diff --git a/etc/scripts/run-uta-build.sh b/etc/scripts/run-uta-build.sh
@@ -64,8 +64,12 @@ GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_as
 sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/gff.exonsets.gz" 2>&1 | \
   tee "$logs_dir/ncbi-parse-genomic-gff.log"
 
+sbin/filter_exonset_transcripts.py --tx-info "$loading_dir/gbff.txinfo.gz" --exonsets "$loading_dir/gff.exonsets.gz" \
+ --missing-ids "$loading_dir/filtered_tx_acs.txt" | gzip -c > "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \
+ tee "$logs_dir/filter_exonset_transcripts.log"
+
 # generate seqinfo files from exonsets
-sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \
+sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.filtered_exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \
   tee "$logs_dir/exonset-to-seqinfo.log"
 
 ### update the uta database
@@ -81,7 +85,7 @@ uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-txinfo "$loadi
   tee "$logs_dir/load-txinfo.log"
 
 # gff exon sets
-uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/gff.exonsets.gz" 2>&1 | \
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-exonset "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \
   tee "$logs_dir/load-exonsets.log"
 
 # align exons
diff --git a/sbin/filter_exonset_transcripts.py b/sbin/filter_exonset_transcripts.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+
+import argparse
+import csv
+import logging.config
+import sys
+
+import importlib_resources
+
+from uta.formats.exonset import ExonSetReader, ExonSetWriter
+from uta.formats.txinfo import TxInfoReader
+from uta.tools.file_utils import open_file
+
+logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf")
+logging.config.fileConfig(logging_conf_fn)
+logging.getLogger().setLevel(logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def filter_exonset(exonset_file, transcript_ids, missing_ids_file):
+    with open_file(exonset_file) as es_f, open(missing_ids_file, 'w') as missing_f:
+        exonsets = ExonSetReader(es_f)
+        esw = ExonSetWriter(sys.stdout)
+        writer_missing = csv.writer(missing_f)
+        missing_acs = set()
+
+        for exonset in exonsets:
+            if exonset.tx_ac in transcript_ids:
+                esw.write(exonset)
+            else:
+                logger.warning(f"Exon set transcript {exonset.tx_ac} not found in txinfo file. Filtering out.")
+                writer_missing.writerow([exonset.tx_ac])
+                missing_acs.add(exonset.tx_ac)
+    logger.info(f"Filtered out exon sets for {len(missing_acs)} transcript(s): {','.join(missing_acs)}")
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Filter exonset data.')
+    parser.add_argument('--tx-info', help='Path to the transcript info file')
+    parser.add_argument('--exonsets', help='Path to the exonset file')
+    parser.add_argument('--missing-ids', help='Path to the missing transcript ids file')
+    args = parser.parse_args()
+
+    with open_file(args.tx_info) as f:
+        tx_reader = TxInfoReader(f)
+        transcript_ids = {row.ac for row in tx_reader}
+    filter_exonset(args.exonsets, transcript_ids, args.missing_ids)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/sbin/ncbi_parse_genomic_gff.py b/sbin/ncbi_parse_genomic_gff.py
@@ -24,27 +24,16 @@
 """
 
 import argparse
-import gzip
 import logging.config
 import sys
 from collections import defaultdict
-from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import List, Optional
 
 import pkg_resources
 
 from uta.formats.exonset import ExonSet, ExonSetWriter
-
-
-@contextmanager
-def open_file(filename):
-    if filename.endswith(".gz"):
-        with gzip.open(filename, "rt") as f:
-            yield f
-    else:
-        with open(filename) as f:
-            yield f
+from uta.tools.file_utils import open_file
 
 
 @dataclass
@@ -115,7 +104,7 @@ def _get_exon_number_from_id(alignment_id: str) -> int:
     return int(alignment_id.split("-")[-1])
 
 
-def parse_gff_file(file_paths: List[str]) -> dict[str, List[GFFRecord]]:
+def parse_gff_files(file_paths: List[str]) -> dict[str, List[GFFRecord]]:
     tx_data = defaultdict(list)
     for file_path in file_paths:
         with open_file(file_path) as f:
@@ -152,7 +141,7 @@ def get_zero_based_exon_ranges(transcript_exons: List[GFFRecord]) -> str:
     gff_files = args.gff_files
     esw = ExonSetWriter(sys.stdout)
 
-    transcript_alignments = parse_gff_file(gff_files)
+    transcript_alignments = parse_gff_files(gff_files)
     logger.info(
         f"read {len(transcript_alignments)} transcript alignments from file(s): {', '.join(gff_files)}"
     )
diff --git a/src/uta/tools/file_utils.py b/src/uta/tools/file_utils.py
@@ -0,0 +1,12 @@
+import gzip
+from contextlib import contextmanager
+
+
+@contextmanager
+def open_file(filename):
+    if filename.endswith(".gz"):
+        with gzip.open(filename, "rt") as f:
+            yield f
+    else:
+        with open(filename) as f:
+            yield f
diff --git a/tests/test_filter_exonset_transcripts.py b/tests/test_filter_exonset_transcripts.py
@@ -0,0 +1,46 @@
+import contextlib
+import io
+import unittest
+from tempfile import NamedTemporaryFile
+from unittest.mock import patch
+
+from sbin.filter_exonset_transcripts import filter_exonset
+
+
+class TestFilterExonsetTranscripts(unittest.TestCase):
+
+    @patch('sbin.filter_exonset_transcripts.logger')
+    def test_filter_exonset(self, mock_logger):
+        # Test NR_046571.1 is filtered out
+        lines = [
+            "tx_ac\talt_ac\tmethod\tstrand\texons_se_i\n",
+            "NR_122113.1\tNC_000022.10\tsplign\t-1\t16192905,16193009;16190680,16190791;16189263,16189378;16189031,16189143;16187164,16187302;16186810,16186953;16162396,16162487;16150528,16151821\n",
+            "NR_133911.1\tNC_000022.10\tsplign\t1\t16157078,16157342;16164481,16164569;16171951,16172265\n",
+            "NR_046571.1\tNC_000022.10\tsplign\t1\t16274608,16275003;16276480,16277577\n"
+        ]
+        with NamedTemporaryFile(delete=False) as temp_exonsets:
+            with open(temp_exonsets.name, "wt") as f:
+                for line in lines:
+                    f.write(line)
+            temp_exonsets.seek(0)
+        missing_ids_file = NamedTemporaryFile()
+
+        transcript_ids = {"NR_122113.1", "NR_133911.1"}
+        stdout = io.StringIO()
+        with contextlib.redirect_stdout(stdout):
+            filter_exonset(temp_exonsets.name, transcript_ids, missing_ids_file.name)
+
+        # Assert the record for NR_046571.1 is filtered out
+        self.assertEqual(stdout.getvalue(), ''.join(lines[0:3]))
+
+        # Confirm filtered transcript is present in missing_ids_file
+        with open(missing_ids_file.name, 'r') as f:
+            contents = f.read()
+        self.assertEqual(contents, 'NR_046571.1\n')
+
+        mock_logger.warning.assert_called_with('Exon set transcript NR_046571.1 not found in txinfo file. Filtering out.')
+        mock_logger.info.assert_called_with('Filtered out exon sets for 1 transcript(s): NR_046571.1')
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/tests/test_ncbi_parse_genomic_gff.py b/tests/test_ncbi_parse_genomic_gff.py
@@ -7,7 +7,7 @@
 from sbin.ncbi_parse_genomic_gff import (
     get_zero_based_exon_ranges,
     GFFRecord,
-    parse_gff_file,
+    parse_gff_files,
     parse_gff_record,
 )
 
@@ -160,7 +160,7 @@ def test_parse_gff_record_raises_unparseable_id(self):
     def test_parse_gff_file(self):
         # Test parsing the entire uncompressed GFF file
         expected_result = {"rna-NR_046018.2": self.gff_records}
-        parsed_result = parse_gff_file([self.temp_gff.name])
+        parsed_result = parse_gff_files([self.temp_gff.name])
         self.assertEqual(parsed_result, expected_result)
 
     def test_parse_gff_file_accepts_gzipped_files(self):
@@ -171,7 +171,7 @@ def test_parse_gff_file_accepts_gzipped_files(self):
 
         # Test parsing the gzipped GFF file
         expected_result = {"rna-NR_046018.2": self.gff_records}
-        parsed_result = parse_gff_file([self.temp_gff.name + ".gz"])
+        parsed_result = parse_gff_files([self.temp_gff.name + ".gz"])
         self.assertEqual(parsed_result, expected_result)
 
     def test_get_zero_based_exon_ranges(self):