Skip to content

Commit 49e08e9

Browse files
authored
feat(IPVC-2276): skip accessions from gff that are not in the tx_info file, log missing acs to file (#18)
1 parent 2111c42 commit 49e08e9

File tree

6 files changed

+121
-19
lines changed

6 files changed

+121
-19
lines changed

etc/scripts/run-uta-build.sh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,8 +64,12 @@ GFF_files=$(ls $ncbi_dir/genomes/refseq/vertebrate_mammalian/Homo_sapiens/all_as
6464
sbin/ncbi_parse_genomic_gff.py "$GFF_files" | gzip -c > "$loading_dir/gff.exonsets.gz" 2>&1 | \
6565
tee "$logs_dir/ncbi-parse-genomic-gff.log"
6666

67+
sbin/filter_exonset_transcripts.py --tx-info "$loading_dir/gbff.txinfo.gz" --exonsets "$loading_dir/gff.exonsets.gz" \
68+
--missing-ids "$loading_dir/filtered_tx_acs.txt" | gzip -c > "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \
69+
tee "$logs_dir/filter_exonset_transcripts.log"
70+
6771
# generate seqinfo files from exonsets
68-
sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \
72+
sbin/exonset-to-seqinfo -o NCBI "$loading_dir/gff.filtered_exonsets.gz" | gzip -c > "$loading_dir/seqinfo.gz" 2>&1 | \
6973
tee "$logs_dir/exonset-to-seqinfo.log"
7074

7175
### update the uta database
@@ -81,7 +85,7 @@ uta --conf=etc/global.conf --conf=etc/[email protected] load-txinfo "$loadi
8185
tee "$logs_dir/load-txinfo.log"
8286

8387
# gff exon sets
84-
uta --conf=etc/global.conf --conf=etc/[email protected] load-exonset "$loading_dir/gff.exonsets.gz" 2>&1 | \
88+
uta --conf=etc/global.conf --conf=etc/[email protected] load-exonset "$loading_dir/gff.filtered_exonsets.gz" 2>&1 | \
8589
tee "$logs_dir/load-exonsets.log"
8690

8791
# align exons

sbin/filter_exonset_transcripts.py

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
#!/usr/bin/env python
2+
3+
import argparse
4+
import csv
5+
import logging.config
6+
import sys
7+
8+
import importlib_resources
9+
10+
from uta.formats.exonset import ExonSetReader, ExonSetWriter
11+
from uta.formats.txinfo import TxInfoReader
12+
from uta.tools.file_utils import open_file
13+
14+
logging_conf_fn = importlib_resources.files("uta").joinpath("etc/logging.conf")
15+
logging.config.fileConfig(logging_conf_fn)
16+
logging.getLogger().setLevel(logging.INFO)
17+
logger = logging.getLogger(__name__)
18+
19+
20+
def filter_exonset(exonset_file, transcript_ids, missing_ids_file):
21+
with open_file(exonset_file) as es_f, open(missing_ids_file, 'w') as missing_f:
22+
exonsets = ExonSetReader(es_f)
23+
esw = ExonSetWriter(sys.stdout)
24+
writer_missing = csv.writer(missing_f)
25+
missing_acs = set()
26+
27+
for exonset in exonsets:
28+
if exonset.tx_ac in transcript_ids:
29+
esw.write(exonset)
30+
else:
31+
logger.warning(f"Exon set transcript {exonset.tx_ac} not found in txinfo file. Filtering out.")
32+
writer_missing.writerow([exonset.tx_ac])
33+
missing_acs.add(exonset.tx_ac)
34+
logger.info(f"Filtered out exon sets for {len(missing_acs)} transcript(s): {','.join(missing_acs)}")
35+
36+
37+
def main():
38+
parser = argparse.ArgumentParser(description='Filter exonset data.')
39+
parser.add_argument('--tx-info', help='Path to the transcript info file')
40+
parser.add_argument('--exonsets', help='Path to the exonset file')
41+
parser.add_argument('--missing-ids', help='Path to the missing transcript ids file')
42+
args = parser.parse_args()
43+
44+
with open_file(args.tx_info) as f:
45+
tx_reader = TxInfoReader(f)
46+
transcript_ids = {row.ac for row in tx_reader}
47+
filter_exonset(args.exonsets, transcript_ids, args.missing_ids)
48+
49+
50+
if __name__ == '__main__':
51+
main()

sbin/ncbi_parse_genomic_gff.py

Lines changed: 3 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -24,27 +24,16 @@
2424
"""
2525

2626
import argparse
27-
import gzip
2827
import logging.config
2928
import sys
3029
from collections import defaultdict
31-
from contextlib import contextmanager
3230
from dataclasses import dataclass
3331
from typing import List, Optional
3432

3533
import pkg_resources
3634

3735
from uta.formats.exonset import ExonSet, ExonSetWriter
38-
39-
40-
@contextmanager
41-
def open_file(filename):
42-
if filename.endswith(".gz"):
43-
with gzip.open(filename, "rt") as f:
44-
yield f
45-
else:
46-
with open(filename) as f:
47-
yield f
36+
from uta.tools.file_utils import open_file
4837

4938

5039
@dataclass
@@ -115,7 +104,7 @@ def _get_exon_number_from_id(alignment_id: str) -> int:
115104
return int(alignment_id.split("-")[-1])
116105

117106

118-
def parse_gff_file(file_paths: List[str]) -> dict[str, List[GFFRecord]]:
107+
def parse_gff_files(file_paths: List[str]) -> dict[str, List[GFFRecord]]:
119108
tx_data = defaultdict(list)
120109
for file_path in file_paths:
121110
with open_file(file_path) as f:
@@ -152,7 +141,7 @@ def get_zero_based_exon_ranges(transcript_exons: List[GFFRecord]) -> str:
152141
gff_files = args.gff_files
153142
esw = ExonSetWriter(sys.stdout)
154143

155-
transcript_alignments = parse_gff_file(gff_files)
144+
transcript_alignments = parse_gff_files(gff_files)
156145
logger.info(
157146
f"read {len(transcript_alignments)} transcript alignments from file(s): {', '.join(gff_files)}"
158147
)

src/uta/tools/file_utils.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import gzip
2+
from contextlib import contextmanager
3+
4+
5+
@contextmanager
6+
def open_file(filename):
7+
if filename.endswith(".gz"):
8+
with gzip.open(filename, "rt") as f:
9+
yield f
10+
else:
11+
with open(filename) as f:
12+
yield f
Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
import contextlib
2+
import io
3+
import unittest
4+
from tempfile import NamedTemporaryFile
5+
from unittest.mock import patch
6+
7+
from sbin.filter_exonset_transcripts import filter_exonset
8+
9+
10+
class TestFilterExonsetTranscripts(unittest.TestCase):
11+
12+
@patch('sbin.filter_exonset_transcripts.logger')
13+
def test_filter_exonset(self, mock_logger):
14+
# Test NR_046571.1 is filtered out
15+
lines = [
16+
"tx_ac\talt_ac\tmethod\tstrand\texons_se_i\n",
17+
"NR_122113.1\tNC_000022.10\tsplign\t-1\t16192905,16193009;16190680,16190791;16189263,16189378;16189031,16189143;16187164,16187302;16186810,16186953;16162396,16162487;16150528,16151821\n",
18+
"NR_133911.1\tNC_000022.10\tsplign\t1\t16157078,16157342;16164481,16164569;16171951,16172265\n",
19+
"NR_046571.1\tNC_000022.10\tsplign\t1\t16274608,16275003;16276480,16277577\n"
20+
]
21+
with NamedTemporaryFile(delete=False) as temp_exonsets:
22+
with open(temp_exonsets.name, "wt") as f:
23+
for line in lines:
24+
f.write(line)
25+
temp_exonsets.seek(0)
26+
missing_ids_file = NamedTemporaryFile()
27+
28+
transcript_ids = {"NR_122113.1", "NR_133911.1"}
29+
stdout = io.StringIO()
30+
with contextlib.redirect_stdout(stdout):
31+
filter_exonset(temp_exonsets.name, transcript_ids, missing_ids_file.name)
32+
33+
# Assert the record for NR_046571.1 is filtered out
34+
self.assertEqual(stdout.getvalue(), ''.join(lines[0:3]))
35+
36+
# Confirm filtered transcript is present in missing_ids_file
37+
with open(missing_ids_file.name, 'r') as f:
38+
contents = f.read()
39+
self.assertEqual(contents, 'NR_046571.1\n')
40+
41+
mock_logger.warning.assert_called_with('Exon set transcript NR_046571.1 not found in txinfo file. Filtering out.')
42+
mock_logger.info.assert_called_with('Filtered out exon sets for 1 transcript(s): NR_046571.1')
43+
44+
45+
if __name__ == "__main__":
46+
unittest.main()

tests/test_ncbi_parse_genomic_gff.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
from sbin.ncbi_parse_genomic_gff import (
88
get_zero_based_exon_ranges,
99
GFFRecord,
10-
parse_gff_file,
10+
parse_gff_files,
1111
parse_gff_record,
1212
)
1313

@@ -160,7 +160,7 @@ def test_parse_gff_record_raises_unparseable_id(self):
160160
def test_parse_gff_file(self):
161161
# Test parsing the entire uncompressed GFF file
162162
expected_result = {"rna-NR_046018.2": self.gff_records}
163-
parsed_result = parse_gff_file([self.temp_gff.name])
163+
parsed_result = parse_gff_files([self.temp_gff.name])
164164
self.assertEqual(parsed_result, expected_result)
165165

166166
def test_parse_gff_file_accepts_gzipped_files(self):
@@ -171,7 +171,7 @@ def test_parse_gff_file_accepts_gzipped_files(self):
171171

172172
# Test parsing the gzipped GFF file
173173
expected_result = {"rna-NR_046018.2": self.gff_records}
174-
parsed_result = parse_gff_file([self.temp_gff.name + ".gz"])
174+
parsed_result = parse_gff_files([self.temp_gff.name + ".gz"])
175175
self.assertEqual(parsed_result, expected_result)
176176

177177
def test_get_zero_based_exon_ranges(self):

0 commit comments

Comments
 (0)