Skip to content

Commit 93e79f7

Browse files
authored
feat(IPVC-2408): Mito processing updates and bug fixes (#31)
1 parent 875e428 commit 93e79f7

File tree

7 files changed

+48
-24
lines changed

7 files changed

+48
-24
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -347,14 +347,14 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
347347
docker compose run ncbi-download
348348
docker compose run uta-extract
349349
docker compose run seqrepo-load
350-
UTA_ETL_SKIP_GENE_LOAD=false docker compose run uta-load
350+
docker compose run uta-load
351351
```
352352

353353
#### 2B. Mitochondrial transcripts
354354
```
355355
docker compose run mito-extract
356356
docker compose run seqrepo-load
357-
UTA_ETL_SKIP_GENE_LOAD=true docker compose run uta-load
357+
docker compose run uta-load
358358
```
359359

360360
#### 2C. Manual splign transcripts

docker-compose.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ services:
4242
network_mode: host
4343
uta-load:
4444
image: uta-update
45-
command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs ${UTA_ETL_SKIP_GENE_LOAD}
45+
command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs
4646
depends_on:
4747
uta:
4848
condition: service_healthy

sbin/ncbi_process_mito.py

Lines changed: 34 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@
5858
import importlib_resources
5959
import logging
6060
import logging.config
61-
from typing import Dict, List, Optional
61+
from typing import Dict, Iterable, List, Optional
6262

6363
from Bio.Seq import Seq
6464
import Bio.SeqIO
@@ -68,6 +68,7 @@
6868
from more_itertools import one
6969

7070
from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter
71+
from uta.formats.geneinfo import GeneInfo, GeneInfoWriter
7172
from uta.formats.seqinfo import SeqInfo, SeqInfoWriter
7273
from uta.formats.txinfo import TxInfo, TxInfoWriter
7374
from uta.formats.exonset import ExonSet, ExonSetWriter
@@ -79,6 +80,9 @@ class MitoGeneData:
7980
gene_id: int
8081
gene_symbol: str
8182
name: str
83+
synonym: str
84+
xrefs: List[str]
85+
type: str
8286
tx_ac: str
8387
tx_seq: str
8488
tx_start: int
@@ -110,7 +114,7 @@ def alt_exons_se_i(self) -> str:
110114
logger = logging.getLogger(__name__)
111115

112116

113-
def parse_args():
117+
def parse_args() -> argparse.Namespace:
114118
parser = argparse.ArgumentParser(description=__doc__)
115119
parser.add_argument("accession", type=str)
116120
parser.add_argument("--output-dir", "-o", default=".", type=str)
@@ -166,7 +170,7 @@ def parse_nomenclature_value(gb_feature: SeqFeature) -> Dict[str, str]:
166170
return nomenclature_results
167171

168172

169-
def get_mito_genes(gbff_filepath: str):
173+
def get_mito_genes(gbff_filepath: str) -> Iterable[MitoGeneData]:
170174
logger.info(f"processing NCBI GBFF file from {gbff_filepath}")
171175
with open(gbff_filepath) as fh:
172176
# Bio.SeqIO.parse(fh, "gb") returns an empty iterator for .fna files and does not fail
@@ -199,11 +203,14 @@ def get_mito_genes(gbff_filepath: str):
199203
# retrieve sequence, and reverse compliment if on reverse strand
200204
ac = f"{record.id}_{feature.location.start:05}_{feature.location.end:05}"
201205
feature_seq = record.seq[feature_start:feature_end]
206+
gene_synonym = feature.qualifiers.get("gene_synonym", "")
207+
type = feature.type
202208
if feature.location.strand == -1:
203209
feature_seq = feature_seq.reverse_complement()
204210

205211
if feature.type == "CDS":
206212
# override defaults for CDS features
213+
type = "protein-coding"
207214
pro_ac = one(feature.qualifiers["protein_id"])
208215
pro_seq = str(one(feature.qualifiers["translation"]))
209216
transl_table = one(feature.qualifiers["transl_table"])
@@ -219,6 +226,9 @@ def get_mito_genes(gbff_filepath: str):
219226
gene_id=gene_id,
220227
gene_symbol=hgnc,
221228
name=name,
229+
synonym=gene_synonym,
230+
xrefs=[f"{k}:{v}" for k, v in xrefs.items()],
231+
type=type,
222232
tx_ac=ac,
223233
tx_seq=str(feature_seq),
224234
tx_start=0,
@@ -234,15 +244,34 @@ def get_mito_genes(gbff_filepath: str):
234244
)
235245

236246

237-
def main(ncbi_accession: str, output_dir: str):
247+
def main(ncbi_accession: str, output_dir: str) -> None:
238248
# get input files
239249
input_files = download_mito_files(output_dir=output_dir, accession=ncbi_accession)
240250

241251
# extract Mitochondrial gene information
242252
mito_genes = [mg for mf in input_files.values() for mg in get_mito_genes(mf)]
243253
logger.info(f"found {len(mito_genes)} genes from parsing {input_files['gbff']}")
244254

245-
# write gene accessions
255+
# write gene information
256+
with gzip.open(f"{output_dir}/geneinfo.gz", "wt") as o_file:
257+
giw = GeneInfoWriter(o_file)
258+
for mg in mito_genes:
259+
giw.write(
260+
GeneInfo(
261+
mg.gene_id,
262+
mg.gene_symbol,
263+
9606,
264+
mg.gene_symbol,
265+
"",
266+
mg.synonym,
267+
mg.type,
268+
mg.name,
269+
mg.name,
270+
mg.xrefs,
271+
)
272+
)
273+
274+
# write gene accession associations
246275
with gzip.open(f"{output_dir}/assocacs.gz", "wt") as o_file:
247276
gaw = GeneAccessionsWriter(o_file)
248277
for mg in mito_genes:

sbin/seqrepo-load

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,9 +13,11 @@ then
1313
exit 1
1414
fi
1515

16-
## Load SeqRepo with new sequences
16+
# find all fasta files in the working directory
17+
mapfile -t FASTA_FILES < <(find "$sequence_dir" -type f -name "*.f[an]a*")
18+
19+
# Load SeqRepo with new sequences
1720
seqrepo --root-directory "$seqrepo_root" \
1821
load -n NCBI --instance-name "$seqrepo_version" \
19-
"$sequence_dir"/*.fna.gz \
20-
"$sequence_dir"/*.faa.gz 2>& 1 | \
22+
"${FASTA_FILES[@]}" 2>& 1 | \
2123
tee "$log_dir/seqrepo-load.log"

sbin/uta-diff

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ cmp_cols.update({
1515
"associated_accessions": "tx_ac pro_ac origin".split(),
1616
"exon_aln": "exon_aln_id tx_exon_id alt_exon_id cigar added".split(),
1717
"gene": "gene_id".split(),
18+
"seq_anno": "seq_anno_id seq_id origin_id ac added".split(),
1819
"transcript": "ac".split(),
1920
})
2021

@@ -41,7 +42,7 @@ def cmp1(con, tbl, s1, s2):
4142

4243

4344
if __name__ == "__main__":
44-
logging.basicConfig(level=logging.DEBUG)
45+
logging.basicConfig(level=logging.INFO)
4546
logger = logging.getLogger()
4647

4748
url = "postgresql://uta_admin@localhost/uta"

sbin/uta-load

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
# ncbi_dir is where the script looks for NCBI data files.
66
# working_dir stores intermediate data files and the final database dump.
77
# log_dir stores log files.
8-
# skip_load_genes, if truthy, will skip the gene loading step
98

109
# Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo.
1110

@@ -15,7 +14,6 @@ source_uta_v=$1
1514
ncbi_dir=$2
1615
working_dir=$3
1716
log_dir=$4
18-
skip_load_genes=$5
1917

2018
if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
2119
then
@@ -43,13 +41,8 @@ sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-
4341
tee "$log_dir/assoc-acs-merge.log"
4442

4543
# Load genes into gene table.
46-
if [ "$skip_load_genes" = "true" ]
47-
then
48-
echo "Skipping load-geneinfo"
49-
else
50-
uta --conf=etc/global.conf --conf=etc/[email protected] load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
51-
tee "$log_dir/load-geneinfo.log"
52-
fi
44+
uta --conf=etc/global.conf --conf=etc/[email protected] load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
45+
tee "$log_dir/load-geneinfo.log"
5346

5447
# Load accessions into associated_accessions table.
5548
uta --conf=etc/global.conf --conf=etc/[email protected] load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \
@@ -73,6 +66,3 @@ uta --conf=etc/global.conf --conf=etc/[email protected] align-exons 2>&1 |
7366

7467
### run diff
7568
sbin/uta-diff "$source_uta_v" "$loading_uta_v"
76-
77-
### psql_dump
78-
pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$working_dir/uta.pgd.gz"

src/uta/loading.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,7 @@ def _upsert_seq(si):
523523
session.merge(u_seqanno)
524524
else:
525525
# create the new annotation
526+
logger.debug("creating seq_anno({si.origin},{si.ac},{si.md5})".format(si=si))
526527
u_seqanno = usam.SeqAnno(origin_id=u_ori.origin_id, seq_id=si.md5,
527528
ac=si.ac, descr=si.descr)
528529
session.add(u_seqanno)
@@ -533,6 +534,7 @@ def _upsert_seq(si):
533534
logger.info("{n_created} annotations created/{i_md5} sequences seen ({p:.1f}%)/{n_rows} sequences total".format(
534535
n_created=n_created, i_md5=i_md5, n_rows=n_rows, md5=md5, p=i_md5 / n_rows * 100))
535536
session.commit()
537+
session.commit()
536538

537539

538540
def load_sequences(session, opts, cf):

0 commit comments

Comments
 (0)