feat(IPVC-2408): Mito processing updates and bug fixes (#31)

bsgiles73 · web-flow · commit 93e79f7751e6 · 2024-05-05T09:12:48.000-06:00
diff --git a/README.md b/README.md
@@ -347,14 +347,14 @@ See 2A for nuclear transcripts and 2B for mitochondrial transcripts.
 docker compose run ncbi-download
 docker compose run uta-extract
 docker compose run seqrepo-load
-UTA_ETL_SKIP_GENE_LOAD=false docker compose run uta-load
+docker compose run uta-load
 ```
 
 #### 2B. Mitochondrial transcripts
 ```
 docker compose run mito-extract
 docker compose run seqrepo-load
-UTA_ETL_SKIP_GENE_LOAD=true docker compose run uta-load
+docker compose run uta-load
 ```
 
 #### 2C. Manual splign transcripts
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -42,7 +42,7 @@ services:
     network_mode: host
   uta-load:
     image: uta-update
-    command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs ${UTA_ETL_SKIP_GENE_LOAD}
+    command: sbin/uta-load ${UTA_ETL_OLD_UTA_VERSION} /ncbi-dir /uta-load/work /uta-load/logs
     depends_on:
       uta:
         condition: service_healthy
diff --git a/sbin/ncbi_process_mito.py b/sbin/ncbi_process_mito.py
@@ -58,7 +58,7 @@
 import importlib_resources
 import logging
 import logging.config
-from typing import Dict, List, Optional
+from typing import Dict, Iterable, List, Optional
 
 from Bio.Seq import Seq
 import Bio.SeqIO
@@ -68,6 +68,7 @@
 from more_itertools import one
 
 from uta.formats.geneaccessions import GeneAccessions, GeneAccessionsWriter
+from uta.formats.geneinfo import GeneInfo, GeneInfoWriter
 from uta.formats.seqinfo import SeqInfo, SeqInfoWriter
 from uta.formats.txinfo import TxInfo, TxInfoWriter
 from uta.formats.exonset import ExonSet, ExonSetWriter
@@ -79,6 +80,9 @@ class MitoGeneData:
     gene_id: int
     gene_symbol: str
     name: str
+    synonym: str
+    xrefs: List[str]
+    type: str
     tx_ac: str
     tx_seq: str
     tx_start: int
@@ -110,7 +114,7 @@ def alt_exons_se_i(self) -> str:
 logger = logging.getLogger(__name__)
 
 
-def parse_args():
+def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(description=__doc__)
     parser.add_argument("accession", type=str)
     parser.add_argument("--output-dir", "-o", default=".", type=str)
@@ -166,7 +170,7 @@ def parse_nomenclature_value(gb_feature: SeqFeature) -> Dict[str, str]:
     return nomenclature_results
 
 
-def get_mito_genes(gbff_filepath: str):
+def get_mito_genes(gbff_filepath: str) -> Iterable[MitoGeneData]:
     logger.info(f"processing NCBI GBFF file from {gbff_filepath}")
     with open(gbff_filepath) as fh:
         # Bio.SeqIO.parse(fh, "gb") returns an empty iterator for .fna files and does not fail
@@ -199,11 +203,14 @@ def get_mito_genes(gbff_filepath: str):
                     # retrieve sequence, and reverse complement if on reverse strand
                     ac = f"{record.id}_{feature.location.start:05}_{feature.location.end:05}"
                     feature_seq = record.seq[feature_start:feature_end]
+                    gene_synonym = feature.qualifiers.get("gene_synonym", "")
+                    type = feature.type
                     if feature.location.strand == -1:
                         feature_seq = feature_seq.reverse_complement()
 
                     if feature.type == "CDS":
                         # override defaults for CDS features
+                        type = "protein-coding"
                         pro_ac = one(feature.qualifiers["protein_id"])
                         pro_seq = str(one(feature.qualifiers["translation"]))
                         transl_table = one(feature.qualifiers["transl_table"])
@@ -219,6 +226,9 @@ def get_mito_genes(gbff_filepath: str):
                         gene_id=gene_id,
                         gene_symbol=hgnc,
                         name=name,
+                        synonym=gene_synonym,
+                        xrefs=[f"{k}:{v}" for k, v in xrefs.items()],
+                        type=type,
                         tx_ac=ac,
                         tx_seq=str(feature_seq),
                         tx_start=0,
@@ -234,15 +244,34 @@ def get_mito_genes(gbff_filepath: str):
                     )
 
 
-def main(ncbi_accession: str, output_dir: str):
+def main(ncbi_accession: str, output_dir: str) -> None:
     # get input files
     input_files = download_mito_files(output_dir=output_dir, accession=ncbi_accession)
 
     # extract Mitochondrial gene information
     mito_genes = [mg for mf in input_files.values() for mg in get_mito_genes(mf)]
     logger.info(f"found {len(mito_genes)} genes from parsing {input_files['gbff']}")
 
-    # write gene accessions
+    # write gene information
+    with gzip.open(f"{output_dir}/geneinfo.gz", "wt") as o_file:
+        giw = GeneInfoWriter(o_file)
+        for mg in mito_genes:
+            giw.write(
+                GeneInfo(
+                    mg.gene_id,
+                    mg.gene_symbol,
+                    9606,
+                    mg.gene_symbol,
+                    "",
+                    mg.synonym,
+                    mg.type,
+                    mg.name,
+                    mg.name,
+                    mg.xrefs,
+                )
+            )
+
+    # write gene accession associations
     with gzip.open(f"{output_dir}/assocacs.gz", "wt") as o_file:
         gaw = GeneAccessionsWriter(o_file)
         for mg in mito_genes:
diff --git a/sbin/seqrepo-load b/sbin/seqrepo-load
@@ -13,9 +13,11 @@ then
     exit 1
 fi
 
-## Load SeqRepo with new sequences
+# find all FASTA files (names matching *.f[an]a*) under the sequence directory
+mapfile -t FASTA_FILES < <(find "$sequence_dir" -type f -name "*.f[an]a*")
+
+# Load SeqRepo with new sequences
 seqrepo --root-directory "$seqrepo_root" \
     load -n NCBI --instance-name "$seqrepo_version" \
-    "$sequence_dir"/*.fna.gz \
-    "$sequence_dir"/*.faa.gz 2>& 1 | \
+    "${FASTA_FILES[@]}" 2>& 1 | \
     tee "$log_dir/seqrepo-load.log"
diff --git a/sbin/uta-diff b/sbin/uta-diff
@@ -15,6 +15,7 @@ cmp_cols.update({
     "associated_accessions": "tx_ac pro_ac origin".split(),
     "exon_aln": "exon_aln_id tx_exon_id alt_exon_id cigar added".split(),
     "gene": "gene_id".split(),
+    "seq_anno": "seq_anno_id seq_id origin_id ac added".split(),
     "transcript": "ac".split(),
     })
 
@@ -41,7 +42,7 @@ def cmp1(con, tbl, s1, s2):
 
 
 if __name__ == "__main__":
-    logging.basicConfig(level=logging.DEBUG)
+    logging.basicConfig(level=logging.INFO)
     logger = logging.getLogger()
 
     url = "postgresql://uta_admin@localhost/uta"
diff --git a/sbin/uta-load b/sbin/uta-load
@@ -5,7 +5,6 @@
 # ncbi_dir is where the script looks for NCBI data files.
 # working_dir stores intermediate data files and the final database dump.
 # log_dir stores log files.
-# skip_load_genes, if truthy, will skip the gene loading step
 
 # Note that the uta loading code uses the seqrepo location defined in the conf files, under [sequences].seqrepo.
 
@@ -15,7 +14,6 @@ source_uta_v=$1
 ncbi_dir=$2
 working_dir=$3
 log_dir=$4
-skip_load_genes=$5
 
 if [ -z "$source_uta_v" ] || [ -z "$ncbi_dir" ] || [ -z "$working_dir" ] || [ -z "$log_dir" ]
 then
@@ -43,13 +41,8 @@ sbin/assoc-acs-merge "$working_dir/assocacs.gz" | gzip -c > "$working_dir/assoc-
     tee "$log_dir/assoc-acs-merge.log"
 
 # Load genes into gene table.
-if [ "$skip_load_genes" = "true" ]
-then
-    echo "Skipping load-geneinfo"
-else
-    uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
-        tee "$log_dir/load-geneinfo.log"
-fi
+uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-geneinfo "$working_dir/geneinfo.gz" 2>&1 | \
+    tee "$log_dir/load-geneinfo.log"
 
 # Load accessions into associated_accessions table.
 uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf load-assoc-ac "$working_dir/assoc-ac.gz" 2>&1 | \
@@ -73,6 +66,3 @@ uta --conf=etc/global.conf --conf=etc/uta_dev@localhost.conf align-exons 2>&1 |
 
 ### run diff
 sbin/uta-diff "$source_uta_v" "$loading_uta_v"
-
-### psql_dump
-pg_dump -U uta_admin -h localhost -d uta -t "$loading_uta_v.gene" | gzip -c > "$working_dir/uta.pgd.gz"
diff --git a/src/uta/loading.py b/src/uta/loading.py
@@ -523,6 +523,7 @@ def _upsert_seq(si):
                     session.merge(u_seqanno)
             else:
                 # create the new annotation
+                logger.debug("creating seq_anno({si.origin},{si.ac},{si.md5})".format(si=si))
                 u_seqanno = usam.SeqAnno(origin_id=u_ori.origin_id, seq_id=si.md5,
                                          ac=si.ac, descr=si.descr)
                 session.add(u_seqanno)
@@ -533,6 +534,7 @@ def _upsert_seq(si):
             logger.info("{n_created} annotations created/{i_md5} sequences seen ({p:.1f}%)/{n_rows} sequences total".format(
                 n_created=n_created, i_md5=i_md5, n_rows=n_rows, md5=md5, p=i_md5 / n_rows * 100))
             session.commit()
+    session.commit()
 
 
 def load_sequences(session, opts, cf):