5858import importlib_resources
5959import logging
6060import logging .config
61- from typing import Dict , List , Optional
61+ from typing import Dict , Iterable , List , Optional
6262
6363from Bio .Seq import Seq
6464import Bio .SeqIO
6868from more_itertools import one
6969
7070from uta .formats .geneaccessions import GeneAccessions , GeneAccessionsWriter
71+ from uta .formats .geneinfo import GeneInfo , GeneInfoWriter
7172from uta .formats .seqinfo import SeqInfo , SeqInfoWriter
7273from uta .formats .txinfo import TxInfo , TxInfoWriter
7374from uta .formats .exonset import ExonSet , ExonSetWriter
@@ -79,6 +80,9 @@ class MitoGeneData:
7980 gene_id : int
8081 gene_symbol : str
8182 name : str
83+ synonym : str
84+ xrefs : List [str ]
85+ type : str
8286 tx_ac : str
8387 tx_seq : str
8488 tx_start : int
@@ -110,7 +114,7 @@ def alt_exons_se_i(self) -> str:
110114logger = logging .getLogger (__name__ )
111115
112116
113- def parse_args ():
117+ def parse_args () -> argparse . Namespace :
114118 parser = argparse .ArgumentParser (description = __doc__ )
115119 parser .add_argument ("accession" , type = str )
116120 parser .add_argument ("--output-dir" , "-o" , default = "." , type = str )
@@ -166,7 +170,7 @@ def parse_nomenclature_value(gb_feature: SeqFeature) -> Dict[str, str]:
166170 return nomenclature_results
167171
168172
169- def get_mito_genes (gbff_filepath : str ):
173+ def get_mito_genes (gbff_filepath : str ) -> Iterable [ MitoGeneData ] :
170174 logger .info (f"processing NCBI GBFF file from { gbff_filepath } " )
171175 with open (gbff_filepath ) as fh :
172176 # Bio.SeqIO.parse(fh, "gb") returns an empty iterator for .fna files and does not fail
@@ -199,11 +203,14 @@ def get_mito_genes(gbff_filepath: str):
199203 # retrieve sequence, and reverse compliment if on reverse strand
200204 ac = f"{ record .id } _{ feature .location .start :05} _{ feature .location .end :05} "
201205 feature_seq = record .seq [feature_start :feature_end ]
206+ gene_synonym = feature .qualifiers .get ("gene_synonym" , "" )
207+ type = feature .type
202208 if feature .location .strand == - 1 :
203209 feature_seq = feature_seq .reverse_complement ()
204210
205211 if feature .type == "CDS" :
206212 # override defaults for CDS features
213+ type = "protein-coding"
207214 pro_ac = one (feature .qualifiers ["protein_id" ])
208215 pro_seq = str (one (feature .qualifiers ["translation" ]))
209216 transl_table = one (feature .qualifiers ["transl_table" ])
@@ -219,6 +226,9 @@ def get_mito_genes(gbff_filepath: str):
219226 gene_id = gene_id ,
220227 gene_symbol = hgnc ,
221228 name = name ,
229+ synonym = gene_synonym ,
230+ xrefs = [f"{ k } :{ v } " for k , v in xrefs .items ()],
231+ type = type ,
222232 tx_ac = ac ,
223233 tx_seq = str (feature_seq ),
224234 tx_start = 0 ,
@@ -234,15 +244,34 @@ def get_mito_genes(gbff_filepath: str):
234244 )
235245
236246
237- def main (ncbi_accession : str , output_dir : str ):
247+ def main (ncbi_accession : str , output_dir : str ) -> None :
238248 # get input files
239249 input_files = download_mito_files (output_dir = output_dir , accession = ncbi_accession )
240250
241251 # extract Mitochondrial gene information
242252 mito_genes = [mg for mf in input_files .values () for mg in get_mito_genes (mf )]
243253 logger .info (f"found { len (mito_genes )} genes from parsing { input_files ['gbff' ]} " )
244254
245- # write gene accessions
255+ # write gene information
256+ with gzip .open (f"{ output_dir } /geneinfo.gz" , "wt" ) as o_file :
257+ giw = GeneInfoWriter (o_file )
258+ for mg in mito_genes :
259+ giw .write (
260+ GeneInfo (
261+ mg .gene_id ,
262+ mg .gene_symbol ,
263+ 9606 ,
264+ mg .gene_symbol ,
265+ "" ,
266+ mg .synonym ,
267+ mg .type ,
268+ mg .name ,
269+ mg .name ,
270+ mg .xrefs ,
271+ )
272+ )
273+
274+ # write gene accession associations
246275 with gzip .open (f"{ output_dir } /assocacs.gz" , "wt" ) as o_file :
247276 gaw = GeneAccessionsWriter (o_file )
248277 for mg in mito_genes :
0 commit comments