#!/usr/bin/env python

from __future__ import print_function

import logging
import re
import sys

import psycopg2
import psycopg2.extras

import recordtype

# Connection string handed to psycopg2.connect() when run as a script.
dsn = "host=localhost dbname=uta_dev"
# Version labels printed in the second header row of the report, under the
# "is HGMD transcript?", Ensembl and "Patch Alignments" columns respectively.
hgmd_version = "2013.04"
ensembl_version = "v70"
grch_version = "GRCh37.p13"
# When True, a trailing "Notes" column is added to the header and to every row.
add_notes = False

############################################################################

# Matches a single CIGAR element: a decimal count followed by one op code
# ('=', 'X', 'D' or 'I').  Raw string so that `\d` reaches the regex engine
# verbatim instead of being treated as an (invalid) string escape.
cigarop_re = re.compile(r'(?P<count>\d+)(?P<op>[=XDI])')

class Tx(recordtype.recordtype('Tx',
                               ['ac','cds_start_i','cds_end_i'])):
    """Transcript record: accession (ac) plus CDS start/end coordinates.

    row_str() fills the fields from the 'tx_ac', 'cds_start_i' and
    'cds_end_i' columns of a result row; CIGAROp uses the CDS bounds to
    express positions relative to the CDS start.
    """
    pass


class CIGAROp(recordtype.recordtype('CIGAROp',
                                    ['tx','exon','exon_start_i','op','count','r_start_i'])):
    """A single CIGAR operation, positioned on a transcript.

    Fields: the owning transcript record (tx), the exon number, the offset
    of the op within that exon (exon_start_i), the op code, its count, and
    the start of the op in running r_* coordinates (r_start_i).
    """

    @property
    def src_len(self):
        """Positions this op advances along the r_* axis: 0 for 'I', else count."""
        if self.op == 'I':
            return 0
        return self.count

    @property
    def tgt_len(self):
        """Length on the opposite side of the alignment: 0 for 'D', else count."""
        if self.op == 'D':
            return 0
        return self.count

    @property
    def r_end_i(self):
        """End of the op on the r_* axis (start plus src_len)."""
        begin = self.r_start_i
        return begin + self.src_len

    @property
    def c_start_i(self):
        """Start of the op measured from the transcript's cds_start_i."""
        cds_origin = self.tx.cds_start_i
        return self.r_start_i - cds_origin

    @property
    def c_end_i(self):
        """End of the op measured from the transcript's cds_start_i."""
        cds_origin = self.tx.cds_start_i
        return self.r_end_i - cds_origin

    @property
    def in_cds(self):
        """True when [r_start_i, r_end_i] touches [cds_start_i, cds_end_i].

        Both comparisons are inclusive, so an op that merely abuts a CDS
        boundary is reported as inside the CDS.
        """
        transcript = self.tx
        if self.r_start_i > transcript.cds_end_i:
            return False
        return self.r_end_i >= transcript.cds_start_i

    def as_string(self):
        """Format the op as '<exon>:<count><op>(c.<pos>)', e.g. '2:3D(c.10_12)'.

        For 'I' the position is the pair of flanking coordinates; for every
        other op it is the first and last affected coordinate, collapsed to
        a single number when the two coincide.
        NOTE(review): the numbers are plain offsets from cds_start_i (+1);
        no special handling of positions outside the CDS is applied here.
        """
        first, last = self.c_start_i, self.c_end_i
        if self.op == 'I':
            last += 1
        else:
            first += 1

        if first == last:
            c_pos = str(first)
        else:
            c_pos = "{c_start}_{c_end}".format(c_start=first, c_end=last)

        return "{self.exon}:{self.count}{self.op}(c.{c_pos})".format(self=self, c_pos=c_pos)


def cigar_to_ops(tx,cigars):
    """Expand per-exon CIGAR strings into a flat list of CIGAROp objects.

    cigars is a sequence with one CIGAR string per exon, in order.  Exons
    are numbered from 1.  Each op records its offset within its exon and
    its start on a running coordinate that keeps accumulating across exons;
    both advance by the op's src_len.
    """
    collected = []
    running_pos = 0
    for exon_number, cigar_text in enumerate(cigars, 1):
        offset_in_exon = 0
        for match in cigarop_re.finditer(cigar_text):
            parts = match.groupdict()
            step = CIGAROp(
                tx = tx,
                exon = exon_number,
                exon_start_i = offset_in_exon,
                op = parts['op'],
                count = int(parts['count']),
                r_start_i = running_pos,
                )
            collected.append(step)
            advance = step.src_len
            running_pos += advance
            offset_in_exon += advance
    return collected


############################################################################

def build_sel_sql(preds=None):
    """Return the SELECT statement for bermuda.bermuda_data_mv.

    preds is an optional iterable of SQL predicate strings; when any are
    given they are ANDed together in a WHERE clause.  Rows are always
    ordered by hgnc, then tx_ac.

    Predicates are inserted into the SQL text verbatim, so they must only
    contain trusted text; pass values separately through placeholders such
    as %(hgnc)s when executing the query.
    """
    # A None default avoids sharing one list object between calls.
    sel_preds = [] if preds is None else list(preds)

    clauses = ['SELECT * FROM bermuda.bermuda_data_mv']
    if sel_preds:
        # Only emit WHERE when there is something to filter on, so the
        # unfiltered query carries no empty placeholder clause.
        clauses.append('WHERE ' + ' AND '.join(sel_preds))
    clauses.append('ORDER BY hgnc,tx_ac')
    return ' '.join(clauses)

def row_str(row):
    """Render one result row as a tab-separated report line.

    row is indexed by column name.  The output columns are, in order: gene
    (hgnc), transcript accession, HGMD flag, alignment quality, splign
    reference agreement, Ensembl equivalence class, summary, CDS indels,
    UTR indels, substitutions, GRCh37 coordinates, patches and Ensembl
    equivalents; a joined notes column is appended when the module-level
    add_notes flag is set.

    The summary is 'avoid: ...' if any blocking problem was recorded,
    otherwise 'warn: ...' if any warning was recorded, otherwise a pass.
    """
    avoid, warn, notes = [], [], []

    tx = Tx(row['tx_ac'], row['cds_start_i'], row['cds_end_i'])

    # Parse the ';'-separated splign CIGAR strings (one per exon), if any.
    if row['s_cigars'] is None:
        ops = []
    else:
        ops = cigar_to_ops(tx, row['s_cigars'].split(';'))

    # Bucket the ops: substitutions, and indels split by CDS membership.
    subs, cds_indels, utr_indels = [], [], []
    for o in ops:
        if o.op == 'X':
            subs.append(o)
        if o.op in 'DI':
            if o.in_cds:
                cds_indels.append(o)
            else:
                utr_indels.append(o)

    # alignment quality, decided by which of splign/blat produced coordinates
    have_splign = row['s_se_i'] is not None
    have_blat = row['b_se_i'] is not None
    if have_splign and have_blat:
        if row['s_se_i'] == row['b_se_i']:
            alignment_quality = 'high'
            notes.append('splign and blat coordinates are identical')
        else:
            alignment_quality = 'intermediate'
            warn.append('ambiguous alignment; talk to Reece')
    elif have_splign:
        notes.extend(['no blat alignment', 'splign is '+row['s_status']])
        if row['s_status'] == 'NLxdi':
            alignment_quality = 'high'
        elif 'n' in row['s_status']:
            alignment_quality = 'poor'
            avoid.append('fatal alignment problem')
        else:
            alignment_quality = 'intermediate'
            warn.append('reference disagreement (subs and indels at right)')
    elif have_blat:
        alignment_quality = 'poor'
        notes.extend(['no splign alignment', 'blat is '+row['b_status']])
        avoid.append('fatal alignment problem')
    else:
        alignment_quality = 'poor'
        avoid.append('no alignments available')

    assert alignment_quality is not None, "Failed to set alignment quality"

    # splign_refagree: exact, minor (with the kinds of edits seen), or poor
    s_status = row['s_status']
    if s_status == 'NLxdi':
        splign_refagree = 'exact'
    elif row['s_minor']:
        edit_labels = (('X', 'sub'), ('D', 'del'), ('I', 'ins'))
        edits = [label for flag, label in edit_labels if flag in s_status]
        splign_refagree = 'minor (' + ','.join(edits) + ')'
    else:
        splign_refagree = 'poor'

    # ensembl equivalents; a transcript without any is flagged as "avoid"
    equivs = row['enst_equivs']
    if equivs is None:
        enst_equivs = 'no'
        avoid.append('no ENST available')
    elif '/CE' in equivs:
        enst_equivs = 'CDS+UTR'
    else:
        enst_equivs = 'CDS only'

    if row['is_hgmd_tx']:
        is_hgmd_tx = 'yes'
    else:
        is_hgmd_tx = 'no'

    # genomic coordinates, shown only when an alternate accession exists
    if row['alt_ac'] is None:
        grch37_coords = ''
    else:
        strand_sign = '+' if row['alt_strand'] == 1 else '-'
        grch37_coords = "{row[alt_ac]}:{row[alt_bounds]} ({pm})".format(
            row = row, pm = strand_sign)

    # "avoid" outranks "warn", which outranks a clean pass
    if avoid:
        summary = 'avoid: ' + '; '.join(avoid)
    elif warn:
        summary = 'warn: ' + '; '.join(warn)
    else:
        summary = 'pass: no issues found'

    notes.extend([
        's_status: ' + str(row['s_status']),
        's_cigars: ' + str(row['s_cigars']),
        ])

    def _ops_text(op_list):
        # comma-separated rendering of a list of CIGAROp objects
        return ','.join([op.as_string() for op in op_list])

    cols = [
        row['hgnc'],
        row['tx_ac'],
        is_hgmd_tx,

        alignment_quality,
        splign_refagree,
        enst_equivs,

        summary,

        _ops_text(cds_indels),
        _ops_text(utr_indels),
        _ops_text(subs),
        grch37_coords,
        row['patches'] or '',
        row['enst_equivs'] or '',
        ]
    if add_notes:
        cols.append('; '.join(notes))

    return '\t'.join(cols)


############################################################################

# Script entry point: print a two-line header, then one report line per row
# of bermuda.bermuda_data_mv.  Command-line arguments, if any, are treated as
# HGNC gene symbols and restrict the output to those genes (queried one at a
# time, in the order given); with no arguments every row is reported.
if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    logger = logging.getLogger(__name__)

    con = psycopg2.connect(dsn)
    try:
        # Each entry is [column title, sub-title]; zip(*cols) below turns the
        # table into the two header rows.
        cols = [
            [ "Gene",                     "HGNC",           ],
            [ "Accession",                "",               ],
            [ "is HGMD transcript?",      hgmd_version,     ],

            [ "Alignment Quality",        "splign",         ],
            [ "Reference Agreement",      "splign",         ],
            [ "has Ensembl equivalents?", ensembl_version,  ],

            [ "Summary",                  "",               ],

            [ "CDS indels",               "",               ],
            [ "UTR indels",               "",               ],
            [ "Substitutions",            "",               ],
            [ "GRCh37 coordinates",       "",               ],
            [ "Patch Alignments",         grch_version,     ],
            [ "Ensembl Equivalents",      ensembl_version,  ],

            ]

        if add_notes:
            cols.append([ "Notes",                    "",               ])

        for hrow in zip(*cols):
            print( '\t'.join(hrow))

        sel_cur = con.cursor(cursor_factory=psycopg2.extras.DictCursor)
        try:
            if len(sys.argv) > 1:
                # The gene symbol is passed as a bound parameter, never
                # spliced into the SQL text.
                sel_sql = build_sel_sql(preds = [ 'hgnc = %(hgnc)s' ])
                for hgnc in sys.argv[1:]:
                    sel_cur.execute(sel_sql,{'hgnc': hgnc})
                    for row in sel_cur:
                        print( row_str(row) )
            else:
                sel_sql = build_sel_sql()
                sel_cur.execute(sel_sql)
                for row in sel_cur:
                    print( row_str(row) )
        finally:
            # Release the cursor even if a query or row rendering fails.
            sel_cur.close()
    finally:
        # Always give the database connection back, on success or error.
        con.close()



## <LICENSE>
## Copyright 2014 UTA Contributors (https://bitbucket.org/invitae/uta)
## 
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
## 
##     http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
## </LICENSE>
