#!/bin/bash
# Build a SQLite database from the uta1 schema of a UTA PostgreSQL instance.
# usage: uta-pg-to-sqlite [-t]
#   -t: build a test database (restricted to the transcripts listed in $nm)
#       instead of a full export

# Enable errexit here rather than on the shebang line, so that it is still in
# effect when the script is started as "bash uta-pg-to-sqlite".
set -e

make_test=0
# Transcript accessions that define the contents of the test database,
# formatted as a SQL "IN (...)" list.
nm="('NM_006158.3', 'NM_000399.3', 'NM_000257.2', 'NM_024740.2', 'NM_000314.4', 'NM_152274.3', 'NM_003060.3', 'NM_000249.3', 'NM_020451.2')"
#np="('NP_689487.2', 'NP_000390.2', 'NP_079016.2', 'NP_003051.1', 'NP_000248.2', 'NP_006149.2', 'NP_000305.3', 'NP_000240.1', 'NP_065184.2')"

# Consume leading options; parsing stops at the first argument that is not a
# recognized option (remaining arguments are left in "$@" and ignored).
while [ $# -gt 0 ]; do
    case "$1" in
        -t) make_test=1 ;;
        *) break ;;
    esac
    shift
done

# Run one SQL command against the UTA PostgreSQL database and write the result
# to stdout in psql's unaligned, tuples-only format with TAB as the field
# separator (-A -t -F<TAB>); -X skips the user's psqlrc.
# Arguments:
#   $1   - SQL command handed to "psql -c"
#   $2.. - passed through to psql unchanged
# Environment (optional overrides; the defaults are shown in parentheses):
#   UTA_PG_HOST (uta.invitae.com)  UTA_PG_DB (uta_dev)  UTA_PG_USER (uta_public)
#   UTA_PG_PORT (unset: no -p option is passed to psql)
psql_do () {
	local -a conn_opts=(
		-h "${UTA_PG_HOST:-uta.invitae.com}"
		-d "${UTA_PG_DB:-uta_dev}"
		-U "${UTA_PG_USER:-uta_public}"
	)
	# Only pass a port when one was requested explicitly.
	if [ -n "${UTA_PG_PORT:-}" ]; then
		conn_opts+=(-p "$UTA_PG_PORT")
	fi
	psql "${conn_opts[@]}" -AtX -F$'\t' -c "$@"
}

# EXIT-trap handler: remove the temporary data and database files, if any.
# Globals: data_tmpfn, sqlitedb_tmpfn (read) - temp file paths; either may be
#          empty or unset, in which case it is skipped without failing
# Outputs: "rm -v" reports every file it actually removes on stdout
cleanup () {
	local tmpfn
	for tmpfn in "${data_tmpfn:-}" "${sqlitedb_tmpfn:-}"; do
		# An "if" (rather than "[ ... ] && rm") keeps the function's status
		# at 0 when there is nothing to delete.
		if [ -n "$tmpfn" ]; then
			rm -fv -- "$tmpfn"
		fi
	done
}
# Register the cleanup handler before creating the temp files, so they are
# removed on every exit path.
trap cleanup EXIT
data_tmpfn=$(mktemp /tmp/XXXXX.tsv)
sqlitedb_tmpfn=$(mktemp /tmp/XXXXX.db)

# The output filename embeds the schema version stored in the source database.
schema_version=$(psql_do "select value from uta1.meta where key='schema_version'")
version=$schema_version
if [ "$make_test" = 0 ]; then
	sqlite_fn="/tmp/uta-$version.db"
else
	sqlite_fn="/tmp/uta-test-$version.db"
fi


# Create the SQLite schema in the temp database: the meta table, the data
# tables with their indexes, and the two views built on top of them.
# The heredoc delimiter is quoted so the SQL reaches sqlite3 literally, with
# no shell expansion.
sqlite3 "$sqlitedb_tmpfn" <<'EOF'
create table meta (key text,value text);

create table exon_set (exon_set_id integer not null primary key, tx_ac text not null, alt_ac text not null, alt_strand smallint not null, alt_aln_method text not null, added timestamp without time zone not null);
create index exon_set_alt_aln_method_ix on exon_set(alt_aln_method);

create table exon (exon_id integer not null primary key, exon_set_id integer not null, start_i integer not null, end_i integer not null, ord integer not null, name text);
create index end_i_must_be_unique_in_exon_set on exon(exon_set_id, end_i);
create index start_i_must_be_unique_in_exon_set on exon(exon_set_id, start_i);
create index exon_exon_set_id_ix on exon(exon_set_id);

create table exon_aln (exon_aln_id integer not null primary key, tx_exon_id integer not null, alt_exon_id integer not null, cigar text not null, added timestamp without time zone not null, tx_aseq text not null, alt_aseq text not null);
create index exon_aln_alt_exon_id_ix on exon_aln(alt_exon_id);
create index exon_aln_tx_exon_id_ix on  exon_aln(tx_exon_id);

create table seq (seq_id text primary key, len integer not null, seq text);

create table seq_anno (seq_anno_id integer not null primary key, seq_id text, origin_id integer not null, ac text not null, descr text not null, added timestamp without time zone not null);
create index seq_anno_ac_unique_in_origin on seq_anno(origin_id, ac);

create table gene (hgnc text not null primary key, maploc text, descr text, summary text, aliases text, added timestamp without time zone not null);

create table transcript (ac text not null primary key, origin_id integer not null, hgnc text, cds_start_i integer not null, cds_end_i integer not null, cds_md5 text not null, added timestamp without time zone not null);
create index transcript_cds_md5_ix on transcript(cds_md5);
create index transcript_hgnc_ix on transcript(hgnc);

create table tx_exon_set_summary_mv (hgnc text, cds_md5 text, es_fingerprint text, tx_ac text, alt_ac text, alt_aln_method text, alt_strand smallint, exon_set_id integer, n_exons bigint, se_i text, starts_i integer[], ends_i integer[], lengths integer[]);
create index tx_exon_set_summary_mv_alt_ac_ix on tx_exon_set_summary_mv(alt_ac);
create index tx_exon_set_summary_mv_alt_aln_method_ix on tx_exon_set_summary_mv(alt_aln_method);
create index tx_exon_set_summary_mv_cds_md5_ix on tx_exon_set_summary_mv(cds_md5);
create index tx_exon_set_summary_mv_es_fingerprint_ix on tx_exon_set_summary_mv(es_fingerprint);
create index tx_exon_set_summary_mv_tx_ac_ix on tx_exon_set_summary_mv(tx_ac);

create view tx_exon_aln_v as  SELECT t.hgnc,
    t.ac AS tx_ac,
    aes.alt_ac,
    aes.alt_aln_method,
    aes.alt_strand,
    te.ord,
    te.start_i AS tx_start_i,
    te.end_i AS tx_end_i,
    ae.start_i AS alt_start_i,
    ae.end_i AS alt_end_i,
    ea.cigar,
    ea.tx_aseq,
    ea.alt_aseq,
    tes.exon_set_id AS tx_exon_set_id,
    aes.exon_set_id AS alt_exon_set_id,
    te.exon_id AS tx_exon_id,
    ae.exon_id AS alt_exon_id,
    ea.exon_aln_id
   FROM transcript t
   JOIN exon_set tes ON t.ac = tes.tx_ac AND tes.alt_aln_method = 'transcript'
   JOIN exon_set aes ON t.ac = aes.tx_ac AND aes.alt_aln_method <> 'transcript'
   JOIN exon te ON tes.exon_set_id = te.exon_set_id
   JOIN exon ae ON aes.exon_set_id = ae.exon_set_id AND te.ord = ae.ord
   LEFT JOIN exon_aln ea ON te.exon_id = ea.tx_exon_id AND ae.exon_id = ea.alt_exon_id;

create view tx_def_summary_v as  SELECT tess.hgnc,
    tess.cds_md5,
    tess.es_fingerprint,
    tess.tx_ac,
    tess.alt_ac,
    tess.alt_aln_method,
    tess.alt_strand,
    tess.exon_set_id,
    tess.n_exons,
    tess.se_i,
    tess.starts_i,
    tess.ends_i,
    tess.lengths,
    t.cds_start_i,
    t.cds_end_i
   FROM tx_exon_set_summary_mv tess
   JOIN transcript t ON tess.tx_ac = t.ac
  WHERE tess.alt_aln_method = 'transcript';


EOF



# Per-table SQL fragments (a JOIN or a WHERE clause) that are appended to the
# export queries. The default is empty, i.e. every row is exported.
nm_where_seq=""
nm_where_exon=""
nm_where_seq_anno=""
nm_where_gene=""
nm_where_exon_set=""
nm_where_transcript=""
nm_where_exon_aln=""
nm_where_tx_exon_set_summary_mv=""

# For a test build, narrow every table down to the rows tied to the
# accessions listed in $nm.
if [ "$make_test" != 0 ]; then
	# Tables filtered with a plain WHERE on their own accession column.
	nm_where_seq_anno="WHERE sa.ac IN ${nm}"
	nm_where_exon_set="WHERE es.tx_ac IN ${nm}"
	nm_where_transcript="WHERE t.ac IN ${nm}"
	nm_where_tx_exon_set_summary_mv="WHERE tessm.tx_ac IN ${nm}"
	# Tables that reach the accession through a JOIN.
	nm_where_seq="JOIN uta1.seq_anno AS sa ON s.seq_id=sa.seq_id AND sa.ac IN ${nm}"
	nm_where_gene="JOIN uta1.transcript as t ON t.hgnc=g.hgnc AND t.ac IN ${nm}"
	nm_where_exon="JOIN uta1.exon_set AS es ON e.exon_set_id=es.exon_set_id AND es.tx_ac IN ${nm}"
	nm_where_exon_aln="JOIN uta1.exon AS e ON ea.tx_exon_id=e.exon_id JOIN uta1.exon_set AS es ON e.exon_set_id=es.exon_set_id AND es.tx_ac IN ${nm}"
fi

# load_table TABLE QUERY
# Export the rows selected by QUERY from PostgreSQL into the shared temp TSV
# file, import that file into TABLE of the SQLite temp database, and print
# "<rows now in TABLE>/<lines exported> rows loaded into TABLE" on stdout.
# Globals: data_tmpfn, sqlitedb_tmpfn (read); runs the query through psql_do
load_table () {
	local table=$1 query=$2
	local n_loaded n_exported

	psql_do "$query" >"$data_tmpfn"
	sqlite3 -separator $'\t' "$sqlitedb_tmpfn" ".import $data_tmpfn $table"

	# Collected in separate assignments so that a failing count command is
	# not hidden inside printf's argument list.
	n_loaded=$(sqlite3 "$sqlitedb_tmpfn" "select count(*) from $table")
	n_exported=$(wc -l <"$data_tmpfn")
	# Some wc implementations pad the count with blanks; strip them for %d.
	n_exported=${n_exported//[[:space:]]/}
	printf "%d/%d rows loaded into %s\n" "$n_loaded" "$n_exported" "$table"
}

load_table meta 'SELECT key,value FROM uta1.meta'

load_table seq "SELECT s.seq_id, s.len, s.seq FROM uta1.seq AS s ${nm_where_seq}"

load_table exon "SELECT e.exon_id, e.exon_set_id, e.start_i, e.end_i, e.ord, e.name FROM uta1.exon AS e ${nm_where_exon}"

load_table seq_anno "SELECT sa.seq_anno_id, sa.seq_id, sa.origin_id, sa.ac, sa.descr, sa.added FROM uta1.seq_anno AS sa ${nm_where_seq_anno}"

load_table gene "SELECT g.hgnc, g.maploc, g.descr, g.summary, g.aliases, g.added FROM uta1.gene AS g ${nm_where_gene}"

load_table exon_set "SELECT es.exon_set_id, es.tx_ac, es.alt_ac, es.alt_strand, es.alt_aln_method, es.added FROM uta1.exon_set AS es ${nm_where_exon_set}"

load_table transcript "SELECT t.ac, t.origin_id, t.hgnc, t.cds_start_i, t.cds_end_i, t.cds_md5, t.added FROM uta1.transcript AS t ${nm_where_transcript}"

load_table exon_aln "SELECT ea.exon_aln_id, ea.tx_exon_id, ea.alt_exon_id, ea.cigar, ea.added, ea.tx_aseq, ea.alt_aseq from uta1.exon_aln AS ea ${nm_where_exon_aln}"

load_table tx_exon_set_summary_mv "SELECT tessm.hgnc, tessm.cds_md5, tessm.es_fingerprint, tessm.tx_ac, tessm.alt_ac, tessm.alt_aln_method, tessm.alt_strand, tessm.exon_set_id, tessm.n_exons, tessm.se_i, tessm.starts_i, tessm.ends_i, tessm.lengths FROM uta1.tx_exon_set_summary_mv AS tessm ${nm_where_tx_exon_set_summary_mv}"

# Record export provenance in the meta table. $0 ends up inside a SQL string
# literal, so any single quote in the script path is doubled to keep the
# statement well-formed.
sq="'"
exported_by=${0//"$sq"/$sq$sq}
sqlite3 "$sqlitedb_tmpfn" <<EOF
insert into meta (key,value) values ('exported',strftime('%Y-%m-%d %H:%M:%SZ'));
insert into meta (key,value) values ('exported by','$exported_by');
EOF

# Move the finished database to its final name and report the result.
/bin/mv -v "$sqlitedb_tmpfn" "$sqlite_fn"
echo "wrote database to $sqlite_fn"
/bin/ls -l "$sqlite_fn"

## <LICENSE>
## Copyright 2014 UTA Contributors (https://bitbucket.org/invitae/uta)
## 
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
## 
##     http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
## </LICENSE>
