#!/usr/bin/env python
# generate a tab-separated table (seguid, md5, ac) of sequence checksums and
# protein accessions, one row per FASTA record read from stdin

import collections
import hashlib
import re
import sys

from Bio import SeqIO
from Bio.SeqUtils.CheckSum import seguid

# Captures the accession that follows a "ref|" tag in a FASTA record id,
# e.g. "NP_000001.1" in "gi|12345|ref|NP_000001.1|".
refseq_ac_re = re.compile(r'ref\|([^|]+)')

# Header row, then one tab-separated row per FASTA record read from stdin.
print('\t'.join(['seguid', 'md5', 'ac']))
for record in SeqIO.parse(sys.stdin, "fasta"):
    # str(seq) is the portable replacement for Seq.tostring(), which was
    # deprecated and later removed from Biopython. Compute it once and reuse
    # it for both checksums.
    seq_str = str(record.seq)
    seq_seguid = seguid(seq_str)
    # hashlib digests operate on bytes; passing a text string raises
    # TypeError on Python 3, so encode explicitly.
    seq_md5 = hashlib.md5(seq_str.encode('utf-8')).hexdigest()

    # Fail with a message naming the offending record instead of an opaque
    # "'NoneType' object has no attribute 'group'".
    ac_match = refseq_ac_re.search(record.id)
    if ac_match is None:
        raise ValueError(
            "no 'ref|<accession>' tag found in FASTA record id: %r"
            % (record.id,))
    ac = ac_match.group(1)

    print('\t'.join([seq_seguid, seq_md5, ac]))

## <LICENSE>
## Copyright 2014 UTA Contributors (https://bitbucket.org/invitae/uta)
## 
## Licensed under the Apache License, Version 2.0 (the "License");
## you may not use this file except in compliance with the License.
## You may obtain a copy of the License at
## 
##     http://www.apache.org/licenses/LICENSE-2.0
## 
## Unless required by applicable law or agreed to in writing, software
## distributed under the License is distributed on an "AS IS" BASIS,
## WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
## See the License for the specific language governing permissions and
## limitations under the License.
## </LICENSE>
