# -*- coding: UTF-8 -*-
#!/usr/bin/env python3

# v0.1:fixing the problem that the feature of exon/mRNA is before gene
# v0.2: Use CDS to decide which transcript is primary
# v0.3: Fix the bug that if the CDS_parent include '\r' , the lst_len will be 0

# GFF must have CDS feature
# GFF must have ID and Parent in column 9

import re
import fileinput
from collections import defaultdict

gene_dict = defaultdict(list)
tx_pos_dict = defaultdict(list)
CDS_dict = defaultdict(list)

for line in fileinput.input():
    if line.startswith("#"):
        continue
    content = line.split("\t")
    if len(content) <= 8:
        continue
    #print(content)
    '''
    if content[2] == 'gene':
        gene_id = re.search(r'ID=(.*?)[;\n]', content[8]).group(1)
        #print(gene_id)
        if gene_id not in gene_dict:
            gene_dict[gene_id] = []
    '''
    if content[2] == "transcript" or content[2] == "mRNA":

        # use regual expression to extract the gene ID
        # match the pattern ID=xxxxx; or ID=xxxxx

        tx_id = re.search(r'ID=(.*?)[;\n]',content[8]).group(1)
        tx_parent = re.search(r'Parent=(.*?)[;\n]',content[8]).group(1)
        tx_parent = tx_parent.strip() # remove the 'r' and '\n'

        # if the parent of transcript is not in the gene_dict, create it rather than append
        if tx_parent in gene_dict:
            gene_dict[tx_parent].append(tx_id)
        else:
            gene_dict[tx_parent] = [tx_id]
        tx_pos_dict[tx_id] = [content[0],content[3], content[4], content[6] ]
    # GFF must have CDS feature
    if content[2] == 'CDS':
        width = int(content[4]) - int(content[3])
        CDS_parent = re.search(r'Parent=(.*?)[;\n]',content[8]).group(1)
        CDS_parent = CDS_parent.strip() # strip the '\r' and '\n'
        CDS_dict[CDS_parent].append(width)

for gene, txs in gene_dict.items():
    tmp = 0
    for tx in txs:
        tx_len = sum(CDS_dict[tx])
        if tx_len > tmp:
            lst_tx = tx
            tmp = tx_len
    tx_chrom = tx_pos_dict[lst_tx][0]
    tx_start = tx_pos_dict[lst_tx][1]
    tx_end   = tx_pos_dict[lst_tx][2]
    tx_strand = tx_pos_dict[lst_tx][3]
    print("{gene}\t{tx}\t{chrom}\t{start}\t{end}\t{strand}".format(gene=gene,tx=lst_tx,chrom=tx_chrom,start=tx_start,end=tx_end, strand=tx_strand))