#!/usr/local/bin/perl -w

# read as STDIN output of cmpexons.pl (eventually filtered)
# and as parameter a ST real data file
# FP, FN and TN  files
# compute per sequence the length of coding region = SumLg
# length of predicted coding region in the good frame = SumTP
# length of coding region not predicted in good frame = SumLg - SumTP = SumFN
# ration SumFN / SumLg

if ($#ARGV != 3)
  {
    die "need ST real data file, FP FN and TN files as parameters";
  }

# read ST real data file and compute SumLg
open(RDATA, $ARGV[0]) || die "can't open $ARGV[0]";
$prevSeq=0;
while (<RDATA>)
  {
    if (($seq_id, $Length) = (/^seq(\d+)\s+.{4}\s+[+-]\s+\d+\s+\d+\s+(\d+)/))
      {
	$seq_id = int($seq_id);
	if ($seq_id != $prevSeq)
	  {
	    $sLg{$seq_id} = 0;
	    $Rexons{$seq_id} = 0;
	    $prevSeq = $seq_id;
	  }
	$Rexons{$seq_id}++;
	$sLg{$seq_id} += $Length;
      }
  }
close(RDATA);

# read FP data
open(FPDATA, $ARGV[1]) || die "can't open $ARGV[1]";
while (<FPDATA>)
  {
    if (($seq_id, $fp) = (/^seq(\d+)\s+(\d+)/))
      {
	$FP{int($seq_id)} = $fp;
      }
  }
close(FPDATA);

# read FN data
open(FNDATA, $ARGV[2]) || die "can't open $ARGV[2]";
while (<FNDATA>)
  {
    if (($seq_id, $fn) = (/^seq(\d+)\s+(\d+)/))
      {
	$FN{int($seq_id)} = $fn;
      }
  }
close(FNDATA);

# read TN data
open(TNDATA, $ARGV[3]) || die "can't open $ARGV[3]";
while (<TNDATA>)
  {
    if (($seq_id, $tn) = (/^seq(\d+)\s+(\d+)/))
      {
	$TN{int($seq_id)} = $tn;
      }
  }
close(TNDATA);

# read output of cmpexons.pl
$prevSeq=0;
while (<STDIN>)
  {
    if (($seq_id, $R, $P, $TP) =
	(/^seq(\d+)\s+(\d+)\s+\[\s*\d+,\s*\d+\]\s+\d+\s+(\d+)\s+\[\s*\d+,\s*\d+\]\s+(\d+)/))
      {
	$seq_id = int($seq_id);
	if ($prevSeq != $seq_id)
	  {
	    $sTP{$seq_id} = 0;
	    $prevSeq = $seq_id;
	  }
	$sTP{$seq_id} += $TP;
	push(@{$spt{$seq_id}{$R}},$P);
	push(@{$cnt{$seq_id}{$P}},$R);
      }
  }

# compute output
print "Contig\t";
print "sLg\t";
print "Nb Ex\t";
print "Nb In\t";
# TP
print "#TPf\t";
print "sTPf\t";
print "sTPf/sLg\t";
# FP
print "#FPf\t";
# TN
print "#TN\t";
# FN
print "#FNf\t";
print "sFNf\t";
print "sFNf/sLg\t";
# split and # exons split 
print "#split\t";
print "#ExR split\t";
# concat and # exons concatenated 
print "#concat\t";
print "#ExR concat\n";

@sequences = sort {$a <=> $b} keys %sLg;
foreach $s (@sequences)
  {
    printf "seq%03d\t%d\t%d\t%d\t", $s, $sLg{$s}, $Rexons{$s}, $Rexons{$s} + 1;
    
    # number of FPf
    $nFPf = exists($FP{$s}) ? $FP{$s} : 0;
    
    # number of FNf
    $nFNf = exists($FN{$s}) ? $FN{$s} : 0;
    
    # sum of TP
    if (!exists($sTP{$s}))
      {
	$sTP{$s}=0;
      }
    
    # number of TN
    if (!exists($TN{$s}))
      {
	$TN{$s}=0;
      }
    
    # sum of FN
    $sFN = $sLg{$s} - $sTP{$s};
    
    # number of TP
    @pred = keys %{$spt{$s}};
    $nTPf = 1 + $#pred;
    
    # TP
    print "$nTPf\t";
    print "$sTP{$s}\t";
    printf "%.2f\t", $sTP{$s} / $sLg{$s} * 100;
    # FP
    print "$nFPf\t";
    # TN
    print "$TN{$s}\t";
    # FN
    print "$nFNf\t";
    print "$sFN\t";
    printf "%.2f\t", $sFN / $sLg{$s} * 100;
    
    # compute number of split
    $nb=0;
    $nbEx=0;
    foreach $r (keys %{$spt{$s}})
      {
	@pred = @{$spt{$s}{$r}};
	$nb += $#pred;
	if ($#pred != 0) {$nbEx++;}
      }
    printf "%d\t%d\t", $nb, $nbEx;
    
    # compute number of concat
    $nb=0;
    $nbEx=0;
    foreach $p (keys %{$cnt{$s}})
      {
	@pred = @{$cnt{$s}{$p}};
	$nb += $#pred;
	if ($#pred != 0) {$nbEx += $#pred + 1;}
      }
    printf "%d\t%d\n", $nb, $nbEx;
  }

