#!/usr/local/bin/perl -w

# take 3 files as parameters
# a standard real data file
# a standard prediction file
# a data list file

if ($#ARGV != 2)
  {
    die "usage: param1: ST real data file, param2: ST prediction file, param3: data list";
  }

# read sequence length
open(LDATA, $ARGV[2]) || die "can't open $ARGV[2]";
while (<LDATA>)
  {
    if (($seq_id, $Length) = (/^seq(\d+)\s+(\d+)/))
      {
	$seq_length{int($seq_id)} = $Length;
      }
  }
close(LDATA);

# read Standard reql data file
open (RDATA, $ARGV[0]) || die "can't open $ARGV[0]";
$prevSeq = 0;
while (<RDATA>)
  {
    if (($seq_id, $Lend, $Rend, $Length, $Frame) =
	(/^seq(\d+)\s+.{0,4}\s+[+-]\s+(\d+)\s+(\d+)\s+(\d+)\s+[+-]?\d?\s+([+-]\d)/))
      {
	$seq_id = int($seq_id);
	if ($prevSeq != $seq_id)
	  {
	    $exonNb = 0;
	    $prevSeq = $seq_id;
	  }
	$rdata{$seq_id}{$exonNb}{'Lend'} = $Lend;
	$rdata{$seq_id}{$exonNb}{'Rend'} = $Rend;
	$rdata{$seq_id}{$exonNb}{'Length'} = $Length;
	$rdata{$seq_id}{$exonNb}{'Frame'} = int($Frame);
	$exonNb++;
      }
  }
close(RDATA);

# read prediction
open (PDATA, $ARGV[1]) || die "can't open $ARGV[1]";
$prevSeq = 0;
while (<PDATA>)
  {
    if (($seq_id, $Lend, $Rend, $Length) = # , $Frame
	(/^seq(\d+)\s+.{0,4}\s+[+-]\s+(\d+)\s+(\d+)\s+(\d+)/)) #\s+[+-]?\d?\s+([+-]\d)
      {
	$seq_id = int($seq_id);
	if ($prevSeq != $seq_id)
	  {
	    $exonNb = 0;
	    $prevSeq = $seq_id;
	  }
	$pdata{$seq_id}{$exonNb}{'Lend'} = $Lend;
	$pdata{$seq_id}{$exonNb}{'Rend'} = $Rend;
	$pdata{$seq_id}{$exonNb}{'Length'} = $Length;
	# $pdata{$seq_id}{$exonNb}{'Frame'} = int($Frame);
	$exonNb++;
      }
  }
close(PDATA);

# compute TP FP TN FN
print "Contig\tLength\tTP\tFP\tTN\tFN\tSn\tSp\n";
@sequences = sort {$a <=> $b} keys %seq_length;
foreach $s (@sequences)
  {
    print STDERR ".";
    $real = 'I' x (1+$seq_length{$s});
    $pred = $real;
    $TP = $FP = $TN = $FN = 0;
    
    if (!exists($rdata{$s}))
      {
	print STDERR "sequence $s not in $ARGV[0]\n";
	next;
      }
    @Rexons = sort {$rdata{$s}{$a}{'Lend'} <=> $rdata{$s}{$b}{'Lend'}} keys %{$rdata{$s}};
    foreach $e (@Rexons)
      {
	$l = $rdata{$s}{$e}{'Lend'};
	$r = $rdata{$s}{$e}{'Rend'};
	$c = 'C' x ($r - $l + 1);
	$real = substr($real, 0, $l) . $c . substr($real, $r + 1);
      }
    
    if (!exists($pdata{$s}))
      {
	print STDERR "sequence $s not in $ARGV[1]\n";
      }
    else
      {
	@Pexons = sort {$pdata{$s}{$a}{'Lend'} <=> $pdata{$s}{$b}{'Lend'}} keys %{$pdata{$s}};
	foreach $e (@Pexons)
	  {
	    $l = $pdata{$s}{$e}{'Lend'};
	    $r = $pdata{$s}{$e}{'Rend'};
	    $c = 'C' x ($r - $l + 1);
	    $pred = substr($pred, 0, $l) . $c . substr($pred, $r + 1);
	  }
      }
    
    for ($i=1; $i <= $seq_length{$s}; $i++)
      {
	$rb = substr($real, $i, 1);
	$pb = substr($pred, $i, 1);
	if ($rb eq 'C')
	  {
	    if ($pb eq 'C')
	      {
		$TP++;
	      }
	    else
	      {
		$FN++;
	      }
	  }
	else
	  {
	    if ($pb eq 'C')
	      {
		$FP++;
	      }
	    else
	      {
		$TN++;
	      }
	  }
      }
    $Sn = $TP + $FN;
    if ($Sn == 0)
      {
	$Sn = "Nan";
	$Snf = "%s";
      }
    else
      {
	$Sn = $TP / $Sn;
	$Snf = "%.2f";
      }
    
    $Sp = $TP + $FP;
    if ($Sp == 0)
      {
	$Sp = "Nan";
	$Spf = "%s";
      }
    else
      {
	$Sp = $TP / $Sp;
	$Spf = "%.2f";
      }
    printf "seq%03d\t%d\t%d\t%d\t%d\t%d\t$Snf\t$Spf\n", $s, $seq_length{$s}, $TP, $FP, $TN, $FN, $Sn, $Sp;
  }
print STDERR "\n";


