#!/usr/local/bin/perl -w

# cmpseqPi.pl

# read as STDIN output of cmpexons.pl (eventually filtered)
# and as parameter a ST real data file and a prediction file
# compute per sequence:
# 1) Contig: id of sequence
# 2) #Ex: number of exons
# 3) #Pred: number of predicted exons
# 4) #Correct: number of exons predicted with good borders (%TP=100 && %FP=0)
# 5) sCorrect: nb bases in exons predicted with good borders 
# 6) #Partial: number of exons predicted with wrong borders (%TP!=100 || %FP!=0)
# 7) sPartial: nb base in exons predicted with wrong borders
# 8) #Wrong: number of predicted exons which match no real exons
# 9) sWrong: nb bases in predicted exons which match no real exons
#10) #CompMiss: number of exons not predicted at all
#11) sCompMiss: nb bases in exons not predicted at all
#12) #split: number of splits
#13) #ExR split: number of Real exons that have been split
#14) #concat: number of concat
#15) #ExR concat: number of Real exons that have been concatenated

if ($#ARGV != 1)
  {
    die "need ST real data file and prediction file";
  }

# read ST real data file and compute SumLg
open(RDATA, $ARGV[0]) || die "can't open $ARGV[0]";
$prevSeq=0;
while (<RDATA>)
  {
    if (($seq_id, $Length) = (/^seq(\d+)\s+.{4}\s+[+-]\s+\d+\s+\d+\s+(\d+)/))
      {
	$seq_id = int($seq_id);
	if ($seq_id != $prevSeq)
	  {
	    $seq{$seq_id}{'#Ex'} = 0;
	    $seq{$seq_id}{'sLg'} = 0;
	    $seq{$seq_id}{'#Pred'} = 0;
	    $seq{$seq_id}{'#Correct'} = 0;
	    $seq{$seq_id}{'sCorrect'} = 0;
	    $seq{$seq_id}{'#Partial'} = 0;
	    $seq{$seq_id}{'sPartial'} = 0;
	    $seq{$seq_id}{'#Wrong'} = 0;
	    $seq{$seq_id}{'sWrong'} = 0;
	    $seq{$seq_id}{'#CompMiss'} = 0;
	    $seq{$seq_id}{'sCompMiss'} = 0;
	    $seq{$seq_id}{'split'} = 0;
	    $seq{$seq_id}{'concat'} = 0;
	    $prevSeq = $seq_id;
	  }
	$seq{$seq_id}{'sLg'} += $Length;
	$n = $seq{$seq_id}{'#Ex'}++;
	$seq{$seq_id}{'Length'}{$n} = $Length;
      }
  }
close(RDATA);

# read prediction file
open(PDATA, $ARGV[1]) || die "can't open $ARGV[1]";
$prevSeq=0;
while (<PDATA>)
  {
    if (($seq_id, $Lend, $Rend, $Length) = (/^seq(\d+)\s+.{0,4}\s+[+-]?\s+(\d+)\s+(\d+)\s+(\d+)/))
      {
	$seq_id = int($seq_id);
	if ($seq_id != $prevSeq)
	  {
	    $nbEx = 0;
	    $prevSeq = $seq_id;
	  }
	$pred{$seq_id}{$nbEx}{'Lend'} = $Lend;
	$pred{$seq_id}{$nbEx}{'Rend'} = $Rend;
	$pred{$seq_id}{$nbEx}{'Length'} = $Length;
	$nbEx++;
	$seq{$seq_id}{'#Pred'} = $nbEx;
      }
  }
close(PDATA);

# read output of cmpexons.pl
while (<STDIN>)
  {
    if (($seq_id, $R, $P, $TP, $pTP, $FP) =
	(/^seq(\d+)\s+(\d+)\s+\[\s*\d+,\s*\d+\]\s+\d+\s+(\d+)\s+\[\s*\d+,\s*\d+\]\s+(\d+)\s+(\d+\.?\d*)\s+(\d+)/))
      {
	$seq_id = int($seq_id);
	
	if ($pTP == 100 && $FP == 0)
	  { # compute number of correct exons
	    $seq{$seq_id}{'#Correct'}++;
	    $seq{$seq_id}{'sCorrect'} += $pred{$seq_id}{$P}{'Length'};
	  }
	else
	  { # compute number of partial exons
	    $seq{$seq_id}{'#Partial'}++;
	    $seq{$seq_id}{'sPartial'} += $TP;
	  }
	
	# prepare computation of #split and #concat
	push(@{$spt{$seq_id}{$R}},$P);
	push(@{$cnt{$seq_id}{$P}},$R);
      }
  }

# compute output
print "Contig\t";
print "Nb Ex\t";
print "sLg\t";
print "#Pred\t";
# Correct
print "#Correct\t";
print "sCorrect\t";
# Partial
print "#Partial\t";
print "sPartial\t";
# Wrong
print "#Wrong\t";
print "sWrong\t";
# CompMiss
print "#CompMiss\t";
print "sCompMiss\t";
# split and # exons split 
print "#split\t";
print "#ExR split\t";
# concat and # exons concatenated 
print "#concat\t";
print "#ExR concat\n";

@sequences = sort {$a <=> $b} keys %seq;
foreach $s (@sequences)
  {
    printf "seq%03d\t%d\t%d\t%d\t", $s, $seq{$s}{'#Ex'}, $seq{$s}{'sLg'}, $seq{$s}{'#Pred'};
    
    # Correct
    printf "%d\t%d\t", $seq{$s}{'#Correct'}, $seq{$s}{'sCorrect'};
    
    # Partial
    printf "%d\t%d\t", $seq{$s}{'#Partial'}, $seq{$s}{'sPartial'};
    
    # Wrong
    if (exists($pred{$s}))
      {
	@pexons = keys %{$pred{$s}}; # all predicted exons
	foreach $e (@pexons)
	  {
	    if (!exists($cnt{$s}{$e})) # predicted exons non overlapping real exon
	      {
		$seq{$s}{'#Wrong'}++;
		$seq{$s}{'sWrong'} += $pred{$s}{$e}{'Length'};
	      }
	  }
      }
    printf "%d\t%d\t", $seq{$s}{'#Wrong'}, $seq{$s}{'sWrong'};
    
    # Comp Missing
    @rexons = keys %{$seq{$s}{'Length'}};
    foreach $e (@rexons)
      {
	if (!exists($spt{$s}{$e})) # real exon which has no overlapping prediction
	  {
	    $seq{$s}{'#CompMiss'}++;
	    $seq{$s}{'sCompMiss'} += $seq{$s}{'Length'}{$e};
	  }
      }
    printf "%d\t%d\t", $seq{$s}{'#CompMiss'}, $seq{$s}{'sCompMiss'};
    
    # compute number of split
    $nb=0;
    $nbEx=0;
    foreach $r (keys %{$spt{$s}})
      {
	@pred = @{$spt{$s}{$r}};
	$nb += $#pred;
	if ($#pred != 0) {$nbEx++;}
      }
    printf "%d\t%d\t", $nb, $nbEx;
    
    # compute number of concat
    $nb=0;
    $nbEx=0;
    foreach $p (keys %{$cnt{$s}})
      {
	@pred = @{$cnt{$s}{$p}};
	$nb += $#pred;
	if ($#pred != 0) {$nbEx += $#pred + 1;}
      }
    printf "%d\t%d\n", $nb, $nbEx;
  }

