#!/usr/local/bin/perl -w

# Reads on STDIN the (possibly filtered) output of cmpexons.pl.
# Takes as its only parameter a ST prediction file.
# Generates the number of FP per sequence, i.e. the total number of
# predicted exons minus the number of distinct predicted exons that overlap
# a real exon and are in frame in the output of cmpexons.pl.

# Exactly one command-line argument is required: the prediction file.
die "need a prediction file as parameter" unless @ARGV == 1;

# Read the prediction file and count the predicted exons of each sequence.
# Every line that starts with "seq<digits>" counts as one predicted exon for
# that sequence number; int() strips leading zeros, so "seq007" is keyed as 7.
# The totals are stored in the file-level hash %TpredExons.
#
# The 3-argument open with an explicit '<' mode treats the argument strictly
# as a file name (characters such as '>' or '|' in the name are not
# interpreted as an open mode), and $! reports why the open failed.
open(my $pdata, '<', $ARGV[0]) or die "can't open $ARGV[0]: $!";

while (my $line = <$pdata>)
  {
    next unless $line =~ /^seq(\d+)/;

    # Accumulate unconditionally: the total stays correct even when the
    # lines belonging to one sequence are not contiguous in the file.
    $TpredExons{int($1)}++;
  }

close($pdata);

# Scan the cmpexons.pl report arriving on STDIN.
# Only lines shaped like
#   seq<N> <int> [<int>, <int>] <int> <int> ...
# are used; everything else is skipped.  For each such line the sequence
# number and the last captured integer are stored as a two-level key in
# %predExons, so repeated (sequence, value) pairs collapse into one entry.
# NOTE(review): the last captured field presumably identifies the predicted
# exon -- confirm against the cmpexons.pl output format.
while (defined(my $line = <STDIN>))
  {
    next unless $line =~ /^seq(\d+)\s+\d+\s+\[\s*\d+,\s*\d+\]\s+\d+\s+(\d+)/;
    $predExons{int($1)}{$2} = "";
  }

# Print the result table: a header row, then one row per sequence found in
# the prediction file, in ascending numeric order.  The value printed is the
# sequence's total from %TpredExons minus the number of distinct entries
# recorded for it in %predExons (0 when the sequence has none).
print "Contig\t#FPf\n";

foreach my $id (sort {$a <=> $b} keys %TpredExons)
  {
    # exists() is tested first so that sequences without any recorded entry
    # are not autovivified in %predExons by the dereference.
    my $matched = exists($predExons{$id})
                ? scalar(keys %{$predExons{$id}})
                : 0;

    printf "seq%03d\t%d\n", $id, $TpredExons{$id} - $matched;
  }

