#!/usr/local/bin/perl


######################################################################
# script to compar two standart output files, one of exon prediction, 
# and the other of splice sites predictions
# author: C.Mathe, 06/05/99, modified 07/05/99, modified 4/06/99
#######################################################################


require "/home/camat/BIOCOMP/Perl/util.pl";


if($#ARGV!=1) {die "\nPerl script to compar two standart outputs.\n 
USAGE:\ncompar_pred.pl file1 file2
\nFile1 is an exon predictor and file2 is a splice site predictor. \nFile1 is taken as the *best* one, and its exon labels (Init,Intr,Ter,Sngl) are kept\n"}

open(PRED1, $ARGV[0]) || die "can't open $ARGV[0]";
open(PRED2, $ARGV[1]) || die "can't open $ARGV[1]";

# read exon predictions (file1)

$prevSeq = 0;
while (<PRED1>)
{
    if (($seq_id, $type,$strand,$Lend, $Rend, $Length, $Frame) =
	(/^seq(\d+)\s+(.{0,4})\s+([+-])\s+(\d+)\s+(\d+)\s+(\d+)\s+[+-]?\d?\s+([+-]\d)/))
    {
	$seq_id = int($seq_id);
	if ($prevSeq != $seq_id)
	{
	    $exonNb = 0;
	    $prevSeq = $seq_id;
	}
	$pdata1{$seq_id}{$exonNb}{'Lend'} = $Lend;
	$pdata1{$seq_id}{$exonNb}{'Rend'} = $Rend;
	$pdata1{$seq_id}{$exonNb}{'Length'} = $Length;
	$pdata1{$seq_id}{$exonNb}{'Frame'} = $Frame;
	$pdata1{$seq_id}{$exonNb}{'Type'} = $type;
	$pdata1{$seq_id}{$exonNb}{'Strand'} = $strand;
	$exonNb++;
    }
}
close(PRED1);

# read splice sites predictions (file2)
$prevSeq = 0;
$nb_glob_Ac_plus=$nb_glob_Ac_minus=0;
$nb_glob_Ac_plus=$nb_glob_Ac_minus=0;

while (<PRED2>)
{
    $line=$_;
    $Lend=$Rend="";
    if (($seq_id,$strand,$Lend,$Rend) =
	(/^seq(\d+)\s+([+-])\t(\d*)\t(\d*)\t\t[012]?\t[+-]?[123]?\t(\d*)\t(\d*)\t(\d\.)?\d{1,2}/))
   {
	$seq_id = int($seq_id);
	if ($prevSeq != $seq_id)
	{
	    $nb_spliceL=$nb_spliceR=0;
	    $prevSeq = $seq_id;
	}
	if ($Lend ne "") {$pdataL{$seq_id}{$nb_spliceL}{'Lend'} = $Lend;
			  $pdataL{$seq_id}{$nb_spliceL}{'Strand'}=$strand;
			  ($strand eq'+')? $nb_glob_Ac_plus++:$nb_glob_Do_minus++;
			  $pdataL{$seq_id}{$nb_spliceL}{'all'}=$line;
			  $nb_spliceL++;
		      }
	elsif ($Rend ne "") {$pdataR{$seq_id}{$nb_spliceR}{'Rend'} = $Rend;
			     $pdataR{$seq_id}{$nb_spliceR}{'Strand'}=$strand;
			     ($strand eq'+')? $nb_glob_Do_plus++:$nb_glob_Ac_minus++;
			     $pdataR{$seq_id}{$nb_spliceR}{'all'}=$line;
			     $nb_spliceR++;
			 }
    }
}
close(PRED2);

# build consensus predictions
print "Contig\tType\tS\tLend\tRend\tLength\tPhase\tFrame\tAc\tDo\tProba\n";
$lastseq=0;
$nb_Do_plus=$nb_Do_minus=0;
$nb_Ac_plus=$nb_Ac_minus=0;

@sequences = sort {$a <=> $b} keys %pdata1;
foreach $s (@sequences) 
{
    @Pexons1 = sort {$pdata1{$s}{$a}{'Lend'} <=> $pdata1{$s}{$b}{'Lend'}} keys %{$pdata1{$s}};
    #sort in ascending order for Lend
    if ((!exists($pdataL{$s}))&&((!exists($pdataR{$s}))))
    {
	print STDERR "sequence $s not in prediction\n";
	next;
    }
    @PspliceL = sort {$pdataL{$s}{$a}{'Lend'}<=> $pdataL{$s}{$b}{'Lend'}} keys %{$pdataL{$s}};
    @PspliceR = sort {$pdataR{$s}{$a}{'Rend'} <=> $pdataR{$s}{$b}{'Rend'}} keys %{$pdataR{$s}};
    foreach $r (@Pexons1) # loop on prediction1 and compare to prediction2
    {
	$splice_site=0;
	foreach $p (@PspliceL) # loop then on predictionL
	{
	    # check if prediction1 = prediction2
	    if (($pdata1{$s}{$r}{'Lend'} == $pdataL{$s}{$p}{'Lend'}) && ($pdata1{$s}{$r}{'Strand'} eq $pdataL{$s}{$p}{'Strand'}))
	    {
		if (($pdata1{$s}{$r}{'Type'} eq 'Intr')||(($pdata1{$s}{$r}{'Type'} eq 'Init')&&($pdata1{$s}{$r}{'Strand'} eq '-'))||(($pdata1{$s}{$r}{'Type'} eq 'Term')&&($pdata1{$s}{$r}{'Strand'} eq '+'))) 
		{
		    $splice_site++;
		    ($pdata1{$s}{$r}{'Strand'} eq '+')? $nb_Ac_plus++:$nb_Do_minus++;
		    if ($lastseq != $s) {print "\n";
					 $lastseq=$s; }
		    print $pdataL{$s}{$p}{'all'};
		}
	    }
	}
	foreach $q (@PspliceR) # loop then on predictionR
	  {
	      if (($pdata1{$s}{$r}{'Rend'} == $pdataR{$s}{$q}{'Rend'})&&($pdata1{$s}{$r}{'Strand'} eq $pdataR{$s}{$q}{'Strand'}))
	      {
		  if (($pdata1{$s}{$r}{'Type'} eq 'Intr')||(($pdata1{$s}{$r}{'Type'} eq 'Init')&&($pdata1{$s}{$r}{'Strand'} eq '+'))||(($pdata1{$s}{$r}{'Type'} eq 'Term')&&($pdata1{$s}{$r}{'Strand'} eq '-'))) 
		  {
		      ($pdata1{$s}{$r}{'Strand'} eq '+')? $nb_Do_plus++:$nb_Ac_minus++;
		      $splice_site++;
		      if ($lastseq != $s) {print "\n";
					   $lastseq=$s; }
		      print $pdataR{$s}{$q}{'all'};
		  }
	      }
	  }
    }
}

$FP_Ac_p=$nb_glob_Ac_plus-$nb_Ac_plus;
$FP_Ac_m=$nb_glob_Ac_minus-$nb_Ac_minus;
$FP_Do_p=$nb_glob_Do_plus-$nb_Do_plus;
$FP_Do_m=$nb_glob_Do_minus-$nb_Do_minus;

print stderr "\tAc+\tAc-\tDo+\tDo-
TP\t$nb_Ac_plus\t$nb_Ac_minus\t$nb_Do_plus\t$nb_Do_minus
FP\t$FP_Ac_p\t$FP_Ac_m\t$FP_Do_p\t$FP_Do_m\n",
