#!/usr/local/bin/perl -w

# compute the longest stretch for a sequence:
# number of maximum nucleotides that predicted in correct frame
# and belongs to contigous real exons

# take as input a ST real data file and a an output of cmpexons.pl

sub debug
  {
    if ($_[0] == 5 || $_[0] == 6)
      {
	print STDERR "$_[1]\n";
      }
  }

#============================================================

sub max
  {
    return $_[0] > $_[1] ? $_[0] : $_[1];
  }

#============================================================\

if ($#ARGV != 1)
  {
    die "need param1: ST real data, param2: output of cmpexons.pl";
  }

# read ST real data
open(DATA, $ARGV[0]) || die "can't open $ARGV[0]";
while (<DATA>)
  {
    if (($seq_id, $S) = (/^seq(\d+)\s+.{0,4}\s+([+-])\s+/))
      {
	$strand{int($seq_id)} = $S;
      }
  }
close(DATA);

# read output of cmpexons.pl
open(DATA,$ARGV[1]) || die "can't open $ARGV[0]";
while (<DATA>)
  {
    if (($seq_id, $Rex, $RL, $RR, $PL, $PR, $TPf) = 
	(/^seq(\d+)\s+(\d+)\s+\[\s*(\d+),\s*(\d+)\]\s+\d+\s+\d+\s+\[\s*(\d+),\s*(\d+)\]\s+(\d+)/))
      {
	$seq_id = int($seq_id);
	if (exists($pred{$seq_id}{$Rex}))
	  {
	    print STDERR "seq $seq_id: exon $Rex is predicted more than one time\n";
	  }
	$pred{$seq_id}{$Rex}{'Rbegin'} = $strand{$seq_id} eq '+' ? $RL : $RR;
	$pred{$seq_id}{$Rex}{'Rend'} = $strand{$seq_id} eq '+' ? $RR : $RL;
	$pred{$seq_id}{$Rex}{'Pbegin'} = $strand{$seq_id} eq '+' ? $PL : $PR;
	$pred{$seq_id}{$Rex}{'Pend'} = $strand{$seq_id} eq '+' ? $PR : $PL;
	$pred{$seq_id}{$Rex}{'TPf'} = $TPf;
      }
  }
close(DATA);

print "Contig\tlg stretch\n";

@sequences = sort {$a <=> $b} keys %pred;
foreach $s (@sequences)
  {
    if ($strand{$s} eq '+')
      {
	@exons = sort {$pred{$s}{$a}{'Rbegin'} <=> $pred{$s}{$b}{'Rbegin'}} keys %{$pred{$s}};
      }
    else
      {
	@exons = sort {$pred{$s}{$b}{'Rbegin'} <=> $pred{$s}{$a}{'Rbegin'}} keys %{$pred{$s}};
      }
    
    # first predicted exon
    $prev_ex = shift(@exons);
    if (!exists($pred{$s}{$prev_ex}{'TPf'}))
      {
	print "pred{$s}{$prev_ex}{'TPf'} not defined\n";
	next;
      }
    $prev_lgs = $pred{$s}{$prev_ex}{'TPf'};
    $lgs{$s} = $prev_lgs;
    
    # following exons
    foreach $e (@exons)
      {
	if (abs($prev_ex - $e) == 1 &&
	    $pred{$s}{$prev_ex}{'Rend'} == $pred{$s}{$prev_ex}{'Pend'} &&
	    $pred{$s}{$e}{'Rbegin'} == $pred{$s}{$e}{'Pbegin'})
	  {
	    # current prediction can be concatenated with previous one
	    # because (1) it matches two consecutive exons
	    # (2) ending borders of both previous real and predicted exons are equal
	    # (3) begining borders of both current real and predicted exons are equal
	    $prev_lgs += $pred{$s}{$e}{'TPf'};
	    $lgs{$s} = &max($lgs{$s}, $prev_lgs);
	  }
	else
	  {
	    # current prediction begins a new stretch
	    $prev_lgs = $pred{$s}{$e}{'TPf'};
	    $lgs{$s} = &max($lgs{$s}, $prev_lgs);
	  }
	
	$prev_ex = $e;
      }
    printf "seq%03d\t%d\n", $s, $lgs{$s};
  }

