#!/usr/local/bin/perl -w
# takes as first parameter path to nathalie real data file
# exported from Excel as TEXT standardized
# takes as second parameter the standardized output file of a prediction program
# WARNING!!! this file must NOT contain "concat" kind of sequences
# So clean it before with a "grep -v concat" before

# For each sequence in the real data file, compute gene begin (-300) end 
# gene end (+300)
# Given a sequence seq[i], a gene G[i]{j} of this sequence
# and its borders (+/- Marge=300) Aj=G[i]{j}{"begin"} and Bj=G[i]{j}{"end"}
# The purpose of the program is to split the predicted exons in numbered sets
# corresponding to real partition in genes (genes are numbered from 1 to n,
# considering sequences in order (seq01, seq02,...) and their genes in ascending
# left border

# If a predicted exon [a,b] fulfills the following rule
# a > Aj and b < Bj
# then this exon is associated to Gene j
# If one one the border of the exon is out of the gene border, the exon is cut
# and its new border set to the corresponding border of the gene 

# Because of distance between genes can be less than 600 bases (Marge*2),
# and prediction can overlap two real genes or more, an exon can be added 
# to two sets

# output files are generated:
# a correspondance.txt file which gives the new numbering of the sequences
# .../...
# seqi from a to b becomes seqi
# .../...

#============================================================

$Marge = 300;

# check parameters
if ($#ARGV != 1)
  {
    die "need 2 parameters: real data file, and standardized output from gene prediction program";
  }

open (REALDATA, $ARGV[0]) || die "can't open $ARGV[0]";
open (PREDICTION, $ARGV[1]) || die "can't open $ARGV[1]";

# read real data and store genes
$geneNb = 0;
while (<REALDATA>)
  {
    if (($seq_id, $type, $strand, $Lend, $Rend) = 
	($_ =~ /^seq(\d+)\s+(.{4})\s+([+-])\s+(\d+)\s+(\d+)/))
      {
	$seq_id = int($seq_id);
	
	if ($type eq "Sngl")
	  {
	    $seq{$seq_id}{$geneNb}{'begin'} = $Lend - $Marge;
	    $seq{$seq_id}{$geneNb}{'end'} = $Rend + $Marge;
	    $geneNb++;
	  }
	else
	  {
	    $beginLabel = $strand eq '+'?'Init':'Term';
	    $endLabel = $strand eq '+'?'Term':'Init';
	    
	    if ($type eq $beginLabel)
	      {
		$seq{$seq_id}{$geneNb}{'begin'} = $Lend - $Marge;
	      }
	    elsif ($type eq $endLabel)
	      {
		$seq{$seq_id}{$geneNb}{'end'} = $Rend + $Marge;
		$geneNb++;
	      }
	  }
      }
  }



close(REALDATA);

print STDERR "file ReadDataCut.txtST generated\n";

# read predictions

while (<PREDICTION>)
  {
    if (($seq_id) = ($_ =~ /^seq(\d+)/))
      {
	if (/\n$/)
	  {
	    chomp;
	  }
	push(@{$pred{int($seq_id)}}, $_);
      }
  }

close(PREDICTION);

# begin treatments
$geneNb=0;
print "Contig\tType\tS\tLend\tRend\tLength\tPhase\tFrame\tAc\tDo\tProba\n\n";
@sequences = sort {$a <=> $b} keys %seq;

if (!open(CORRESPONDANCE,">correspondance.txt"))
  {
    die "can't create correspondance.txt file";
  }

foreach $s (@sequences) # for each real sequence
  {
    
    @genes = sort {$seq{$s}{$a}{'begin'} <=> $seq{$s}{$b}{'begin'}} keys %{$seq{$s}};
    foreach $g (@genes) # for each gene os this sequence
      {
	$geneNb++;
	printf CORRESPONDANCE "seq%02d gene from %d to %d becomes seq%03d\n", $s, $seq{$s}{$g}{'begin'}, $seq{$s}{$g}{'end'}, $geneNb;
	if (!exists $pred{$s})
	  {
	    printf STDERR "sequence $s is not in prediction file\n";
	  }
	foreach $l (@{$pred{$s}}) # compare it to the prediction
	  {
	    ($before, $Lend, $Rend, $after) = 
	      ($l =~ /^seq\d+(.*[+-])\s+(\d+)\s+(\d+)(.*)/);
	    if ($Rend < $seq{$s}{$g}{'begin'} || $Lend > $seq{$s}{$g}{'end'})
	      {
		next; # exon out of gene region
	      }
	    if ($Lend < $seq{$s}{$g}{'begin'})
	      {
		$Lend = $seq{$s}{$g}{'begin'}; # overlaping exon, keep common part
	      }
	    if ($Rend > $seq{$s}{$g}{'end'})
	      {
		$Rend = $seq{$s}{$g}{'end'}; # overlaping exon, keep common part
	      }
	    printf "seq%03d%s\t%d\t%d%s\n", $geneNb, $before, $Lend, $Rend, $after;
	  }
	print "\n";
      }
  }

close(CORRESPONDANCE);

print STDERR "file correspondance.txt generated\n";


