#!/usr/local/bin/perl 

###############################################################################
# Perl script
# Author C.Mathe 21/10/98
# this script extract data from a GeneMark output file and write the exon listing in a standard file format
###############################################################################


($dir,$name_script)= ($0 =~/(.+\/)*(.+\.pl)$/);
$need=$dir."util.pl";
require "$need";

#require "/home/camat/BIOCOMP/Perl/util.pl";

sub calcRF{ #calculation of the frame in the special case of GeneMark.
    local($GM_f,$strand)=@_;
    local($provF,$Frame);    
    if ($strand eq '-'){
	$provF = 4 + $Ltot%3 - $GM_f;
	if ($provF == 5){$Frame = 2;}
	else {if ($provF == 4) {$Frame = 1;}
	      else {$Frame = $provF;}}
    }
    else {$Frame = $GM_f;}
    return($Frame);
}


sub not_found   #routine to check if an exon was already read (GeneMark.lst often give redondant results)
{
    local ($nb, $i, @tab);
    @tab = @{$_[4]};
    $nb = @tab;
    for ($i=0;$i<$nb;$i++)
    {
	if (($tab[$i]{'Lend'} == $_[0])&&($tab[$i]{'Rend'} == $_[1])&&($tab[$i]{'Frame'} == $_[2])&&($tab[$i]{'strand'} == $_[3]))
	{return 0;
	}
    }
    return 1;
}

sub redond #routine to stock Acceptor and Donor sites, so that they appear only once in the standart file.
{
local($nb,$i,@allsite);
      @allsite=@{$_[1]};
      $nb=@allsite;
    for ($i=0;$i<$nb;$i++){
	if ($allsite[$i]==$_[0]) {return 1;}}
    return 0;
}

sub readGM{
    open(GM,"<@_")||die "Unable to open @_.";
                             #open each GeneMark file
    @Exons_list=("");        #initialisation of variables
    @acceptor_list=("");
    pop @acceptor_list;      #remove last element of the array (to be sure it is empty)
    @donor_list=("");
    pop @donor_list; 
    $nbA=$nbD=0;
    pop @Exons_list;
    $nb_exons=0;
    while(<GM>)
    {
	$type=$ph="";
	@line=split;
	if (/^List of Protein-Coding Exons/) #look for this line in the GeneMark output.
	{$etat=1;}
	if ($etat==1){
	    if (($#line==5)&&($line[3] eq 'fr')) #read lines with boudaries, frame, strand, proba. 
	    {
		($Lend,$Rend,$strand,$fr,$GF,$proba)=@line;
		if ($strand eq 'direct') {$str="+";}
		elsif ($strand eq 'complement') {$str="-";}
		($lg,$A,$D)=&calcAD($Lend,$Rend,$str);
		        #note : the frame calculated here is not the good one.
		$F=&calcRF($GF,$str); #calculate the good frame.
		if (&redond($A,\@acceptor_list)) {$A="";}
		else {$acceptor_list[$nbA]=$A;
		      $nbA++;}
		if (&redond($D,\@donor_list)) {$D="";}
		else {$donor_list[$nbD]=$D;
		      $nbD++;}
		if (&not_found($Lend, $Rend, $F,$str, \@Exons_list))
		{
		        $Exons_list[$nb_exons]{'Lend'}=$Lend;
			$Exons_list[$nb_exons]{'Rend'}=$Rend;
			$Exons_list[$nb_exons]{'strand'}=$str;
			$Exons_list[$nb_exons]{'Frame'}=$F;
			$nb_exons++; 
			if ($proba>=$threshold){
			    &EcrireST($seq,$type,$str,$Lend,$Rend,$lg,$ph,$F,$A,$D,$proba);}
		}
	    }
	    	
	elsif ($#line==2) { #read lines without the frame and strand information.
		($Lend,$Rend,$proba)=@line;
		($lg,$A,$D)=&calcAD($Lend,$Rend,$str);
		if (&redond($A,\@acceptor_list)) {$A="";}
		else {$acceptor_list[$nbA]=$A;
		      $nbA++;}
		if (&redond($D,\@donor_list)) {$D="";}
		else {$donor_list[$nbD]=$D;
		      $nbD++;}
		if (&not_found($Lend, $Rend, $F,$str, \@Exons_list))
		{    $Exons_list[$nb_exons]{'Lend'}=$Lend;
		     $Exons_list[$nb_exons]{'Rend'}=$Rend;
		     $Exons_list[$nb_exons]{'strand'}=$str;
		     $Exons_list[$nb_exons]{'Frame'}=$F;
		     $nb_exons++; 
		     if ($proba>=$threshold){
			 &EcrireST($seq,$type,$str,$Lend,$Rend,$lg,$ph,$F,$A,$D,$proba);}
		 }
	    }
	}
    }
    close(GM);
    $etat=0;
}

sub readLS{
open(LS,"< $liste") || die "Unable to open $liste.\n";
$ST=&openST("genemark");

while(<LS>)
#read the file containing all the GeneMark output files
{
    $count++;
    ($seq,$Ltot)=split;   # read each sequence name and each sequence length
    if (($seq eq "")||($Ltot eq "")) #check if the list is correctly formatted
    {
	&usage("GeneMark");
	die "!!! Incorrect input list at line $count!!!\n";}
    #$file=$seq . "gm.txt";
    @matchlist = glob "${seq}*.tfa.lst";
    if ($#matchlist>0) {die "Problem! several files exist : @matchlist."}
    $file = $matchlist[0];
    &readGM($file);
    print ST "\n";
}
close LS;
}


sub createST{
@data=&Start("GeneMark");
if ($data[1]==0) { 
    $liste=$data[0];
    &readLS;
    print "output file: $standart\n";
}
else 
{
    $Ltot=$data[1];
    $file=$data[0];
    ($seq) = ( $file =~ /(seq\d{2})/ );
    $file2=$seq."gm.txtST";
    open(ST,">$file2");
    print ST "Contig\tType\tStrand\tLend\tRend\tlength\tPhase\tFrame\tAcceptor\tDonor\tProba\n";
    &readGM($file);
    print "output file : $file2\n";
}
close ST;


}

$threshold=$ARGV[0];
&createST;












