#!/usr/local/bin/perl 

###############################################################################
# Perl script
# Author C.Mathe 21/10/98, 
# this script extract data (regions) from a GeneMark output file and write them in a standard file format
###############################################################################



($dir,$name_script)= ($0 =~/(.+\/)*(.+\.pl)$/);
$need=$dir."util.pl";
require "$need";

#require "/home/camat/BIOCOMP/Perl/util.pl";

sub calcRF{ #calculation of the frame in the special case of GeneMark.
    local($GM_F,$strand)=@_;
    local($provF,$Frame);
    if ($strand eq '-'){
	$provF=4+$Ltot%3-$GM_F;
	if ($provF==5){$Frame=2;}
	else {if ($provF==4) {$Frame=1;}
	      else {$Frame=$provF;}}
    }
    else {$Frame=$GM_F;}
    return($Frame)
}
##########

sub not_found   #routine to check if an exon was already read (GeneMark.lst often give redondant results)
{
    local ($nb, $i, @tab);
    @tab = @{$_[4]};
    $nb = @tab;
    for ($i=0;$i<$nb;$i++)
    {
	if (($tab[$i]{'Lend'} == $_[0])&&($tab[$i]{'Rend'} == $_[1])&&($tab[$i]{'Frame'} == $_[2])&&($tab[$i]{'strand'} == $_[3]))
	{return 0;
	}
    }
    return 1;
}

##########

sub readGM{
    open(GM,"<@_")||die "Unable to open @_.";
                             # open each GeneMark file
    @Exons_list=("");
    pop @Exons_list;
    $nb_exons=0;
    while(<GM>)
    {
	$type=$ph="";
	@line=split;
	if (/^List of Regions of interest/) # look for this line in the GeneMark output.
	{$etat=1;}
	if ($etat==1){
	    if (($#line==4)&&($line[3] eq 'fr')) # read lines with boudaries, frame, strand, proba. 
	    {
		($Lend,$Rend,$strand,$fr,$GF)=@line;
		if ($strand eq 'direct') {$str="+";}
		elsif ($strand eq 'complement') {$str="-";}
		#$Lend=$Lend+3; #delete Stop at begining of regions
		#$Rend=$Rend-3; #delete Stop at end of regions
		($lg,$A,$D)=&calcAD($Lend,$Rend,$str);
		        # note : the frame calculated here is not the good one.
		$F=&calcRF($GF,$str); # calculate the good frame.
		if (&not_found($Lend, $Rend, $F,$str, \@Exons_list)) # create exons list to avoid writing several time the same exon (GeneMark output often gives more than once the same exon !)
		{
		        $Exons_list[$nb_exons]{'Lend'}=$Lend;
			$Exons_list[$nb_exons]{'Rend'}=$Rend;
			$Exons_list[$nb_exons]{'strand'}=$str;
			$Exons_list[$nb_exons]{'Frame'}=$F;
			&EcrireST($seq,$type,$str,$Lend,$Rend,$lg,$ph,$F,$A,$D,$proba);
			$nb_exons++; 
		    }
	    }   
	}
    }
    close(GM);
    $etat=0;
}
##########

sub readLS{
open(LS,"< $liste") || die "Unable to open $liste.\n";
$ST=&openST("genemarkregion");

while(<LS>)
# read the file containing all the GeneMark output files
{
    $count++;
    ($seq,$Ltot)=split;   # read each sequence name and each sequence length
    if (($seq eq "")||($Ltot eq "")) # check if the list is correctly formatted
    {
	&usage("GeneMark");
	die "!!! Incorrect input list at line $count!!!\n";}
    push(@list_seq,$seq);
   @matchlist = glob "${seq}*.tfa.lst";  # look for a GeneMark file ending with .tfa.lst.
    if ($#matchlist>0) {die "Problem! several files exist : @matchlist."}
    $file = $matchlist[0];
    &readGM($file);
    print ST "\n";
}
close LS;
}
##########

sub createST{
@data=&Start("GeneMark");
if ($data[1]==0) { # user chose a list "L"
    $liste=$data[0];
    &readLS;
    foreach $seqname (@list_seq){
	$seq=$seqname;
    }
    print "output file: $standart\n";
}
else { # user tape "S" for a single GeneMark file
    $Ltot=$data[1];
    $file=$data[0];
    ($seq) = ( $file =~ /(seq\d{2})/ );
    $file2=$seq."gmregion.txtST";
    open(ST,">$file2");
    print ST "Contig\tType\tStrand\tLend\tRend\tlength\tPhase\tFrame\tAcceptor\tDonor\tProba\n";
    &readGM($file);
    print ST "\n";
}
close ST;
}
##########

#main:
$threshold=$ARGV[0];
&createST;













