#!/usr/local/bin/perl 

###############################################################################
# Perl script
# Author C.Mathe 07/10/98
# this script extract data from a GeneMark.hmm output file and write them in a standard file format
# file of results : dataset_GMhmmST
###############################################################################

($dir,$name_script)= ($0 =~/(.+\/)*(.+\.pl)$/);
$need=$dir."util.pl";
require "$need";

#require "/home/camat/BIOCOMP/Perl/util.pl";

sub calcGMF { #calculate Frame for GM.hmm, taking into account indication of exon beg and end in codon position 
    local ($rest,$L,$R,$strand)=@_;
    local($newR,$newL);
    if ($strand eq '+') {
	$newL = $L - $rest + 1;
	$F=&calcF($newL,$R,$strand);}
    else {$newR = $R + $rest -1;
	  $F=&calcF($L,$newR,$strand);}
    return $F;
}

sub readGMHMM{                      #read a GeneMark.hmm output file.
    open(GM,"< $file") || die "Unable to open $file.\n";
                                    #open each GMhmm output, one after the other
	while(<GM>)                 #reading of the GMhmm output file
	    {
		$type=$str=$Lend=$Rend=$lg=$ph=$F=$A=$D=$proba="";
		if (/^ID/) {$etat=1;}
		($id,$Lend,$Rend,$lg)=split;
		if ($etat==1) 
		{
		    if ((/^E/)||(/^I/)||(/^T/)||(/^G/)){
			if (!(/^ID/)){
			    foreach $i (1..3){
			    if ($id =~ /.d$i./) {$str= "+";
						  $ph=$i;}
			    if ($id =~ /.r.$i/) {$str= "-";
						 $ph=$i;}}

			    if ($id =~ /I.*/) {$type= "Init";}
			    if ($id =~ /E.*/) {$type= "Intr";}
			    if ($id =~ /T.*/) {$type= "Term";}
			    if ($id =~ /G.*/) {$type= "Sngl";}
			    
			    ($lg,$A,$D)=&calcAD($Lend,$Rend,$str);
			    $F=&calcGMF($ph,$Lend,$Rend,$str);
			    if (($type eq 'Term')||($type eq 'Sngl')) {$D="";}
			    if (($type eq 'Init')||($type eq 'Sngl')) {$A="";}
			    &EcrireST($seq,$type,$str,$Lend,$Rend,$lg,$ph,$F,$A,$D,$proba);
			}
		    }
		    else {$etat=0;}
		}
	    }
	close GM;
}

sub readLS{
    open(LS,"< $liste") || die "Unable to open $liste.\n";
    $ST=&openST("gmhmm");
    while(<LS>){
	$count++;
	($seq,$Ltot)=split;              # read each GMhmm file name and each corresponding sequence length
	if (($seq eq "")||($Ltot eq "")) #check if the GMhmm_list is correctly formatted
	{
	    &usage("GeneMark.hmm");
	    die "!!! Incorrect input list at line $count!!!\n";}
	$file=$seq . "gmhmm.txt";
	&readGMHMM($file);
	print ST "\n";}
close LS;	
}

sub createST{
    @data=&Start("GeneMark.hmm");
if ($data[1]==0) { 
    $liste=$data[0];
    &readLS;
    print "output file: $standart\n";
}
else 
{
    $Ltot=$data[1];
    $file=$data[0];
    @tmp=split('gmhmm',$file);
    $seq=$tmp[0];
    $file2 =$file . "ST";
    open(ST,">$file2");
    print ST "Contig\tType\tStrand\tLend\tRend\tlength\tPhase\tFrame\tAcceptor\tDonor\tProba\n";
    &readGMHMM($file);
    print "output file: $file2\n";
}
close ST;
}

&createST;


