#!/usr/local/bin/perl -w

# launch GeneMark HMM on all fasta sequence present in the directory given as parameter

# GeneMark HMM server URL
$GMHMM = "http://genemark.biology.gatech.edu/GeneMark/hum.cgi";
$POST = "/usr/local/bin/POST";
$HTML2TXT = "/share2/biocomp/Perl/html2txt";
$GMREFORM = "/share2/biocomp/Perl/GMHMMreformat.pl";

if (!-x $POST)
{
    die "need program $POST";
}
if (!-x $HTML2TXT)
{
    die "need program $HTML2TXT";
}

#------------------------------------------------------------


if ($#ARGV < 1)
{
    die "usage: param1 = directory where FASTA sequence files are; param2 = file containing a list of sequence name and length";
}

if (!-d $ARGV[0])
{
    die "$ARGV[0] is not a directory";
}

$fasta_dir = $ARGV[0];

if ($fasta_dir !~ /\/$/)
{
    $fasta_dir .= "/";
}

# retrieve fasta files
@fasta_files = glob "${fasta_dir}*.tfa";

if ($#fasta_files == -1)
{
    die "not FASTA files (*.tfa) in $fasta_dir";
}

if (!-f $ARGV[1])
{
    die "$ARGV[1] is not a file";
}

# read sequence name to be computed
open (F_IN, "$ARGV[1]") ||
    die "can't open $ARGV[1]";
while (<F_IN>)
{
    ($seq_id) = ($_ =~ /seq(\d+)\s+.+/);
    $seq_id = int $seq_id;
    $good_seq{$seq_id} = 1;
}
close F_IN;

# get PID for temp file
$temp_file = "$$.form_out";

# loop on FASTA files
for ($seq_no=0;$seq_no<=$#fasta_files;$seq_no++)
{
    # retrieve sequence number
    ($seq_id) = ($fasta_files[$seq_no] =~ /seq(\d+)/);
    $seq_id = int $seq_id;

    # check if this sequence has to be computed
    if (!exists $good_seq{$seq_id})
    {
	next;
    }

    print STDOUT "compute seq $seq_id\n";

    # read the sequence file
    if (!-r $fasta_files[$seq_no])
    {
	die "can't read file $fasta_files[$seq_no]";
    }

    open (F_IN, "$fasta_files[$seq_no]") ||
	die "can't open $fasta_files[$seq_no]";

    $seq = "";
    $first=0;
    while (<F_IN>)
    {
	if ($first == 0)
	{
	    $first=1;
	    next;
	}
	$_ =~ s/[\s\n\r\f]//g;
	$seq .= uc($_);
    }
    close F_IN;

    $seq_length = length($seq);

    # generate form output file
    open (F_OUT, ">$temp_file") ||
	die "can't create temporary file $temp_file";
    
    print F_OUT "title=";
    print F_OUT "&sequence=$seq";
    print F_OUT "&seq_file=";
    print F_OUT "&org=A.thaliana";
    print F_OUT "&address=";
    print F_OUT "&.cgifields=org";
    print F_OUT "&.cgifields=postscript";
    print F_OUT "&.cgifields=gmlst";
    print F_OUT "&Action=Start+GeneMark.hmm";
    close F_OUT;
    
    # launch the program
    $cmd = sprintf "cat $temp_file|$POST -e $GMHMM | $HTML2TXT | $GMREFORM > seq%02dgmhmm.txt", $seq_id;
    system "$cmd";
}


unlink "$temp_file";
