#!/usr/bin/perl -w
use File::Temp qw||;
use Carp qw|cluck :DEFAULT|;
use Cwd qw||;
use Data::Dumper;
use Pod::Usage;
#------------------------------------------------------------------------------#
#                        predictNLS                                            #
#	Copyright				        	2001-2011      #
#	Rajesh Nair		nair@rostlab.org                               #
#       Burkhard Rost           rost@rostlab.org        		       #
#	Rost Lab	        http://rostlab.org/     	               #
#       Columbia University                                                    #
#------------------------------------------------------------------------------#
# predictNLS interface to pp.
#usage fileIn='fileIn' dirOut='dirOut' fileOut='fileOut' html='0|1|2' fileTrace='fileTrace'
#html=0 ; one output file: plain text (email) format
#html=1 ; one output file: html format
#html=2 ; two output files: fileOut.html (html) and fileOut (plain text)
#output: one or two files, depending on html (see above)
# Set $dirOut in iniDef() to working dir 
# Set correct path to grep_NLS_update in system call
# all other command line arguments are specified in initialization subroutine iniDef

our $config;
my $pkgdatadir;
BEGIN {
	use Config::IniFiles;

	# Configuration is layered, each layer importing (and overriding) the one
	# below it:
	#   1. __pkgdatadir__/predictnlsrc.default
	#   2. __sysconfdir__/predictnlsrc
	#   3. $PREDICTNLSCONF if set, otherwise ~/.predictnlsrc
	# A layer whose file does not exist is skipped.

	# Load $file on top of $parent (which may be undef). If the file cannot be
	# parsed a warning is issued and $parent is returned, so that one broken
	# layer does not take the whole configuration down.
	my $load = sub {
		my( $file, $parent ) = @_;
		my $ini = Config::IniFiles->new( -file => $file, ( defined $parent ? ( -import => $parent ) : () ) );
		if( !defined $ini ){
			warn( "ignoring unreadable configuration file '$file': ".join( '; ', @Config::IniFiles::errors )."\n" );
			return $parent;
		}
		return $ini;
	};

	my( $defaultconfig, $etcconfig );
	if( -e "__pkgdatadir__/predictnlsrc.default" ) { $defaultconfig = $load->( "__pkgdatadir__/predictnlsrc.default", undef ); }
	if( -e "__sysconfdir__/predictnlsrc" ) { $etcconfig = $load->( "__sysconfdir__/predictnlsrc", $defaultconfig ); } else { $etcconfig = $defaultconfig; }

	# Decide on ONE user-level file first and only then test for its existence.
	# Testing "PREDICTNLSCONF exists OR ~/.predictnlsrc exists" and then loading
	# PREDICTNLSCONF regardless tried to load a missing file whenever the
	# variable pointed nowhere while ~/.predictnlsrc existed.
	my $userfile = $ENV{PREDICTNLSCONF} ? $ENV{PREDICTNLSCONF}
	             : defined $ENV{HOME}   ? "$ENV{HOME}/.predictnlsrc"
	             :                        undef;
	if( defined $userfile && -e $userfile ) { $config = $load->( $userfile, $etcconfig ); } else { $config = $etcconfig; }

	if( !defined $config ){ die( "predictnls: no usable configuration file found (looked for __pkgdatadir__/predictnlsrc.default, __sysconfdir__/predictnlsrc and the user configuration file)\n" ); }

	# glob() expands a leading '~' in the configured directory.
	$pkgdatadir = glob( $config->val('predictnls', 'pkgdatadir') );
}

# Popularity contest: run the counter and suggest installing it when it
# cannot be started at all (system() returns -1).
my @popconCmd = ( 'pp_popcon_cnt', '-p', 'predictnls' );
unless( system( @popconCmd ) != -1 )
{
  warn("The Rost Lab recommends you install the pp-popularity-contest package that provides pp_popcon_cnt:\n\nsudo apt-get install pp-popularity-contest\n");
}

# The configured debug level is the default; iniDef() replaces it when a
# debug argument is given on the command line.
our $debug = $config->val( 'predictnls', 'debug' );
iniDef();

# Open inFile and read the query sequence.
# The file must hold exactly one FASTA record: the text after '>' becomes
# $protId, all other lines are concatenated (whitespace removed) into $sequence.

our $protId;
# Three-argument open: the name comes from the command line and must never be
# interpreted as a mode or a pipe, nor have surrounding whitespace stripped.
open($fhin, '<', $inFile)|| die( "Input file=$inFile unreadable. Execution terminated." );
$sequence= ""; # the var with seq in one letter amino acid code
while( my $line = <$fhin> ){
  chomp($line);
  if( $line =~ /^>(.*)$/ ){
    # 'defined', not truth: an empty id or the id '0' is still a first header,
    # so a second header line must be rejected after it as well.
    if( !defined $protId ){ $protId= $1; next; }
    else{ die("Error: multiple-sequence fasta file - please give only one (1) sequence per file.".( $debug ? '' : "\n" ) ); }
  }
  $line =~ s/\s//g;
  $sequence .= $line;
}

close($fhin);
$sequence =~ s/\!//g;       # drop '!' characters from the sequence
$sequence =~ tr/a-z/A-Z/;   # translating sequence to capital letters.
#---------------------------------------------------------------------------
# inFile readable! Open file handles for the html, email (plain text) and
# summary output, whichever iniDef() selected, and write the report headers.
# All opens use the three-argument form so that an output name can never be
# taken for a mode (e.g. a name starting with '>' turning into append).

if(defined $fileHtml){
  open($fhHtml, '>', $fileHtml) || die( "Output file=$fileHtml cannot be opened. Dir not writable." );
  #
  print $fhHtml "<html><head><title>Results for $inFile</title></head>\n";

  print $fhHtml "<BODY BGCOLOR='#FFFFFF'>";
  print $fhHtml "<H3><STRONG>Results of Nuclear Localization Signal Prediction(NLS):</STRONG></H3>";
  print $fhHtml "<LI><A HREF='https://rostlab.org/owiki/index.php/PredictNLS#Help' TARGET='blank'>Help </A> on interpretation of results.</LI><BR><BR>";
}
if(defined $fileEmail){
  open($fhEmail, '>', $fileEmail) || die( "Output file=$fileEmail cannot be opened. Dir not writable." );
  print $fhEmail "Results of Nuclear Localization Signal Prediction(NLS)\n";
  print $fhEmail "For help on interpretation of results visit the predictNLS help page: https://rostlab.org/owiki/index.php/PredictNLS\n";
  print $fhEmail "-----------------------------------------------------------------\n";
}
if(defined $fileSummary){
    open($fhSum, '>', $fileSummary) || die( "Output file=$fileSummary cannot be opened. Dir not writable." );
}
#---------------------------------------------------------------------------
# Scan the query for NLS motifs; grep_NLS() writes its findings to the
# intermediate files named in iniDef().
grep_NLS( $sequence );

# Pick up the result line from the FoundNLS file: the last non-comment line
# containing 'input' (the query is written under the id 'inputSequence').
$filein = $FoundNlsOut;

my $out;
if( -e "$filein" )
{
  open( $fhin, '<', $filein ) or do {
    print STDERR "$filein not readable: $!. Check dir permissions.\n";
    die;
  };

  while( defined( my $resultLine = <$fhin> ) )
  {
    next if $resultLine =~ /^#/;
    $out = $resultLine if $resultLine =~ /input/;
  }
  close( $fhin );
}

warn( "no results in file (cwd=".Cwd::getcwd().") $filein - perhaps no NLS in query" ) if( !$out && $debug );

# the whitespace-separated fields of the result line; empty when nothing was found
my @result = $out ? split( /\s+/, $out ) : ();
# processing starts here
# Report generation.
# $out is the result line read from the FoundNLS file and @result its
# whitespace-separated fields; as written by findMotif() these are
# id, localization, sequence length, ';'-separated motifs and ';'-separated
# 1-based positions. A non-empty $out means at least one NLS was found: the
# summary flag, the highlighted sequence, the NLS list, the per-motif statistics
# from $NlsDat and (for motifs flagged DNA_BIND) the DNA binding tables are
# written. Otherwise the 'no NLS' report is written. Every print is guarded by
# the corresponding output file being selected.
if( $out && $out =~ /\w/){		# if true, this seq has NLS. 
  
  if(defined $fileSummary){
    print $fhSum "1\n";
  }
  @nlsList = split(/;/o, $result[3] );
  @posList = map{ --$_; }split(/;/o, $result[4] ); # list of positions, 0-based
  if(defined $fileHtml){
    
    print $fhHtml "<TABLE CELLPADDING=2 WIDTH='100%'>\n";
    print $fhHtml "<TR VALIGN=TOP>\n",
    "<TD VALIGN=TOP BGCOLOR='#0000FF' WIDTH='15%'>\n",
    "<B><FONT COLOR='#FFFFFF'> Input Sequence (NLS's in Red)</FONT></B></TD>\n",
    "<TD VALIGN=TOP WIDTH='85%'>\n",
    "<PRE>";
  }
  &sortNlsList();		# sub sorts the positions of NLS's in numerical order. Used for highlighting NLS residues
  #print out highlighted regions to htmlfile
  # The sequence is written residue by residue with a line break after every
  # 60th residue; @shadeMin/@shadeMax (filled by sortNlsList) delimit the
  # regions to highlight.
  @sequence= split(//o,$sequence);
  foreach $x (0..($shadeMin[0]-1)){
    if(defined $fileHtml){
      print $fhHtml "$sequence[$x]"; # the unhighlighted region at the N-terminal.
    }
    if((($x+1)%60)==0) {
      if(defined $fileHtml){
	print $fhHtml "\n";
      }
    }
  }
     foreach $k (0..$#shadeMin){
     foreach $x ($shadeMin[$k]..$shadeMax[$k]){
      if(defined $fileHtml){
	print $fhHtml qq|<span style="font-weight:bold;color:red;">$sequence[$x]</span>|; # highlighted region
      }
      if((($x+1)%60)==0) {
	if(defined $fileHtml){
	  print $fhHtml "\n";
	}
      }
    }
	     # $last: end of the unhighlighted stretch that follows region $k
	     if($k<$#shadeMin){
      $last=$shadeMin[$k+1]-1;
    }
    else{
      $last= $#sequence;
    }
    foreach $x (($shadeMax[$k]+1)..$last){
      if(defined $fileHtml){
	print $fhHtml "$sequence[$x]"; #unshaded
      }
      if((($x+1)%60)==0) {
	if(defined $fileHtml){
	  print $fhHtml "\n";
	}
      }
    }
   }
  if(defined $fileHtml){
    print $fhHtml "</PRE>";
    print $fhHtml "</TD></TR>";
    print $fhHtml "<TR VALIGN=TOP>",
    "<TD VALIGN=TOP BGCOLOR='#0000FF' WIDTH='15%'>",
    "<B><FONT COLOR='#FFFFFF'> Sequence Length</FONT></B></TD>",
    "<TD VALIGN=TOP WIDTH='85%'>",
    "<EM>$result[2]</EM>",
    "</TD></TR>";
  }
  # print to email file
  if(defined $fileEmail){ print $fhEmail "----------------------------------------------------------------------\n"; }
  if(defined $protId){
     if(defined $fileEmail){
        print $fhEmail "Input sequence Id: $protId\n";
     }
  }
  if(defined $fileEmail){
    #print $fhEmail "Input Sequence : $sequence\n";
    print $fhEmail "Sequence Length: $result[2]\n";
    print $fhEmail "----------------------------------------------------------------------\n";
  }
  if(defined $fileHtml){
    print $fhHtml "<TR VALIGN=TOP>\n",
    "<TD VALIGN=TOP BGCOLOR='#0000FF' WIDTH='15%'>\n",
    "<B><FONT COLOR='#FFFFFF'> NLS's found.<I> Number gives position of Motif </I></FONT></B></TD>\n",
    "<TD VALIGN=TOP WIDTH='85%'>\n",
    "<UL>\n";  
  }
  if(defined $fileEmail){
    print $fhEmail "List of NLS's found in sequence\n";
    print $fhEmail "----------------------------------------------------------------------\n";
    printf $fhEmail "%1s%19s%1s%20s%1s\n","|","NLS","|","Position in sequence","|";
    
  }
  # One entry per NLS occurrence: the matched residues (@nlsSeq, filled by
  # sortNlsList) and the start position.
  foreach $i (0..$#nlsList) {
    #Convert from NLS start from zero-based to one-based counting:
    if(defined $fileHtml){
      print $fhHtml "<LI> <EM> $nlsSeq[$i] </EM><EM>".($posList[$i]+1)."</EM> </LI>\n";
    }
    if(defined $fileEmail){
      printf $fhEmail "|%19s|%20s|\n", $nlsSeq[$i], $posList[$i]+1;
    }
    
  }
   if(defined $fileHtml){
    print $fhHtml "</UL></TD></TR></TABLE>\n";
  }
   # Statistics table: one row group per motif line found in $NlsDat.
   if(-e $NlsDat ){
     if(defined $fileHtml){
       print $fhHtml "<H3><B>Statistical data for Nuclear Localization Signals present in the Input Sequence</B></H3>\n";
       print $fhHtml "<TABLE BORDER=2>";
       print $fhHtml "<TR><TD ALIGN=CENTER>\n",
       "<B><FONT >Generalized NLS<BR> ( <A HREF=\'#H_Res_NLS\'> notation </A>)</FONT></B></TD>\n",
       "<TD COLSPAN=1>Type</TD>",
       "<TD COLSPAN=1>No with NLS</TD>",
       "<TD COLSPAN=1>%Nuc Proteins</TD>",
       "<TD COLSPAN=1>%NonNuc Proteins</TD>",
       "<TD COLSPAN=1>Protein Swiss Id</TD>",
       "<TD COLSPAN=1>Protein Localizations(Swiss anno.)</TD></TR>\n";
     }
     if(defined $fileEmail){
        print $fhEmail "----------------------------------------------------------------------\n";
        print $fhEmail "\n\nStatistical data for the NLS's found in the Input Sequence:\n\n";
        print $fhEmail "----------------------------------------------------------------------\n\n";
        printf $fhEmail "%10s%25s%15s%10s%20s%10s\n","NLS","Type","NumWithNLS","%NucProt","ProtList","Prot.Loci";
     }
    if( $debug ){ cluck("opening '<$NlsDat'"); }
    open($fhin,'<', $NlsDat) || die( "failed to open $NlsDat for reading: $!" );
    while(<$fhin>)
    {
      #File contains Motif stat from db for NLS found in unk sequence
      #Motif_Name     No_Found                     %oE             %oX     Protein List
      # 0             1                            2               3       4       5 6                                         7
      # inputSequence YLTQETNKVETYKEQPLKTPGKKKKGKP Experimental    0       0       0
      # inputSequence [STQM]RRRK[STQM]             Potential       4       100     0 prt_antgr,ve2_hpv38,cpr2_petcr,msn4_yeast nuc,nuc,nuc,nuc
      if(/^#/o){ next; }
      $nlsData=$_;
      if ($_=~ /input/o){
	@tmp=split(/\s+/o,$_);
        if( @tmp < 6 ){ confess("unexpected line from file '$NlsDat': '$_'"); }
        # lkajan: so field 6 and 7 are not always present
	# NOTE: from here on @nlsList holds the protein ids of field 6, no longer the motifs.
	@nlsList = ( $tmp[6] ? split(/,/o, $tmp[6] ) : () );
	@locList = ( $tmp[7] ? split(/,/o, $tmp[7] ) : () );
	$nlsNo=$#nlsList+1;	   #no of NLS's found.
	if(defined $fileHtml){
	  print $fhHtml "<TR>",
	  qq|<TD ROWSPAN="$nlsNo" VALIGN="TOP">|,
	  "<B><FONT><EM>$tmp[1]</EM></FONT></B></TD>\n";
	}
	
	
	
	     # Experimental motifs: look the motif up in ExptNls.rdb and link the
	     # type cell to PubMed using the trailing number of the grep output.
	     if($tmp[2]=~ /Exp/) {
	  undef $exp;
	  $expNls= $tmp[1];
	  $expNls=~s/\[/\\[/g ;
	  $expNls=~s/\{/\\{/g ;
	  $expNls=~s/\]/\\]/g;
	  $expNls=~s/\}/\\}/g;
	  $fileRdb= $dirData."ExptNls.rdb";

          # NOTE(review): the motif and the file name are interpolated into a
          # shell command line; this is only safe while neither can contain a
          # single quote - confirm for the motif database in use.
          my $cmd = "grep -E '$expNls' '$fileRdb'";
          if($debug){ cluck( $cmd ); }
	  $exp=`$cmd`;
	  
          if(defined $exp && $exp ){
	    # NOTE(review): when the pattern does not match only a warning is
	    # issued; $uid is still taken from $1 afterwards.
	    if( $exp!~ m/(\d+)$/o ){ cluck("warning: exp '$exp' did not match pattern m/(\\d+)\$/o, sequence is '$sequence'"); }
	    $uid=$1;
	    if(defined $fileHtml){
	        print $fhHtml "<TD COLSPAN=1 ROWSPAN='$nlsNo'  VALIGN='TOP'><A HREF=\"http://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=$uid&dopt=Abstract\" TARGET=\"http://www.ncbi.nlm.nih.gov:80/entrez/query.fcgi?cmd=Retrieve&db=PubMed&list_uids=$uid&dopt=Abstract\">$tmp[2]</A></TD>\n";
	    }
	  }
	  else {
	    if(defined $fileHtml){
	        print $fhHtml "<TD COLSPAN=1 ROWSPAN='$nlsNo'  VALIGN='TOP'>$tmp[2] </TD>";
	    }
	  }
	}
	else{
	  # Other motifs: $flagExp is set when the motif appears as second field
	  # of a line of the nlsTree file.
	  $potNls= $tmp[1];
	  $fileData= $dirData."nlsTree";
	  undef $flagExp;
          if( $debug ){ cluck("opening '<$fileData'"); }
	  open($fhdata, '<', $fileData) || die( "Server Error: Tree file $fileData not found: $!" );
	  
	  while(<$fhdata>){
	    # lkajan: sometimes there are lines with no second field!
	    # ExptNLSList
	    # GGGx{3}KNRRx{6}RGGRN
	    @tmpdata= split(/\s+/o,$_);
	    if( $tmpdata[1] && $tmpdata[1] eq $potNls){
	      $flagExp=1;
	    }
	  }
	  close $fhdata;
	     # NOTE(review): both branches print the same cell except for one
	     # trailing blank.
	     if($flagExp){
	       if(defined $fileHtml){
	    print $fhHtml "<TD COLSPAN=1 ROWSPAN='$nlsNo'  VALIGN='TOP'>$tmp[2]</TD>\n";
	  }
	  }
	  else{
	    if(defined $fileHtml){
	    print $fhHtml "<TD COLSPAN=1 ROWSPAN='$nlsNo'  VALIGN='TOP'>$tmp[2] </TD>\n";
	  }
	  }
	}
	foreach $i (3..5){
	 if(defined $fileHtml){ 
	  print $fhHtml "<TD COLSPAN=1 ROWSPAN='$nlsNo'  VALIGN='TOP'>$tmp[$i] </TD>";
	}
	}

	  if(defined $fileEmail){
	    
	    	
		  printf $fhEmail "%25s%15s%10s%10s%20s%5s\n", $tmp[1],$tmp[2],$tmp[3],$tmp[4],( $nlsList[0] || ''), ($locList[0] || '');
		  # further proteins go on lines of their own, indented by 60 blanks
		  # NOTE(review): $locList[$j] is used unguarded here and may be
		  # undefined when field 7 lists fewer entries than field 6.
		  foreach $j (1..$#nlsList){
		    foreach $k (1..60){
		      print $fhEmail  " ";
		    }
		    printf $fhEmail "%20s%5s\n",$nlsList[$j],$locList[$j];
		  }
		}
	  
	if(defined $fileHtml){
	  print $fhHtml qq|<td colspan="1" rowspan="1">|, ( $nlsList[0] ? qq|$nlsList[0]| : '' ), qq|</td>\n|;
	  print $fhHtml qq|<td colspan="1" rowspan="1">|, ( $locList[0] ? $locList[0] : '' ), qq|</td></TR>\n|;
        }
	foreach $j (1..$#nlsList){
	  if(defined $fileHtml){
	  print $fhHtml qq|<tr><td colspan="1" rowspan="1">$nlsList[$j]</td>\n|;
	  print $fhHtml "<TD COLSPAN=1 ROWSPAN=1>$locList[$j]</TD></TR>\n";
	}
	}
	# NOTE(review): every row opened above has already been closed, so this
	# looks like a stray closing tag.
	if(defined $fileHtml){
	print $fhHtml "</TR>\n";
      }
	     # remember motifs flagged DNA_BIND (field 8) for the DNA binding tables below
	     if( $tmp[8] && $tmp[8]=~ /DNA_BIND/){
	  chomp($nlsData);
	  $dnaBind{$tmp[1]}= $nlsData;
	  $dnaFlag=1;
	} 
        if(defined $fileEmail){
            print $fhEmail "\n";
        }
      }
    }
    close $fhin;
  }
  if(defined $fileHtml){ print $fhHtml "</TABLE>"; }

	     if($dnaFlag==1){
    # DNA binding protein.
	        if(defined $fileHtml){
    print $fhHtml "<H3><B><I> This protein is predicted to bind to DNA.</I></B></H3>\n";
    print $fhHtml "<TABLE COLS=5 CELLPADDING=2 WIDTH='100%'>\n";
    print $fhHtml "<TR VALIGN=TOP>",
    "<TD VALIGN=TOP BGCOLOR='#0000FF' WIDTH='25%'>\n",
    "<B><FONT COLOR='#FFFFFF'><EM>DNA Binding signals.</EM></FONT></B></TD>\n";
                }
    if(defined $fileEmail){
		 print $fhEmail "======================================================================\n";
		 print $fhEmail "This protein is predicted to bind DNA\n";
		 print $fhEmail "======================================================================\n";
		 print $fhEmail "DNA Binding signals:\t";
    }
    
    foreach $key (keys %dnaBind){
      if(defined $fileHtml){
      print $fhHtml "<TD VALIGN=TOP WIDTH='17%'>$key </TD>\n";
    }
      if(defined $fileEmail){
      print $fhEmail "$key\t";
    }
    }
	        
	       if(defined $fileHtml){
    print $fhHtml "</TR></TABLE>\n";
    print $fhHtml "<H3> DNA binding statistics for the binding NLS signals.</H3>\n";
  }
	       if(defined $fileEmail){
    print $fhEmail "\n\nDNA binding statistics for the binding NLS  signals\n";
    printf $fhEmail "%25s%15s%10s%20s%10s%30s\n","DnaBindNLS","NumBindingDNA","%bindDNA","NumInBindDomain","%InBindDom","%Associated Binding Domains";
  }
	       if(defined $fileHtml){
    print $fhHtml  "<TABLE COLS=6 CELLPADDING=2 WIDTH='100%'>\n";
    print $fhHtml "<TR VALIGN=TOP><TD VALIGN=TOP BGCOLOR='#0000FF' WIDTH='15%'>\n",
    "<B><FONT COLOR='#FFFFFF'><EM>DnaBindNLS</EM></FONT></B></TD>\n",
    "<TD VALIGN=TOP WIDTH='15%'>No binding DNA</TD>\n",
    "<TD VALIGN=TOP WIDTH='15%'>%binding DNA</TD>\n",
    "<TD VALIGN=TOP WIDTH='15%'>No in binding Domain</TD>\n",
    "<TD VALIGN=TOP WIDTH='15%'>%in binding Domain</TD>\n",
    "<TD VALIGN=TOP WIDTH='25%'>% of associated DNA binding domains</TD></TR>\n";
  }
    foreach $key (keys %dnaBind){
      if(defined $fileHtml){
      print $fhHtml "<TR VALIGN=TOP>",
      "<TD VALIGN=TOP BGCOLOR='#0000FF' WIDTH='15%'>\n",
      "<B><FONT COLOR='#FFFFFF'><EM>$key</EM></FONT></B></TD>\n";
    }
     
      # NOTE(review): the stored line is split on tabs here, whereas the same
      # line was split on any whitespace above - confirm the field numbering
      # (9..12 statistics, 13.. 'code:value' domain entries) against the data.
      @tmp=split(/\t/o,$dnaBind{$key});
      foreach $i (9..12){
	if(defined $fileHtml){
	print $fhHtml "<TD VALIGN=TOP WIDTH='15%'>$tmp[$i]</TD>\n";
      }
      }
	if(defined $fileEmail){
	printf $fhEmail "%25s%15s%10s%20s%10s",$key,$tmp[9],$tmp[10],$tmp[11],$tmp[12];
      }
      
      if(defined $fileHtml){
      print $fhHtml "<TD VALIGN=TOP WIDTH='25%'>";
    }
      if(defined $fileEmail){
      print $fhEmail "    ";
    }
      foreach $i (13..$#tmp){
	# NOTE(review): the match result is not checked; a field without ':'
	# leaves $1/$2 from an earlier match. %binAnno maps the domain code to
	# its long name.
	$tmp[$i]=~m /(.+):(.+)/;
	if(defined $fileHtml){
	print $fhHtml "$binAnno{$1}:$2";
      }
	if(defined $fileEmail){
	print $fhEmail "$binAnno{$1}:$2";
      }
      }
      if(defined $fileEmail){
      print $fhEmail "\n";
    }
      if(defined $fileHtml){
      print $fhHtml "</TD></TR>";
    }
    }
	       if(defined $fileHtml){
    print $fhHtml "</TABLE>\n"; 
}
	   }
  # Legend explaining the motif notation.
  # NOTE(review): the html report of this branch is not terminated with
  # </BODY> or </html>.
  if(defined $fileHtml){
  print $fhHtml "<H3><A NAME= 'H_Res_NLS'>  Symbols</A> used in representing the NLS are explained below:</H3>\n",
    
      "An x (or X) implies any amino acid residue can be present at this position.<BR>
     <BR> <TABLE BORDER='2' >",
     "<TR>",
     "<TD COLSPAN='3' ALIGN='CENTER' ><strong> Example Motifs </strong></TD>",
     "</TR>\n",
     "<TR>",
     "<TD> Example</TD>",
     "<TD> Read</TD>",
     "<TD> Equivalent Motifs </TD>",
     "</TR>\n",
     "<TR>",
     "<TD> [KR]KRKK </TD>",
     "<TD> \"K or R\" KRKK </TD>",
     "<TD> KKRKK,RKRKK",
     "</TR>\n",
     "<TR>",
     "<TD> K{5} </TD>",
     "<TD> 5 times K </TD>",
     "<TD> KKKKK </TD>",
     "</TR>\n",
     "<TR>",
     "<TD> [KR]{3,5} </TD>",
     "<TD> between 3 and 5 times K or R </TD>",
     "<TD> KKRR, RRKKR, RRR,KKK ...</TD>",
     "</TR>\n",
     "<TR>",
     "<TD> K{3,}? </TD>",
     "<TD> 3 or more K's </TD>",
     "<TD> KKK,KKKK,KKKKK ... </TD>",
     "</TR>",
     "</TABLE>";
}
  if(defined $fileEmail){
      print $fhEmail "\n\n";
      print $fhEmail "======================================================================\n";
      print $fhEmail "Symbols used in representing the NLS are explained below:\n\n";
      print $fhEmail "An x (or X) implies any amino acid residue can be present at this position.\n";
      print $fhEmail "..................................................\n";
      print $fhEmail "\t\tExample Motifs\n";
      print $fhEmail "..................................................\n";
      printf $fhEmail  "%10s%25s%40s\n","Example","Read","Equivalent Motifs";
      printf $fhEmail  "%10s%25s%40s\n","[KR]KRKK ","\"K or R\" KRKK","KKRKK,RKRKK";
      printf $fhEmail  "%10s%25s%40s\n","K{5}     ","   5 times K "," KKKKK     ";
      printf $fhEmail  "%10s%25s%40s\n","[KR]{3,5} ","between 3 and 5 times K or R ","KKRR,RRKKR,RRR,KKK ..";
      printf $fhEmail  "%10s%25s%40s\n","K{3,}?   ","   3 or more K's ","KKK,KKKK,KKKKK ..";
  }
}  #if($out=~ /\w/){
else {
    # No NLS found: summary flag 0, echo the sequence and state the result.
    if(defined $fileSummary){
	print $fhSum "0\n";
    }
  if(defined $fileHtml){
  print $fhHtml "<TABLE COLS=2 CELLPADDING=2 WIDTH='100%'>\n";
  print $fhHtml "<TR VALIGN=TOP>\n",
  "<TD VALIGN=TOP BGCOLOR='#0000FF' WIDTH='15%'>\n",
  "<B><FONT COLOR='#FFFFFF'> Input Sequence</FONT></B></TD>\n",
  "<TD VALIGN=TOP WIDTH='85%'>\n",
  "<EM>$sequence</EM>",
  "</TD></TR>\n";
}
  if(defined $fileEmail){
  print $fhEmail "Input Sequence: $sequence\n";
}
  if(defined $fileHtml){
  print $fhHtml "<TR VALIGN=TOP>",
  "<TD VALIGN=TOP BGCOLOR='#0000FF' WIDTH='15%'>",
  "<B><FONT COLOR='#FFFFFF'> Output</FONT></B></TD>",
  "<TD VALIGN=TOP WIDTH='85%'>",
  "This protein does not contain a nuclear localization signal.",
  "</TD></TR>";
  # NOTE(review): no <UL> or <TD> is open at this point in this branch, so the
  # leading </UL></TD></TR> look like leftovers; </html> is never written.
  print $fhHtml "</UL></TD></TR></TABLE>";
  print $fhHtml "</BODY>\n";
}
  if(defined $fileEmail){
  print $fhEmail "This sequence does not contain any nuclear localization signal in database\n";
}
}

# Output handles are not closed explicitly before the program exits.
exit(0);

#=====================================================================

sub iniDef {

#---------------------------------------------------------------------
#  iniDef       initialize defaults
#
#  Parses @ARGV (arguments of the form name[=value]) and initializes the
#  package globals the rest of the script works with:
#    $inFile, $dirOut, $fileOut     input file, working dir, output name
#    $fileEmail, $fileHtml          report files, selected by html=0|1|2
#    $fileSummary, $fileTrace       optional summary / trace file
#    $debug, $NlsDat                overridden by the debug / nlsdat options
#    $FoundNlsOut, $MotifOut        intermediate files written by grep_NLS
#    $dirData, %par, %binAnno       data locations and annotation names
#    $fhin, $fhdata, $fhSum         names of the symbolic file handles
#  Takes no parameters; the return value is not meaningful.
#
#  Ends the program through pod2usage() for 'help' and for unacceptable
#  arguments, through exit(0) for 'version', and dies when fileIn or
#  fileOut is missing or the trace file cannot be opened.
#---------------------------------------------------------------------

  # first initialize command line arguments
  foreach my $rawArg (@ARGV){
    # name[=value]. The value extends to the end of the argument so that
    # values containing whitespace (e.g. paths) are not cut short; an empty
    # value counts as 'no value given'.
    my( $arg, $par );
    if( $rawArg =~ m/(\w+)(?:=(.+))?/s ){
      ( $arg, $par ) = ( $1, $2 );
    }
    else{
      # nothing resembling an option name: do not fall through with stale
      # or undefined captures
      pod2usage(-msg  => "Unacceptable command line argument.", -exitval => 2);
    }

    if($arg=~ /fileIn/i){
      $inFile= $par; # the fasta file
    }
    elsif($arg=~ /dirOut/i){
      $dirOut= $par;
      if(!($dirOut=~ /\/$/)){
	  $dirOut=$dirOut."/";
      }
    }
    elsif($arg=~ /fileOut/i){
      $fileOut= $par;
    }
    elsif($arg=~/^-*help$/)
    {
        pod2usage(0);
    }
    elsif($arg=~ /html/){
      # 0 => only email (text) file; 1 => only html file; otherwise both
      $html=$par;
    }
    elsif($arg=~ /trace/i){
      $trace= $par;
    }
    elsif($arg=~ /sum/i){
	$sum= $par; #if defined a summary file is produced with 1 if NLS is present else 0.
    }
    elsif($arg =~ /^-*debug$/ ){
      $debug = $par;
    }
    elsif($arg =~ /^-*nlsdat$/ ){
      $NlsDat = $par;
    }
    elsif($arg =~ /^-*version$/ )
    {
        print "__PACKAGE_STRING__\n"; exit(0);
    }
    else{
        pod2usage(-msg  => "Unacceptable command line argument.", -exitval => 2);
    }

  }

  if(! defined $inFile){
    die( "please provide input file (protein sequence)" );
  }

  if(! defined $html){
    $html=0;#by default generate text output
  }

  if(! defined $fileOut){ die("Error: required option fileOut is missing\n"); }

  if(!defined $dirOut){
    # no working directory given: use a temporary one, kept only when debugging
    $dirOut = File::Temp::tempdir( CLEANUP => !$debug ).'/';
  }
  if( $trace ){
    $fileTrace= $trace;
  }
  if( $sum ){
    $fileSummary = $sum;
  }

  if(defined $fileTrace){
    # redirect STDERR to the trace file; three-argument open so that the
    # user-supplied name cannot be taken for a mode
    $fileErr= $fileTrace;
    open(STDERR, '>', $fileErr) || die( "specified dir not writable. Could not open $fileErr" );
  }

  # get a random job_id (between 1 and 30000)
  $jobId=int(rand 30000) + 1;

  # initialize working directories and files.
  $dirData= "$pkgdatadir/data/";

  #set parameter options
  $par{"dir"}= "$pkgdatadir/";
  $par{"motList"}= $par{"dir"}."data/My_NLS_list"; # the default database of Motifs.
  #$par{"protList"}= $par{"dir"}."data/prot_list"; # the default list of proteins with known localization.
  #$par{"protSeq"}=$par{"dir"}."data/allProt.fasta"; # flat file containing all sequences in format suitable for egrep.
  #$par{"lociList"}= $par{"dir"}."data/Loci.dat"; # the file with Localizations.
  #$par{"seq"}= "fasta"; # default sequence type.
  #$par{"fastaDir"}= "/data/derived/big/splitSwiss/"; # path to fasta files.
  #$par{"swissDir"}= "/data/swissprot/current/"; # path to swissprot.

  #undefine vars used by grep_NLS
  # hashes used
  undef %op; # observed and predicted nuclear.
  undef %nop; # Not obs but predicted nuclear.
  undef %prot_list; # list of prot containing a given motif.
  undef $dbFlag; # flag activated only when scanning through entire database.
  undef $tmp_motif;
  undef @motif;
  undef $fileMotif; # given file with Motifs.

  # files generated by grep_NLS
  $FoundNlsOut= $dirOut."FoundNLS".$jobId;# random id tags to output files
  $MotifOut= $dirOut."Motif_Stat".$jobId;
  $NlsDat ||= $dirOut."MotifDat".$jobId; # unless given with the nlsdat option
  #+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  # in and out files and vars
  $fhin= "FHIN";
  $fhdata="FHDATA";
  our $fhHtml;
  our $fhEmail;
  $fhSum= "FHSUM";
  undef $fileEmail;
  undef $fileHtml;
  if($html==0){
    $fileEmail =$fileOut;
  }
  elsif($html==1){
    $fileHtml =$fileOut;
  }
  else{
    $fileHtml =  "$fileOut.html";
    $fileEmail = $fileOut;
  }

  #++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
  # used to interpret annotations.
  #------------------------------------------------------------------------------------------
  #DNA Bind Motif anno list
  $binAnno{'hbox'}="HOMEOBOX";$binAnno{'bas'}= "BASIC DOMAIN";
  $binAnno{'fork'}= "FORK-HEAD"; $binAnno{'hlh'}= "HELIX-LOOP-HELIX";
  $binAnno{'myb'}= "MYB"; $binAnno{'hmg'}="HMG BOX";
  $binAnno{'znc'}= "ZINC FINGERS"; $binAnno{'ets'}="ETS-DOMAIN";
  $binAnno{'fun'}="FUNGAL-TYPE"; $binAnno{'hook'}= "A.T HOOK";
  #--------------------------------------------------------------------------------------------
  undef $protId;
  undef $sequence;
  undef $out;
  undef %dnaBind;# var used to record dna binding
  $dnaFlag=0;
}
#end of iniDef
#--------------------------------------------------------------------

#=====================================================================
sub sortNlsList {
#---------------------------------------------------------------------
#   sortNlsList     sorts the NLS's according to position of their occurrences.
#
#   Works on package globals, takes no parameters, return value unused.
#   in:     @posList   0-based start position of each NLS occurrence
#           @nlsList   motif (regular expression, 'x' = any residue) of
#                      each occurrence, parallel to @posList
#           $sequence  the query sequence
#   out:    @posList, @nlsList  sorted by ascending position
#           @nlsSeq             residues matched by each occurrence
#           @highLightMax       0-based end position of each occurrence
#           @shadeMin/@shadeMax start/end of the merged (overlapping
#                               occurrences joined) regions to highlight
#   Confesses when a motif cannot be found in the sequence at all.
#---------------------------------------------------------------------

  # Sort both lists by position; equal positions keep their relative order.
  my @order     = sort { $posList[$a] <=> $posList[$b] || $a <=> $b } 0..$#posList;
  my @sortedPos = @posList[@order];
  my @sortedNls = @nlsList[@order];
  @posList[0..$#order] = @sortedPos;
  @nlsList[0..$#order] = @sortedNls;

  # start from a clean slate so that nothing survives from an earlier call
  @nlsSeq = (); @highLightMax = (); @shadeMin = (); @shadeMax = ();

  foreach my $k (0..$#posList){
    my $tmpnls = $nlsList[$k];
    $tmpnls =~ s/x/[A-Z]/g; # replace x with [A-Z]. Perl syntax.

    # Take the residues matched AT the recorded position. Searching the whole
    # sequence would always return the first occurrence of the motif, which
    # is the wrong text (and the wrong length) for later occurrences of
    # variable motifs such as K{2,} or KK[KR].
    my $pos = $posList[$k];
    if( defined $pos && $pos >= 0 && $pos <= length($sequence)
        && substr( $sequence, $pos ) =~ m/\A($tmpnls)/ ){
      $nlsSeq[$k] = $1;
    }
    elsif( $sequence =~ m/($tmpnls)/ ){
      # fall back to the first occurrence anywhere in the sequence
      $nlsSeq[$k] = $1;
    }
    else{
      confess("sequence '$sequence' did not match nls '$tmpnls' - assertion failed");
    }
    $highLightMax[$k] = $posList[$k] + length($nlsSeq[$k]) - 1;
  }

  # Merge overlapping occurrences into regions for highlighting.
  my $cnt = 0;
  $shadeMin[$cnt] = $posList[$cnt];      # start pos for highlighting.
  $shadeMax[$cnt] = $highLightMax[$cnt]; # finish for highlighting
  foreach my $k (1..$#posList){
    if( $posList[$k] <= $shadeMax[$cnt] ){
      # starts inside the current region: extend the region if necessary
      if( $highLightMax[$k] > $shadeMax[$cnt] ){
	$shadeMax[$cnt] = $highLightMax[$k];
      }
    }
    else{
      $cnt++;
      $shadeMin[$cnt] = $posList[$k];
      $shadeMax[$cnt] = $highLightMax[$k];
    }
  }

}
# end of sub sortNlsList     


#----------------------------------------------------------------------------
# subroutine grep_NLS
sub grep_NLS {
#---------------------------------------------------------------------
#   grep_NLS     subroutine for finding NLS's in given sequence.
#
#   in:   $inSeq  the sequence to scan (first argument)
#   Writes three files: $FoundNlsOut (result line of findMotif, preceded by
#   the motif list), $MotifOut (header only) and $NlsDat (the database line
#   of every motif found, prefixed with the sequence id).
#   Dies when one of the files cannot be opened or written.
#
#   The handles and working variables are deliberately 'local' package
#   variables: motifManager() and findMotif() are called without arguments
#   and use $fhout, $id and $res through dynamic scoping.
#---------------------------------------------------------------------

  local($inSeq)=@_;
# fileHandle declarations
  local $fhin=   "FHIN";
  local $fhout=  "FHOUT";# fh for FoundNLS
  local $fhout1=  "FHOUT1";#fh for Motif_Stat
  local $fhout2= "FHOUT2";# fh for MotifDat
  local $fhdata= "DATA"; # file handle for reading input
  # Output files.
  local $file_out= $FoundNlsOut; # file with proteins containing given motif
  local $file_out1= $MotifOut;
  local $file_out2= $NlsDat;
  local($resDef,$id,$res,$prot,$nls);

  # All opens use the three-argument form: the names are built from the
  # user-supplied output directory and must not be interpreted as modes.
  if($debug){ cluck("opening prot_file_with_motif (\$fhout) to '$file_out'"); }
  open($fhout, '>', $file_out) || die "*** failed opening prot_file_with_motif = $file_out\n";

  print $fhout "# File with Id's and Loc of proteins containing NLS motifs\n";
  print $fhout "# prot_id\tLoc\tmotif\tpos\n";

  if($debug){ cluck("opening stat file (\$fhout1) to '$file_out1'"); }
  open($fhout1, '>', $file_out1)|| die "*** failed opening Stat file $file_out1\n";
  print $fhout1 "# Statistics for various motifs\n";
  print $fhout1 "#Motif_Name\tNo_Found\t%oE\t%oX\n";

  # read the motif database, then scan the sequence
  our( $motDat, $tmp_motif ) = ( {}, undef );
  ( $motDat, $tmp_motif ) = motifRd();
  motifManager();
  $resDef= "no"; # No sequence Read.
  $id= "inputSequence";
  $res= $inSeq;
  findMotif();

  # prnt out MotifDat
  if($debug){ cluck("opening MotifDat (\$fhout2) to '$file_out2'"); }
  open($fhout2, '>', $file_out2)|| die"***failed opening MotifDat file $file_out2\n";
  print $fhout2 "#File contains Motif stat from db for NLS found in unk sequence\n";
  print $fhout2 "#Motif_Name\tNo_Found\t%oE\t%oX\tProtein List\n";
  # sorted keys: the order of the rows is reproducible from run to run
  foreach $prot (sort keys %foundMotif ){
    foreach $nls (sort keys %{ $foundMotif{$prot} } ){
      print $fhout2 "$prot\t$foundMotif{$prot}{$nls}\n";
    }
  }

  # The files are read back right after this sub returns; a failed close
  # means buffered data was lost and must not pass for 'no NLS found'.
  close($fhout)  || die "*** failed writing prot_file_with_motif = $file_out: $!\n";
  close($fhout1) || die "*** failed writing Stat file $file_out1: $!\n";
  close($fhout2) || die "*** failed writing MotifDat file $file_out2: $!\n";
}



#===============================================================================
#===============================================================================

sub motifRd {
#-------------------------------------------------------------------------------
#    motifRd                    read database of Motifs
#
#    Reads the motif file named by $fileMotif; when that is undefined the
#    default database $par{"motList"} is used (and $fileMotif set to it).
#    Lines starting with '#' or whitespace are skipped; the first
#    whitespace-separated field of every other line is a motif.
#
#    out:  returns ( $motDat, $tmp_motif )
#          $tmp_motif  the motifs joined with ';' in file order, undef when
#                      the file contains none
#          $motDat     hash ref motif => complete database line (chomped);
#                      filled only for the default database while $dbFlag
#                      is undefined
#    Dies when the motif file cannot be opened.
#-------------------------------------------------------------------------------
    if( !defined $fileMotif ) {
	$fileMotif= $par{"motList"}; # use def motif db.
	if( !defined $dbFlag ){
	    $motFlag=1;
	}
    }
    # $motFlag is undefined when a motif file was given explicitly
    my $keepData = ( defined $motFlag && $motFlag == 1 && !defined $dbFlag );

    if($debug){ cluck("opening motif file '$fileMotif'"); }
    # three-argument open: never interpret the file name as a mode and keep
    # leading/trailing whitespace of the name intact
    open( my $fh, '<', $fileMotif ) || die"** could not open motifs file (cwd=".Cwd::getcwd().") $fileMotif";

    my @names;
    while( my $line = <$fh> ) {
      next if $line =~ /^(?:#|\s)/;
      my @fields = split( /\s+/, $line );
      if( $keepData && defined $fields[0] )
      {
	chomp( $line );
        # ( 'VSRKRPR' => 'VSRKRPR Experimental    3       100     0       tala_povm3,tala_povma,tala_povmc        nuc,nuc,nuc', ... )
	$motDat->{$fields[0]}= $line;
      }
      push @names, $fields[0];
    }
    close $fh;

    # stays undefined when no motif was read, which motifManager() reports
    $tmp_motif = @names ? join( ';', @names ) : undef;

    if($debug & 0x8){ cluck(Data::Dumper::Dumper($motDat, \$tmp_motif )); }

    return( $motDat, $tmp_motif );
}

#===============================================================================
#===============================================================================

sub motifManager {
#===============================================================================
#    motifManager               compiles list and does preprocessing of Motifs.
#
#    Splits the ';'-separated motif string $tmp_motif into @motif and logs
#    every motif, numbered from 0, as a comment line to the handle $fhout.
#    Dies when $tmp_motif is undefined.
#===============================================================================
    die( "Motif List not defined; check inputs and default parameters" )
        unless defined $tmp_motif;

    # array motif contains set of NLS signals
    @motif = split( /;/, $tmp_motif );

    my $number = 0;
    for my $signal (@motif) {
	print $fhout "#motif_$number: $signal\n";
	$number++;
    }

    if( $debug & 0x8 ){
	cluck( Data::Dumper::Dumper( \@motif ) );
    }
    # End of reading in input parameters.
}

#===============================================================================
#===============================================================================

sub findMotif {
#===============================================================================
#   findMotif                 searches for motif in $res.
#
#   Looks for every occurrence (overlapping ones included) of every pattern
#   of @motif in $res; an 'x' in a motif stands for \w. For each occurrence
#   the bookkeeping hashes (%foundMotif, %prot_list, %op/%nop) are updated
#   under the sequence id $id. If anything was found one line
#   "id, localization, length, motifs, 1-based positions" (tab separated,
#   the last two ';'-separated) is printed to $fhout.
#===============================================================================

    local $len  = length $res;
    local $flag = 0;           # number of occurrences found so far
    my $stack     = '';        # keeps track of NLS's found if more than one found.
    my $loc_stack = '';        # keeps track of loci of NLS Found, 1-based
    my %ext_motif;

    foreach my $motifNo (0..$#motif) {
      my $name = $motif[$motifNo];
      ( my $motif_w = $name ) =~ s/x/\\w/g;

      # lkajan: make it catch every occurrence of a motif, not only the first.
      # After a hit the search resumes one residue behind the start of the hit.
      my $searchbase = 0;      # offset of $searchseq within $res
      my $searchseq  = $res;
      while( $searchseq )
      {
        last unless $searchseq =~ /$motif_w/;

        my $hitStart = $-[0];  # start of the match within $searchseq
        $loc = $searchbase + $hitStart;   # 0-based start within $res

        $flag++;
        $searchbase += $hitStart + 1;
        $searchseq = substr( $res, $searchbase );

        if( defined $motDat->{$name} ){
          $foundMotif{$id}{$name} = $motDat->{$name};
        }

        $ext_motif{$name}++;

        if( defined $prot_list{$name} ){
          $prot_list{$name} .= "," . $id;
        }
        else {
          $prot_list{$name} = $id;
        }

        $stack .= ';' if length($stack);
        $stack .= $name;
        $loc_stack .= ';' if length($loc_stack);
        $loc_stack .= ($loc+1);

        # stat analysis
        if( defined $location{$id} && $location{$id} =~ /nuc/ ){
          $op{$name}++;
        }
        else {
          $nop{$name}++;
        }
      }
    }

    if( $flag > 0 ) {
        $loci = defined $location{$id} ? $location{$id} : "unk";

        print $fhout "$id\t$loci\t$len\t$stack\t$loc_stack\n";
    }

    if($debug & 8){ cluck(Data::Dumper::Dumper( \%location, \%foundMotif, \%prot_list, \$stack, \$loc_stack )); }
}
    
#===============================================================================
    
=pod

=head1 NAME

predictnls - prediction and analysis of nuclear localization signals

=head1 SYNOPSIS

predictnls [OPTION]

=head1 DESCRIPTION

predictnls is a method for the prediction and analysis of nuclear localization signals (NLS)

=head2 Output format

Self-annotating, see example outputs in F<__docdir__/examples>.

=head1 REFERENCES

=over

=item Cokol, M., Nair, R., and Rost, B. (2000). Finding nuclear localization signals. EMBO Rep, 1(5), 411-5.

=back

=head1 OPTIONS

=over

=item debug=[0|1]

=item fileIn=<FILE>

Input file in fasta format. Required. Only one sequence per file.

=item fileOut=<FILE>

Output file. Required.

=item help

=item html=[0|1|2]

Output format. I<0> - text; I<1> - html; I<2> - both. Default: B<0>.

If both outputs are requested, the text file is named according to the value of B<fileOut> and the html file gets an `.html' extension.

=item fileSummary=<FILE>

If set, `1' is written to this file when an NLS is found and `0' otherwise.  Optional.

=item nlsdat=<FILE>

Raw nls data, optional

=item version

Print version.

=back

=head1 EXAMPLES

C<predictnls fileIn=__docdir__/examples/O49931.fa fileOut=/tmp/O49931.nls>

=head1 ENVIRONMENT

=over

=item PREDICTNLSCONF

Location of the predictnlsrc configuration file to use. When set, this file is read instead of F<~/.predictnlsrc>; its values override those of the system configuration file.

=back

=head1 FILES

=over

=item F<__pkgdatadir__/predictnlsrc.default>

Default configuration file. See this file for a description of the parameters.

=item F<__sysconfdir__/predictnlsrc>

System configuration file overriding values in F<__pkgdatadir__/predictnlsrc.default>

=item F<~/.predictnlsrc>

User configuration file overriding values in F<__sysconfdir__/predictnlsrc>

=item F<$PREDICTNLSCONF>

If this environment variable is set, F<~/.predictnlsrc> is disregarded and the file named by the variable is read instead; its options override those in F<__sysconfdir__/predictnlsrc>.

=back

=head1 AUTHOR

R. Nair

=head1 COPYRIGHT AND LICENSE

(C) Copyright 2010, R. Nair  License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>.  This is free software: you are free to change and redistribute it.  There is NO WARRANTY, to the extent permitted by law.

=cut

# vim:et:ai:
