#!/usr/bin/perl -w
##############################################################################################################
#Generate inputs for one sequence
#input: package path, sequence file (four lines: sequence, ss, sa, coordinates), alignment file, separation, distance threshold
#Output: SVM-light type output

#Author: Jianlin Cheng
#Date: 12/14/2004 ------- 1/07/05
############################################################################################################

#Command line (exactly 5 arguments, consumed in this order):
#  package path   - directory holding potential.pl ("/" is appended when missing)
#  input seq file - four lines: sequence, ss, sa, coordinates (whitespace separated,
#                   three coordinate values per residue)
#  alignment file - slurped into @ali as raw lines
#  separation     - minimum sequence separation of a residue pair (must be >= 6)
#  dist threshold - contact threshold
if (@ARGV != 5)
{
	die "need 5 parameters: package path, input seq file, alignment file, separation, dist threshold.\n";
}
$package_path = shift @ARGV;
$package_path .= "/" unless $package_path =~ m{/\z};
require "${package_path}potential.pl";

#Read the four per-residue lines of the sequence file.
#Three-argument open is used so that characters such as ">" or "|" in a file
#name are never interpreted as an open mode or as a command to run.
$seq_file = shift @ARGV;
open(my $seq_fh, '<', $seq_file) || die "can't read sequence file: $seq_file\n";
$seq_arg  = <$seq_fh>; #separated by space
$ss_arg   = <$seq_fh>; #separated by space
$sa_arg   = <$seq_fh>; #separated by space (10-buried, 50 exposed)
$coor_arg = <$seq_fh>; #coordinates
close $seq_fh;
foreach my $line ($seq_arg, $ss_arg, $sa_arg, $coor_arg)
{
	#a truncated file would otherwise only show up as "uninitialized value" warnings
	if (!defined $line) { die "sequence file $seq_file must contain 4 lines: sequence, ss, sa, coordinates\n"; }
	chomp $line; #$line aliases the variable itself, so this strips its newline
}

$ali_file = shift @ARGV;
$separation = shift @ARGV; #Only predict contacts with separation equal or greater than 6.
if ($separation < 6) { die "sequence separation is at least 6.\n"; }
$threshold = shift @ARGV;  #contact threshold

$win_size = 9; #size of window around aa of interest
$seg_size = 5; #size of window of central segment

#split(' ', ...) skips leading whitespace, so an indented line does not yield
#an empty first field that would make the length check below fail.
@seq = split(' ', $seq_arg);
@ss = split(' ', $ss_arg);
@sa = split(' ', $sa_arg);
@coor = split(' ', $coor_arg);
$length = @seq;

#one ss and one sa value per residue, three coordinate values per residue
if (@seq != @ss || @seq != @sa || @coor != 3 * @seq)
{
	die "length is not consistent, $ali_file\n";
}

open(my $ali_fh, '<', $ali_file) || die "can't read $ali_file\n";
@ali = <$ali_fh>;
close $ali_fh;

#generate profile
#In future the MSA must be filtered or weighted by identity
#
#The first line of the alignment file is taken as the number of aligned
#sequences; the target sequence itself is appended as one more row.
#NOTE(review): when the header announces fewer rows than the file holds, the
#appended target row lies beyond $ali_num and is not counted - confirm intent.
$ali_num = shift @ali;
$ali_num = 0 unless defined $ali_num; #empty alignment file: only the target sequence is used
chomp $ali_num;
$join_seq = join("", @seq);
push @ali, "$join_seq\n";
$ali_num++;
if ($ali_num > @ali)
{
	#a missing row would be read as an empty string, and index($aa_str, "")
	#is 0, i.e. the row would be counted as all alanine
	warn "alignment header announces more sequences than the file contains, using ", scalar(@ali), " rows\n";
	$ali_num = @ali;
}
$aa_str = "ACDEFGHIKLMNPQRSTVWY"; #20 std aa

#Strip line ends and pad short rows with gaps up to the target length, in place.
#Without the padding substr() past the end of a row yields an empty/undefined
#value which index() locates at offset 0, so the position counted as 'A'
#instead of a gap. Padding @ali itself keeps every later reader of the
#alignment consistent with the profile built here.
foreach my $row (@ali)
{
	$row =~ s/[\r\n]+\z//;
	if (length($row) < $length) { $row .= "-" x ($length - length($row)); }
}

#profile size: 20 + one gap, if all zero, means not exist(for pos outside of window)
@profile = ();
for (my $pos = 0; $pos < $length; $pos++)
{
	$profile[$pos] = [ (0) x 21 ];
}
#Each row contributes 1/$ali_num to one of the 21 bins of every column.
#Rows are visited in file order, so each cell is summed in the same order as before.
for (my $r = 0; $r < $ali_num; $r++)
{
	my $row = uc($ali[$r]);
	for (my $pos = 0; $pos < $length; $pos++)
	{
		my $idx = index($aa_str, substr($row, $pos, 1));
		if ($idx < 0) #gap case or unknown
		{
			#treated as a gap
			$idx = 20;
		}
		$profile[$pos][$idx] += (1 / $ali_num);
	}
}


#Emit one SVM-light example for every residue pair (j, i) with j < i and
#i - j >= $separation (lower triangle only, to save space).
#
#Feature layout (1-based feature numbers as printed):
#    1 -  16  pairwise: cosine, correlation, mutual information, 10 amino acid
#             pair types, then the values returned by getLevittCP,
#             getJerniganCP and getBraunCP
#   17 - 502  two windows of $win_size residues (around i, then around j),
#             27 values per residue
#  503 - 637  central segment window of $seg_size residues, 27 values each
#  638 - 679  composition of the residues strictly between j and i (21 profile,
#             3 SS), 16 separation bins, 2 SA
#  680 - 709  whole protein: 21 profile, 3 SS, 4 length bins, 2 SA
#
#Changes against the earlier version of this loop (files generated before
#them differ in the affected features, so models need retraining):
#  * strand/coil composition of the segment (features 660, 661) is taken from
#    @ss; it used to test the amino acid letter in @seq
#  * the whole-protein profile goes to features 680-700; it used to be added
#    on top of the segment profile (features 638-658)
#  * values that do not depend on the pair are computed once, before the loop

#Per-residue feature vector: 21 profile values, 3 SS (H, E, C), 2 SA
#(buried when sa < 25, otherwise exposed), 1 entropy of the profile column.
#Computing it once per position assumes entropy() depends only on its argument.
my @residue_feat = ();
for (my $k = 0; $k < $length; $k++)
{
	my @column = @{ $profile[$k] }[0 .. 20];
	my @feat = (@column, (0) x 6);
	if ($ss[$k] eq "H") { $feat[21] = 1; }
	elsif ($ss[$k] eq "E") { $feat[22] = 1; }
	elsif ($ss[$k] eq "C") { $feat[23] = 1; }
	if ($sa[$k] < 25) { $feat[24] = 1; }
	else { $feat[25] = 1; }
	$feat[26] = entropy(\@column);
	push @residue_feat, \@feat;
}

##################Generate Protein Composition Information############################
#21 for AA, 3 for SS, 4 for length, 2 for SA
#Total size of protein information is: 30
my @input_prot = (0) x 30;
for (my $k = 0; $k < $length; $k++)
{
	#sum the profile
	for (my $m = 0; $m < 21; $m++)
	{
		$input_prot[$m] += $profile[$k][$m] / $length;
	}
	#sum the SS
	if ($ss[$k] eq "H") { $input_prot[21] += 1 / $length; }
	elsif ($ss[$k] eq "E") { $input_prot[22] += 1 / $length; }
	elsif ($ss[$k] eq "C") { $input_prot[23] += 1 / $length; }
	#composition of solvent acc
	if ($sa[$k] < 25) { $input_prot[28] += 1 / $length; }
	else { $input_prot[29] += 1 / $length; }
}
#protein length bins
##################more to do: NEED TO CHANGE ACCORDING TO TRAIN DATASET: better use: 50, 100, 150, >=200 above#########
if ($length <= 50) { $input_prot[24] = 1; }
elsif ($length <= 100) { $input_prot[25] = 1; }
elsif ($length <= 150) { $input_prot[26] = 1; }
else { $input_prot[27] = 1; }
##################End of Protein Composition Information##############################

#$total_size = 16 + 18 * 27[=486] + 5 * 27[=135] + 42 + 30 = 709;
my $total_size = 709;
my $half_win = int($win_size / 2);
my $half_seg = int($seg_size / 2);

for (my $i = 0; $i < $length; $i++)
{
	for (my $j = 0; $j < $i; $j++) #Only generate for lower triangle for saving space
	{
		next if ($i - $j) < $separation;

		#####################Pairwise Information###################################
		#generate pairwise information(right now only for target pair)
		# Can be extended to all pairs in window and can be average of profile.
		my @prof1 = @{ $profile[$i] }[0 .. 20];
		my @prof2 = @{ $profile[$j] }[0 .. 20];
		my $vcos = cosine(\@prof1, \@prof2);
		my $vcor = correlation(\@prof1, \@prof2);

		#amino acid type(10 different combination), only set for two standard residues
		my @vaat = (0) x 10;
		my $amino1 = $seq[$i];
		my $amino2 = $seq[$j];
		if (index($aa_str, $amino1) > -1 && index($aa_str, $amino2) > -1)
		{
			my $amino_type = aatype($amino1, $amino2);
			$vaat[$amino_type] = 1;
		}

		#generate joint distribution mutual information
		#shouldn't be a 21 * 21 matrix, almost half is redundant
		#either make them symmetric or train two triangles later
		my @joint_dist = map { [ (0) x 21 ] } (0 .. 20);
		for (my $r = 0; $r < $ali_num; $r++)
		{
			my $idx1 = index($aa_str, uc(substr($ali[$r], $i, 1)));
			my $idx2 = index($aa_str, uc(substr($ali[$r], $j, 1)));
			if ($idx1 < 0) { $idx1 = 20; } #gap or unknown
			if ($idx2 < 0) { $idx2 = 20; }
			$joint_dist[$idx1][$idx2] += (1 / $ali_num);
		}
		my $mutual_info = mutual(\@prof1, \@prof2, \@joint_dist);

		#total input size of pair-wise: 13, plus the three pairwise potentials
		my @input_pair = ($vcos, $vcor, $mutual_info, @vaat);
		push @input_pair, getLevittCP($amino1, $amino2);
		push @input_pair, getJerniganCP($amino1, $amino2);
		push @input_pair, getBraunCP($amino1, $amino2);
		##################End of pairwise information###############################

		##################Generate Window Information###############################
		#total input window size: 486; slot 0 is the window around i, slot 1 the
		#window around j; positions outside of the sequence stay all zero
		my @input_win = (0) x (2 * $win_size * 27);
		my $slot = 0;
		foreach my $center ($i, $j)
		{
			for (my $k = $center - $half_win; $k <= $center + $half_win; $k++)
			{
				next if ($k < 0 || $k >= $length);
				my $start = ($k - $center + $half_win + $slot * $win_size) * 27;
				@input_win[$start .. $start + 26] = @{ $residue_feat[$k] };
			}
			$slot++;
		}
		##################End of Window Information#######################

		##################Generate Segment Window Information#############
		#Total size of segment window is: 135, centred on the middle of j..i
		my @input_seg = (0) x ($seg_size * 27);
		my $middle = int(($i + $j) / 2);
		for (my $k = $middle - $half_seg; $k <= $middle + $half_seg; $k++)
		{
			if ($k < 0 || $k >= $length) { die "should all be valid in segment"; }
			my $start = ($k - $middle + $half_seg) * 27;
			@input_seg[$start .. $start + 26] = @{ $residue_feat[$k] };
		}

		#Total size of composition of segment: 42 (21 AA, 3 SS, 16 separation bins, 2 SA)
		#WARNING: Here: We assume we work on Lower Triangle. When we work on Upper triangle, this
		#need to be changed.
		my @input_seg_ave = (0) x 42;
		my $seg_length = $i - $j - 1; #number of residues strictly between j and i
		my $comp_bur = 0;
		for (my $k = $j + 1; $k <= $i - 1; $k++)
		{
			#sum the profile
			for (my $m = 0; $m < 21; $m++)
			{
				$input_seg_ave[$m] += $profile[$k][$m] / $seg_length;
			}
			#sum the SS
			if ($ss[$k] eq "H") { $input_seg_ave[21] += 1 / $seg_length; }
			elsif ($ss[$k] eq "E") { $input_seg_ave[22] += 1 / $seg_length; }
			elsif ($ss[$k] eq "C") { $input_seg_ave[23] += 1 / $seg_length; }
			if ($sa[$k] < 25) { $comp_bur += 1 / $seg_length; }
		}
		#set the separation between i and j ( = seg_length + 1)
		#more to do: check if casp separation include 12 or 24
		#usually they evaluate at: >=6, >=8, >=12, >=16, >=24, ....
		#NOTE(review): the bins below cover 15-18, 19-23 and 24-29, while the
		#original design comment lists 15-19, 20-24, 25-29 - confirm which is intended.
		$seg_length++;
		if ($seg_length < 6) { $input_seg_ave[24] = 1; }
		elsif ($seg_length == 6) { $input_seg_ave[25] = 1; }
		elsif ($seg_length == 7) { $input_seg_ave[26] = 1; }
		elsif ($seg_length == 8) { $input_seg_ave[27] = 1; }
		elsif ($seg_length == 9) { $input_seg_ave[28] = 1; }
		elsif ($seg_length == 10) { $input_seg_ave[29] = 1; }
		elsif ($seg_length == 11) { $input_seg_ave[30] = 1; }
		elsif ($seg_length == 12) { $input_seg_ave[31] = 1; }
		elsif ($seg_length == 13) { $input_seg_ave[32] = 1; }
		elsif ($seg_length == 14) { $input_seg_ave[33] = 1; }
		elsif ($seg_length < 19) { $input_seg_ave[34] = 1; }
		elsif ($seg_length < 24) { $input_seg_ave[35] = 1; }
		elsif ($seg_length <= 29) { $input_seg_ave[36] = 1; }
		elsif ($seg_length <= 39) { $input_seg_ave[37] = 1; }
		elsif ($seg_length <= 49) { $input_seg_ave[38] = 1; }
		else { $input_seg_ave[39] = 1; }
		$input_seg_ave[40] = $comp_bur;
		$input_seg_ave[41] = 1 - $comp_bur;
		##################End of Segment Window Information###############

		#combination of all inputs
		my @total_input = (@input_pair, @input_win, @input_seg, @input_seg_ave, @input_prot);

		#Generate label from the distance between the coordinates of i and j
		my ($x1, $y1, $z1) = @coor[3 * $i .. 3 * $i + 2];
		my ($x2, $y2, $z2) = @coor[3 * $j .. 3 * $j + 2];
		my $dist = sqrt( ($x1-$x2)*($x1-$x2) + ($y1-$y2)*($y1-$y2) + ($z1-$z2)*($z1-$z2) );
		my $label = "-1";
		if ($dist < $threshold)
		{
			$label = "+1";
		}

		#print out the title comments (indices for residue pairs)
		print "#", $j + 1, " ", $i + 1, "\n";

		#print label and inputs: values are rounded by round() and only
		#non-zero ones are written (sparse format)
		print "$label";
		for (my $k = 0; $k < $total_size; $k++)
		{
			my $value = round($total_input[$k]);
			if ($value != 0)
			{
				print " ", $k + 1, ":", $value;
			}
		}
		print "\n";
	}
}
