ProTIS
# ProTIS:  Profiler of Transposon Insertion Sites 
# ProTIS.pl
# Version 1 April 17, 2005  (updated May 30, 2006)
# 

# Converts nucleotide sequence around TA dinucleotides in input sequence
# to v-step values based on overlapping dinucleotides

# Generates 3 output files:  
# First output:  ta sites characterized as full, partial, none; scored, sum 
# 	of scores over $binsize bp bins reported "-v-bins.txt";

# Second output file:  TA sites listed with prediction for individual graphs 
# 	"-ta.txt";

# Third output:  Table of site type, position, vsteps "-table.txt";	
#
# Copyright 2005

# Chris Hackett, San Francisco, CA
# 
# Reference:
# Geurts AM, Hackett CS, Bell JB, Bergemann TL, Collier LS, Carlson CM, Largaespada DA, Hackett PB. 
# Structure-based prediction of insertion-site preferences of transposons into chromosomes.
# Nucleic Acids Res. 2006 May 22;34(9):2803-11. PMID: 16717285 


#!/usr/bin/perl



#ANALYSIS VARIABLES:


#V-step substitution values:



$AA = 2.9;

$AC = 2.3;

$AG = 2.1;

$AT = 1.6;

$CA = 9.8;

$CC = 6.1;

$CG = 12.1;

$CT = 2.1;

$GA = 4.5;

$GC = 4.0;

$GG = 6.1;

$GT = 2.3;

$TA = 6.3;

$TC = 4.5;

$TG = 9.8;

$TT = 2.9;



#scoring values


$tafourplus = 13;  	#score for 4-plus peak preferred

$tathreefive = 5;	#score for 3.5-peak preferred
$tathree = 5;		#score for 3-peak semi-preferred
$tatwofive = 4;		#score for 2.5-peak semi-preferred

$tabasal = 1;		#score for basal ta site


#parameters

$taseqlen = 12;  			#length of TA sequence pulled out

$binsize = 100;				#size of bins (in bp) for "-v-bins.txt" output file

#END OF ANALYSIS VARIABLES


Get_Files();

$index = 0;

$seqsrec = 0;

$seqsconv = 0;

$nucline = <F1>;
print F4 "Type\tPosition";
for ($i = 0; $i < $taseqlen-1; $i++) {
	print F4 "\tvry[$i]";
}
print F4 "\n"; #note: N1 = nucary[0]

$aroundTA = int(($taseqlen/2) - 1);  	#number of bases to count in before first TA site or after last (to avoid errors at beginning or end of sequence), and orients TA dinucleotide in middle of extracted sequences, adjusting to first index of 0; eg if $taseqlen = 12, first TA can start at position 7, TA vstep at vry[5] - note, if seqlen is odd, rounds down
if (($aroundTA +1) == ($taseqlen/2)) {	#accounts for odd TA sequence length at end of sequence
	$lastTA = $aroundTA;
}
else {
	$lastTA = $aroundTA + 1;
}


while (defined($nucline )){   

	if ($nucline =~ /^>/) {

		$seqsrec ++;

		if ($seqsrec > 1) {  #need to catch first/previous case

			$seqsconv ++;

			BinTAVstep();

		} 

		$name = $nucline;

		print "Sequence $seqsrec recognized: $name";

		$seqlength = 0;

		$seqstring = "";

		print F2 "\n$nucline";

	}

	else {

		chomp($nucline);

		$ucnucline = uc($nucline);
		$ucnucline =~ tr/ACGNT//cd; 

		$seqlength += length($ucnucline); #counter for length of sequence

		$seqstring = $seqstring . $ucnucline

	}

	$nucline = <F1>;

}

$seqsconv ++;

BinTAVstep();

print "Sequences Recognized:  $seqsrec\n";

print "Sequences Converted:   $seqsconv\n";

close (F1);

close (F2);

close (F3);

close (F4);


sub BinTAVstep

{

	$binnum = 1;

	$tascoretally = 0;

	$tatally = 0;

	$fourpluscount = 0;
	$threefivecount = 0;
	$threecount = 0;
	$twofivecount = 0;
	$basalcount = 0;

	$bintotal = $seqlength / $binsize ;

	print "Sequence length = $seqlength bp, bins = $bintotal\n";

	print F3 "Bin\tRegion\t4-Plus Preferred\t3.5-Peak Preferred\t3-Peak Semi-Preferred\t2.5-Peak Semi-Preferred\tBasal\tScore\n";
	

	$binstart = 1;
	$stop = $seqlength - $lastTA - 1;  #calculates stopping point near end of sequence
	for($i = 0; $i < $seqlength; $i++) {

		$basecoordinate = $i + 1;

		if ((($i+1)/$binsize == $binnum) || ($i == $seqlength-1)) {  #writes bin data to ouput, increments bin num, resets tally

			print F3 "$binnum\t$binstart-$basecoordinate\t$fourpluscount\t$threefivecount\t$threecount\t$twofivecount\t$basalcount\t$tascoretally\n";	

			$binnum ++;

			$tascoretally = 0;

			$fourpluscount = 0;
			$threefivecount = 0;
			$threecount = 0;
			$twofivecount = 0;
			$basalcount = 0;

			$binstart = $i+2;  #set up start of next bin

		}

		$dinuc = substr($seqstring, $i, 2);

		$base = substr($dinuc, 0, 1);
		if (($dinuc eq "TA") && ($i >= $aroundTA) && ($i < $stop)) {

			$tatally ++;

			$taindex = $i-$aroundTA;

			$taseq = substr($seqstring, $taindex, $taseqlen);

			ScoreTAVstep();

		}	

	}

	print "Sequence $seqsconv TA count:  $tatally\n";

}



sub ScoreTAVstep

{

	chomp($name);

	$name =~ s/>//; #chops off greater than

	print F2 "\n$name TA site $tatally at $basecoordinate bp\n"; #no longer prints $taseq

	for($j = 0; $j < $taseqlen; $j++) {

		$vry[j] = 0;
		$nucary[j] = 0;

	}

	for($j = 0; $j < $taseqlen-1; $j++) {

		$dinuc = substr($taseq, $j, 2);
		Assign_Vstep();  

		$vry[$j] = $vstep;

		$base = substr($dinuc, 0, 1);
		print F2 "$base\t$vry[$j]\n"; 

	}
	for($j = 0; $j < $taseqlen; $j++) {
		$nucary[$j] = substr($taseq, $j, 1);
	}

	ScoreTASite();
	print F4 "$basecoordinate bp";

	for ($j = 0; $j < $taseqlen-1; $j++) {
		print F4 "\t$vry[$j]";
	}
	print F4 "\n";
}



sub ScoreTASite 

{

	
	$type = "";
	if (Four_Plus_Peak()) {
		$type = "4-plus peak preferred";

		$fourpluscount ++;

		$tascoretally += $tafourplus;

	}

	elsif (Three_Point_Five_Peak()) {

		$type = "3.5-peak preferred";

		$threefivecount ++;

		$tascoretally += $tathreefive;

	}
	elsif (Three_Peak()) {

		$type = "3-peak semi-preferred";

		$threecount ++;

		$tascoretally += $tathree;

	}

	elsif (Two_Point_Five_Peak()) {

		$type = "2.5-peak semi-preferred";

		$twofivecount ++;

		$tascoretally += $tatwofive;

	}
	else {

		$type = "basal";

		$basalcount ++;

		$tascoretally += $tabasal;

	}
	print F2 "$type\n";

	print F4 "$type\t";


}	



sub Assign_Vstep

{

	$vstep = "undefined";

	if($dinuc eq "AA") {$vstep = $AA;}

	elsif ($dinuc eq "AC") {$vstep = $AC;}

	elsif ($dinuc eq "AG") {$vstep = $AG;}

	elsif ($dinuc eq "AT") {$vstep = $AT;}

	elsif ($dinuc eq "CA") {$vstep = $CA;}

	elsif ($dinuc eq "CC") {$vstep = $CC;}

	elsif ($dinuc eq "CG") {$vstep = $CG;}

	elsif ($dinuc eq "CT") {$vstep = $CT;}

	elsif ($dinuc eq "GA") {$vstep = $GA;}

	elsif ($dinuc eq "GC") {$vstep = $GC;}

	elsif ($dinuc eq "GG") {$vstep = $GG;}

	elsif ($dinuc eq "GT") {$vstep = $GT;}

	elsif ($dinuc eq "TA") {$vstep = $TA;}

	elsif ($dinuc eq "TC") {$vstep = $TC;}

	elsif ($dinuc eq "TG") {$vstep = $TG;}

	elsif ($dinuc eq "TT") {$vstep = $TT;}

	else {

		$vstep = "undefined";

		print "error in v-step recognition\n";

	}

}



sub Get_Files

{	

	print "Enter input file:  ";

	chomp($infile = <STDIN>);

	$interfile = $infile;

	@interarray = split/\./, $interfile; #take out suffix from input file

	$outfile = @interarray[0] . "-v-bins.txt";	

	$outfile2 = @interarray[0] . "-ta.txt";
	$outfile3 = @interarray[0] . "-table.txt";	

	print "outputfile 1 =  $outfile\n";

	print "outputfile 2 =  $outfile2\n";
	print "outputfile 3 =  $outfile3\n";

	open(F1, "$infile") || die "error: input file not found\n";	

	open(F2, ">$outfile2")|| die "error opening output file\n";

	open(F3, ">$outfile")|| die "error opening output file\n";
	open(F4, ">$outfile3")|| die "error opening output file\n";

}


#PROFILE CLASSIFICATION FUNCTIONS:  
#note:  Coordinates for 12bp-site positions and new classifications 4-17-05

sub Four_Plus_Peak 		# 4-plus peak preferred
{
(($vry[2] < $vry[3] && $vry[3] > $vry[4] && $vry[4] < $vry[5] && $vry[5] > $vry[6] && $vry[6] < $vry[7] && $vry[7] > $vry[8]) && ($vry[0] < $vry[1] && $vry[1] > $vry[2] || $vry[8] < $vry[9] && $vry[9] > $vry[10]))
}

sub Three_Point_Five_Peak	#3.5-peak preferred
{
(($vry[2] < $vry[3] && $vry[3] > $vry[4] && $vry[4] < $vry[5] && $vry[5] > $vry[6] && $vry[6] < $vry[7] && $vry[7] > $vry[8]) && ($vry[1] > $vry[2] || $vry[8] < $vry[9]))		
}

sub Three_Peak			#3-peak semi-preferred
{
(($vry[4] < $vry[5] && $vry[5] > $vry[6]) && (($vry[0] < $vry[1] && $vry[1] > $vry[2] && $vry[2] < $vry[3] && $vry[3] > $vry[4]) || ($vry[6] < $vry[7] && $vry[7] > $vry[8] && $vry[8] < $vry[9] && $vry[9] > $vry[10])))
}

sub Two_Point_Five_Peak		#2.5-peak semi-preferred
{
(($vry[4] < $vry[5] && $vry[5] > $vry[6]) && (($vry[1] > $vry[2] && $vry[2] < $vry[3] && $vry[3] > $vry[4]) || ($vry[6] < $vry[7] && $vry[7] > $vry[8] && $vry[8] < $vry[9])))
}