#!/usr/bin/perl -I/tmp/ViennaRNA-1.5/Perl/blib/arch -I/tmp/ViennaRNA-1.5/Perl/blib/lib #-I/home/stanley/ViennaRNA-1.5/Perl/blib/arch -I/home/stanley/ViennaRNA-1.5/Perl/blib/lib -I/home/stanley/Shuffle-1.4/blib/lib -I/home/stanley/Statistics-Basic-0.41.3/blib/lib #-I/home/stanley/ViennaRNA-1.5/Perl/blib/arch -I/home/stanley/ViennaRNA-1.5/Perl/blib/lib -I/home/stanley/Shuffle-1.4/blib/lib -I/home/stanley/Statistics-Descriptive-2.6/blib/lib ############################################################################ # AUTHOR: Stanley NG Kwang Loong, stanley@bii.a-star.edu.sg # DATE: 31/07/2005 # VERSION: 1.0 ############################################################################ use Algorithm::Numerical::Shuffle qw /shuffle/; use Statistics::Basic::StdDev; use Statistics::Basic::Mean; use RNA; $ENV{UNBIAS}=1; use POSIX qw(log); use strict; srand(time|$$); ############################################################################ # Global Parameters and initialization if any. ############################################################################ my $mfeFile=""; my $inFile="&STDIN"; my $outFile="&STDOUT"; my $numRandomSeqs; #number of random sequences per set my $numSeqs; my $usage = "USAGE: perl genRNARandomStats.pl -n \"number\" -i -o -m \n"; my $statsID=-1; my @templines; my @amfe; my @aseq; my @astruct; my @aQ; my @aD; my @aBP; my @aSS; my @cmfe; my @cseq; my @cstruct; my @cQ; my @cD; my @cBP; my @cSS; ############################################################################ # File IO # Parse the command line. ############################################################################ foreach my $a (0..$#ARGV) { if ($ARGV[$a] eq "-n") { $numRandomSeqs = $ARGV[$a+1]; } elsif ($ARGV[$a] eq "-i") { $inFile = ($ARGV[$a+1] eq "") ? "&STDIN" : $ARGV[$a+1]; } elsif ($ARGV[$a] eq "-o") { $outFile = ($ARGV[$a+1] eq "") ? "&STDOUT" : $ARGV[$a+1]; printf(STDERR "$outFile\n"); if ( -e $outFile) { open (OUTFILE, "<$outFile") or die( "Cannot open input file $outFile: $!" ); while (my $line = ) { chomp($line); if ($line =~ m/^[0-9]/) { push(@templines, $line); $statsID++; } } close (OUTFILE); } } elsif ($ARGV[$a] eq "-m") { $mfeFile = $ARGV[$a+1]; } else { } } if(!defined($inFile) || !defined($outFile) || !defined($mfeFile) || !defined($numRandomSeqs)) { die ("$usage\n"); } open (INFILE, "<$inFile") or die( "Cannot open input file $inFile: $!" ); open (MFEFILE, "<$mfeFile") or die( "Cannot open input file $mfeFile: $!" ); # ID Mean SD $numSeqs = 0; while (my $line = ) { chomp($line); if ($line =~ m/^>/) { } elsif ($line =~ m/^[AaCcUuGg]/) { $aseq[$numSeqs] = $line; } # Fasta Third Line i.e. RNA secondary structure and MFE elsif ($line =~ m/^[.(]/) { $line =~ s/\( /\(/; ($astruct[$numSeqs], $amfe[$numSeqs]) = split(/ /, $line); $amfe[$numSeqs] =~ s/[()]//g; ($aQ[$numSeqs], $aD[$numSeqs], $aBP[$numSeqs], $aSS[$numSeqs]) = rnaAnalysis($aseq[$numSeqs], length($aseq[$numSeqs]), $astruct[$numSeqs], $amfe[$numSeqs]); $numSeqs++; } else { } }#end of while loop close (MFEFILE) or die( "Cannot close input file $mfeFile: $!" ); open (OUTFILE, ">$outFile") or die ("Cannot open output file $outFile: $!"); print (OUTFILE "ID"); print (OUTFILE "\tX\tRMean\tRSD\tZ\tP-MFE\tP-Z"); print (OUTFILE "\tX\tRMean\tRSD\tZ\tP-Q\tP-Z"); print (OUTFILE "\tX\tRMean\tRSD\tZ\tP-D\tP-Z"); print (OUTFILE "\tX\tRMean\tRSD\tZ\tP-PB\tP-Z"); print (OUTFILE "\tX\tRMean\tRSD\tZ\tP-TD\tP-Z"); map { print (OUTFILE "\n$_"); } @templines; my $i = 0; #index for $numRandomSeqs $numSeqs = 0; while (my $line = ) { # Read line by line. chomp($line); if ($line =~ m/^>/) { } elsif ($line =~ m/^[AaCcUuGg]/) { $cseq[$i] = $line if ($numSeqs > $statsID); } # Fasta Third Line i.e. RNA secondary structure and MFE elsif ($line =~ m/^[.(]/) { if ($numSeqs > $statsID) { $line =~ s/\( /\(/; ($cstruct[$i], $cmfe[$i]) = split(/ /, $line); $cmfe[$i] =~ s/[()]//g; ($cQ[$i], $cD[$i], $cBP[$i], $cSS[$i]) = rnaAnalysis($cseq[$i], length($cseq[$i]), $cstruct[$i], $cmfe[$i]); } $i++; #Finished collecting $numRandomSeqs sequences if ($i == $numRandomSeqs) { if ($numSeqs > $statsID) { printf (OUTFILE "\n%u", $numSeqs+1); my ($mean, $sd, $z, $p, $pz); ($mean, $sd, $z, $p, $pz) = computeStats(\@cmfe, $amfe[$numSeqs]); printf (OUTFILE "\t%.2f\t%.4f\t%.4f\t%s\t%.4f\t%s", $amfe[$numSeqs], $mean, $sd, $z, $p, $pz); ($mean, $sd, $z, $p, $pz) = computeStats(\@cQ, $aQ[$numSeqs]); printf (OUTFILE "\t%.2f\t%.4f\t%.4f\t%s\t%.4f\t%s", $aQ[$numSeqs], $mean, $sd, $z, $p, $pz); ($mean, $sd, $z, $p, $pz) = computeStats(\@cD, $aD[$numSeqs]); printf (OUTFILE "\t%.2f\t%.4f\t%.4f\t%s\t%.4f\t%s", $aD[$numSeqs], $mean, $sd, $z, $p, $pz); ($mean, $sd, $z, $p, $pz) = computeStats(\@cBP, $aBP[$numSeqs]); printf (OUTFILE "\t%.2f\t%.4f\t%.4f\t%s\t%.4f\t%s", $aBP[$numSeqs], $mean, $sd, $z, $p, $pz); my @cstructtree = map { RNA::make_tree(RNA::expand_Full($_)) } @cstruct; my $astructtree = RNA::make_tree(RNA::expand_Full($astruct[$numSeqs])); my @aedit_distance = map { RNA::tree_edit_distance($astructtree, $_) } @cstructtree; my $aedit_distance_mean = new Statistics::Basic::Mean(\@aedit_distance)->query; @cstructtree = shuffle(@cstructtree); my $chosentree = pop(@cstructtree); my @cedit_distance = map { RNA::tree_edit_distance($chosentree, $_) } @cstructtree; ($mean, $sd, $z, $p) = computeTreeStats(\@cedit_distance, $aedit_distance_mean); $pz = 0.0; printf (OUTFILE "\t%.2f\t%.4f\t%.4f\t%s\t%.4f\t%s", $aedit_distance_mean, $mean, $sd, $z, $p, $pz); } $i = 0; $numSeqs++; last if (scalar(@amfe) == $numSeqs); } } else { } }#end of while loop print(OUTFILE "\n"); close (INFILE) or die( "Cannot close input file $inFile: $!" ); close (OUTFILE) or die( "Cannot close output file $outFile: $!"); sub pzCount { my ($nativeZ, $times, $data) = @_; my $size = 1000; return 0 if (scalar(@$data) < $size+1); my $R = 0; my $times_size = sprintf("%d", (scalar(@$data) - 1)/$size); while (1) { my @shuffle = shuffle(@$data); my $first = 0; my $last = $size - 1; my $index = scalar(@$data) - 1; foreach my $i(0..$times_size-1) { my @short = @shuffle[$first..$last]; my $mean = new Statistics::Basic::Mean(\@short)->query; my $sd = new Statistics::Basic::StdDev(\@short)->query; $R++ if (($shuffle[$index] - $mean) < ($nativeZ * $sd)); return $R if (--$times == 1); $first += $size; $last += $size; $index -= 1; } } } sub computeStats { my ($data, $value) = @_; my $mean = new Statistics::Basic::Mean($data)->query; my $sd = new Statistics::Basic::StdDev($data)->query; my $p = 0; map { $p++ if ($_ < $value); } @$data; my $z = "undef"; my $pz = "undef"; if ($sd > 0) { $z = sprintf("%.4f", ($value - $mean)/$sd); $pz = 0.0; } return ($mean, $sd, $z, $p/scalar(@$data), $pz); } sub pzTreeCount { my ($nativeZ, $times, $data) = @_; my $size = 1000; return 0 if (scalar(@$data) < $size+1); my $R = 0; my $times_size = sprintf("%d", (scalar(@$data) - 1)/$size); while (1) { my @shuffle = shuffle(@$data); my $first = 0; my $last = $size - 1; my $index = scalar(@$data) - 1; foreach my $i(0..$times_size-1) { my @sample = @shuffle[$first..$last]; my @aedit_distance = map { RNA::tree_edit_distance($shuffle[$index], $_) } @sample; my $aedit_distance_mean = new Statistics::Basic::Mean(\@aedit_distance)->query; my $chosentree = pop(@sample); my @cedit_distance = map { RNA::tree_edit_distance($chosentree, $_) } @sample; my $mean = new Statistics::Basic::Mean(\@cedit_distance)->query; my $sd = new Statistics::Basic::StdDev(\@cedit_distance)->query; $R++ if (($aedit_distance_mean - $mean) < ($nativeZ * $sd)); return $R if (--$times == 1); $first += $size; $last += $size; $index -= 1; } } } sub computeTreeStats { my ($data, $value) = @_; my $mean = new Statistics::Basic::Mean($data)->query; my $sd = new Statistics::Basic::StdDev($data)->query; my $p = 0; map { $p++ if ($_ < $value); } @$data; my $z = "undef"; if ($sd > 0) { $z = sprintf("%.4f", ($value - $mean)/$sd); } return ($mean, $sd, $z, $p/scalar(@$data)); } sub pairwiseTreeDistance { my ($data, $value) = @_; my @datatree = map { RNA::make_tree(RNA::expand_Full($_)) } @$data; my $valuetree = RNA::make_tree(RNA::expand_Full($value)); my @temp = map { RNA::tree_edit_distance($valuetree, $_) } @datatree; my $tempmean = new Statistics::Basic::Mean(\@temp)->query; @datatree = shuffle(@datatree); my $chosentree = pop(@datatree); @temp = (); @temp = map { RNA::tree_edit_distance($chosentree, $_) } @datatree; return(computeStats(\@temp, $tempmean)); } sub rnaAnalysis { my ($seq, $seqLen, $struct, $mfe) = @_; my $bp = $struct =~ tr/(//; my $Q = 0; my $D = 0; my $VI = 0; $RNA::pf_scale = exp((-1)*1.2*$mfe/(0.6163207755*$seqLen)) if ($seqLen > 1000); # compute partition function and pair pobabilities matrix RNA::pf_fold($seq); # compute sum-of-entropy and bp-distance foreach my $j (1..$seqLen-1) { foreach my $k ($j+1..$seqLen) { my $p = RNA::get_pr($j, $k); # points to the computed pair probabilities if ($p > 0) { $Q += $p*log($p); $D += $p*(1 - $p); } } } $Q *= -1/log(2); return ($Q, $D, $bp, 0); } exit;