The following Perl script can be used to analyze DNA, RNA, or protein sequences.
For example, we can use it to find the usage frequencies of trinucleotides
in transcribed or untranscribed regions. It uses a window of fixed size moving
along the sequence to collect usage statistics.
#!perl
# This program reads a text file, collects usage statistics of strings of a fixed window size.
#input window size from command line
$windowSize = @ARGV[0];
#input name of the file to be read from command line
$infile = @ARGV[1];
#input name of the output file from command line
$outfile = @ARGV[2];
#remind user there is no window size or input file
if(!$windowSize or !$infile) {
print "No window length or input file.\nUsage: perl my_perl.pl length infile outfile\n";
}
#read the input file if given
if($infile) {
# open inputfile
open(IN, $infile) or die "can not open $infile\n";
#read the file
while( $line = ) {
# join all the lines in the file
$content .= $line;
}
close(IN) or die "can not close $infile\n";
# remove digits and blank spaces
$content =~ s/[\s\d]//g;
#change to uppercase
$_ = $content;
tr/a-z/A-Z/;
$content = $_;
# length of the sequence
$_ = $content;
$seqLength = tr/a-zA-Z//;
# doing the statistics
for($i=0; $i<$seqLength - $windowSize; $i++) {
#sequence for a moving window of length $windowSize
$winSeq = substr($content, $i, $windowSize);
#put the sequence into unique categories using a Hashmap
$stat{ $winSeq } ++;
}
# unique categories, also keys for the Hashmap
@keys = keys(%stat);
# order alphabetically
@keys = sort(@keys);
# printing out statistics
#if output file name provided
if($outfile) {
open(OUT, ">$outfile") or die "can not open $outfile";
for($i=0; $i<@keys; $i++) {
print OUT "@keys[$i]\t$stat{@keys[$i]}\n";
}
close(OUT) or die "can not close $outfile";
}
# else print to screen
else {
for($i=0; $i<@keys; $i++) {
print "@keys[$i]\t$stat{@keys[$i]}\n";
}
}
}
|