#! /usr/bin/perl -w #Script was written by CSR on 10.22.08 #Input = Sequences in FASTA format. #Ouput = Sequences longer than a specified cutoff. #Usage = ./discard_shortseqs1.0.pl -i [input filename] -o [output filename] -c [sequence length cutoff] #Note: Sequence length cutoff is "greater than" (not "greater than or equal to"). use strict; use Bio::SeqIO; use Getopt::Std; my %args; getopts('i:o:c:', \%args); my $usage = "Warning, missing information...\nUsage is: ./scriptname.pl -i [infile] -o [outfile] -c [sequence length cutoff]\n"; my $infile; my $outfile; my $cutoff; my $incount = 0; my $outcount = 0; #command line arguments if ($args{i}) { $infile = $args{i}; } else { die "$usage"; } if ($args{o}) { $outfile = $args{o}; } else { die "$usage"; } if ($args{c}) { $cutoff = $args{c}; } else { die "$usage"; } #load the input (fasta) file my $in = Bio::SeqIO-> new(-format => 'fasta' , -file => $infile); #open the output file open (OUTPUT , "> $outfile") or die "COULD NOT OPEN THE OUTPUT FILE\n"; #determine the length of each sequence and exclude sequeces less than the specified length. while( my $seqobj = $in->next_seq ) { my $id = $seqobj->display_id; $incount++; my $length = $seqobj->length; my $desc = $seqobj->description(); my $seq = $seqobj->seq; #length cutoff if ($length > $cutoff) { $outcount++; print OUTPUT ">$id $desc\n"; print OUTPUT "$seq\n\n"; } } close (OUTPUT); print "The length cutoff is: $cutoff\nTotal number of sequences is: $incount\nNumber of sequences longer than $cutoff is: $outcount\n";