File Coverage

lib/Bio/Roary/FilterUnknownsFromFasta.pm
Criterion Covered Total %
statement 42 42 100.0
branch 4 4 100.0
condition n/a
subroutine 9 9 100.0
pod n/a
total 55 55 100.0


line stmt bran cond sub pod time code
1             package Bio::Roary::FilterUnknownsFromFasta;
2             $Bio::Roary::FilterUnknownsFromFasta::VERSION = '3.10.1';
3             # ABSTRACT: Take in fasta files, remove sequences with too many unknowns and return a list of the new files
4              
5              
6 4     4   26 use Moose;
  4         11  
  4         31  
7 4     4   28135 use Bio::SeqIO;
  4         11  
  4         114  
8 4     4   23 use Cwd;
  4         9  
  4         277  
9 4     4   25 use Bio::Roary::Exceptions;
  4         32  
  4         328  
10 4     4   26 use File::Basename;
  4         6  
  4         1777  
11              
# Input FASTA file paths; the builder of 'filtered_fasta_files' walks this
# list in order.
has 'fasta_files' => (
    is       => 'ro',
    isa      => 'ArrayRef',
    required => 1,
);

# Boolean flag (true by default) recording whether the unknowns filter is
# meant to be applied.
has 'apply_unknowns_filter' => (
    is      => 'rw',
    isa     => 'Bool',
    default => 1,
);

# Percentage of a sequence's length from which the maximum tolerated number
# of 'X' characters is derived.
has 'maximum_percentage_of_unknowns' => (
    is      => 'ro',
    isa     => 'Num',
    default => 5,
);

# Names of the filtered output files, computed on first access by
# _build_filtered_fasta_files.
has 'filtered_fasta_files' => (
    is      => 'ro',
    isa     => 'ArrayRef',
    lazy    => 1,
    builder => '_build_filtered_fasta_files',
);

# Input file path => filtered output file name; entries are added as each
# input file is processed.
has 'input_fasta_to_output_fasta' => (
    is      => 'ro',
    isa     => 'HashRef',
    default => sub { return {}; },
);
19              
# Builder for the lazy 'filtered_fasta_files' attribute.
#
# Every path in 'fasta_files' is handed to
# _filter_fasta_sequences_and_return_new_file together with the file name
# component reported by fileparse().
#
# Returns an array reference holding the output file names in input order.
sub _build_filtered_fasta_files {
    my ($self) = @_;

    my @filtered_names = ();
    foreach my $input_path ( @{ $self->fasta_files } ) {

        # Only the first element of fileparse()'s result (the file name) is
        # used; the directory and suffix parts are not needed here.
        my ($base_name) = fileparse($input_path);

        push @filtered_names,
          $self->_filter_fasta_sequences_and_return_new_file( $base_name, $input_path );
    }

    return \@filtered_names;
}
32              
# Decides whether a sequence carries more 'X' characters than allowed.
#
# Parameters:
#   $sequence_obj - object providing length() and seq()
#
# The allowance is 'maximum_percentage_of_unknowns' percent of the sequence
# length, truncated to a whole number. Only upper-case 'X' is counted.
#
# Returns 1 when the count exceeds the allowance, 0 otherwise.
sub _does_sequence_contain_too_many_unknowns {
    my ( $self, $sequence_obj ) = @_;

    my $sequence_length = $sequence_obj->length();
    my $unknowns_budget =
      int( ( $sequence_length * $self->maximum_percentage_of_unknowns ) / 100 );

    # tr/// with an empty replacement list only counts; the string is not
    # modified.
    my $residues      = $sequence_obj->seq();
    my $unknown_count = ( $residues =~ tr/X// );

    return $unknown_count > $unknowns_budget ? 1 : 0;
}
47              
48              
# Copies the sequences of one FASTA file into a new FASTA file, leaving out
# sequences that contain too many unknowns.
#
# Parameters:
#   $output_file - stem of the output file name; '.tmp.filtered.fa' is appended
#   $input_file  - path of the FASTA file to read
#
# Side effects:
#   - writes the output file through Bio::SeqIO
#   - records $input_file => output file name in 'input_fasta_to_output_fasta'
#
# The unknowns check is applied only while 'apply_unknowns_filter' is true
# (its default). With the flag switched off every sequence is copied; the
# description is still cleared in both cases.
#
# Returns the output file name.
sub _filter_fasta_sequences_and_return_new_file {
    my ( $self, $output_file, $input_file ) = @_;

    my $output_filename = $output_file . '.tmp.filtered.fa';
    my $out_fasta_obj   = Bio::SeqIO->new( -file => ">" . $output_filename, -format => 'Fasta' );
    my $fasta_obj       = Bio::SeqIO->new( -file => $input_file, -format => 'Fasta' );

    $self->input_fasta_to_output_fasta->{$input_file} = $output_filename;

    # Read the flag once; it previously had no effect because nothing
    # consulted it.
    my $apply_filter = $self->apply_unknowns_filter;

    while ( my $seq = $fasta_obj->next_seq() ) {
        next if $apply_filter && $self->_does_sequence_contain_too_many_unknowns($seq);

        # strip out extra details put in by fastatranslate
        $seq->description(undef);
        $out_fasta_obj->write_seq($seq);
    }

    return $output_filename;
}
70              
71              
72              
# Unimport the Moose keywords (has, etc.) now that the class is fully declared,
# then make the metaclass immutable.
no Moose;
__PACKAGE__->meta->make_immutable();

# A module file has to finish with a true value.
1;
77              
78             __END__
79              
80             =pod
81              
82             =encoding UTF-8
83              
84             =head1 NAME
85              
86             Bio::Roary::FilterUnknownsFromFasta - Take in fasta files, remove sequences with too many unknowns and return a list of the new files
87              
88             =head1 VERSION
89              
90             version 3.10.1
91              
92             =head1 SYNOPSIS
93              
94             Take in fasta files, remove sequences with too many unknowns and return a list of the new files
95             use Bio::Roary::FilterUnknownsFromFasta;
96              
97             my $obj = Bio::Roary::FilterUnknownsFromFasta->new(
98             fasta_files => [],
99             );
100             $obj->filtered_fasta_files();
101              
102             =head1 AUTHOR
103              
104             Andrew J. Page <ap13@sanger.ac.uk>
105              
106             =head1 COPYRIGHT AND LICENSE
107              
108             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
109              
110             This is free software, licensed under:
111              
112             The GNU General Public License, Version 3, June 2007
113              
114             =cut