File Coverage

lib/Bio/Roary/FilterUnknownsFromFasta.pm
Criterion Covered Total %
statement 42 42 100.0
branch 4 4 100.0
condition n/a
subroutine 9 9 100.0
pod n/a
total 55 55 100.0


line stmt bran cond sub pod time code
1             package Bio::Roary::FilterUnknownsFromFasta;
2             $Bio::Roary::FilterUnknownsFromFasta::VERSION = '3.10.2';
3             # ABSTRACT: Take in fasta files, remove sequences with too many unknowns and return a list of the new files
4              
5              
6 4     4   28 use Moose;
  4         10  
  4         30  
7 4     4   24388 use Bio::SeqIO;
  4         9  
  4         76  
8 4     4   54 use Cwd;
  4         9  
  4         241  
9 4     4   22 use Bio::Roary::Exceptions;
  4         27  
  4         83  
10 4     4   20 use File::Basename;
  4         6  
  4         1573  
11              
# Paths of the FASTA files to filter. Mandatory constructor argument.
has 'fasta_files' => (
    is       => 'ro',
    isa      => 'ArrayRef',
    required => 1,
);

# Boolean switch, enabled by default.
# NOTE(review): no method visible in this file reads this flag - confirm
# where (or whether) it is consulted.
has 'apply_unknowns_filter' => (
    is      => 'rw',
    isa     => 'Bool',
    default => 1,
);

# A sequence is dropped when its number of 'X' characters exceeds this
# percentage of its length (see _does_sequence_contain_too_many_unknowns).
has 'maximum_percentage_of_unknowns' => (
    is      => 'ro',
    isa     => 'Num',
    default => 5,
);

# Names of the filtered files, one per entry of 'fasta_files'. Built lazily,
# so the filtering itself only runs on first access.
has 'filtered_fasta_files' => (
    is      => 'ro',
    isa     => 'ArrayRef',
    lazy    => 1,
    builder => '_build_filtered_fasta_files',
);

# Maps each input file path to the name of its filtered output file.
# Filled in by _filter_fasta_sequences_and_return_new_file.
has 'input_fasta_to_output_fasta' => (
    is      => 'ro',
    isa     => 'HashRef',
    default => sub { {} },
);
19              
# Builder for the lazy 'filtered_fasta_files' attribute.
#
# Filters every file listed in 'fasta_files', in order, and returns an array
# reference holding the name of the filtered file produced for each input.
sub _build_filtered_fasta_files {
    my ($self) = @_;

    my @filtered_paths;
    for my $input_path ( @{ $self->fasta_files } ) {
        # Only the base name (first value returned by fileparse) is used to
        # name the output; the directory part of the input path is dropped.
        my ($base_name) = fileparse($input_path);

        push @filtered_paths,
            $self->_filter_fasta_sequences_and_return_new_file( $base_name, $input_path );
    }

    return \@filtered_paths;
}
32              
# Decide whether a sequence holds more unknown residues than permitted.
#
# $sequence_obj must provide length() and seq(). The permitted number of
# unknowns is 'maximum_percentage_of_unknowns' percent of the sequence length,
# truncated to an integer. Only upper-case 'X' characters are counted.
#
# Returns 1 when the count of 'X' exceeds the permitted number, 0 otherwise.
sub _does_sequence_contain_too_many_unknowns {
    my ( $self, $sequence_obj ) = @_;

    my $allowed_unknowns =
        int( $sequence_obj->length() * $self->maximum_percentage_of_unknowns / 100 );

    # A global match in list context yields one element per 'X' found.
    my @unknown_hits = ( $sequence_obj->seq() =~ /X/g );

    return scalar(@unknown_hits) > $allowed_unknowns ? 1 : 0;
}
47              
48              
# Copy one FASTA file to a new file, leaving out sequences that contain too
# many unknowns.
#
# $output_file - prefix for the output name; '.tmp.filtered.fa' is appended.
# $input_file  - path of the FASTA file to read.
#
# The input-to-output pairing is recorded in 'input_fasta_to_output_fasta'.
# Every sequence that is kept has its description cleared before it is
# written. Returns the name of the output file.
sub _filter_fasta_sequences_and_return_new_file {
    my ( $self, $output_file, $input_file ) = @_;

    my $filtered_path = $output_file . '.tmp.filtered.fa';

    # The writer is opened before the reader, as in the original ordering.
    my $writer = Bio::SeqIO->new( -file => ">" . $filtered_path, -format => 'Fasta' );
    my $reader = Bio::SeqIO->new( -file => $input_file,          -format => 'Fasta' );

    $self->input_fasta_to_output_fasta->{$input_file} = $filtered_path;

    while ( my $record = $reader->next_seq() ) {
        next if $self->_does_sequence_contain_too_many_unknowns($record);

        # strip out extra details put in by fastatranslate
        $record->description(undef);
        $writer->write_seq($record);
    }

    return $filtered_path;
}
70              
71              
72              
73 4     4   30 no Moose;
  4         10  
  4         52  
74             __PACKAGE__->meta->make_immutable;
75              
76             1;
77              
78             __END__
79              
80             =pod
81              
82             =encoding UTF-8
83              
84             =head1 NAME
85              
86             Bio::Roary::FilterUnknownsFromFasta - Take in fasta files, remove sequences with too many unknowns and return a list of the new files
87              
88             =head1 VERSION
89              
90             version 3.10.2
91              
92             =head1 SYNOPSIS
93              
94             Take in fasta files, remove sequences with too many unknowns and return a list of the new files
95             use Bio::Roary::FilterUnknownsFromFasta;
96              
97             my $obj = Bio::Roary::FilterUnknownsFromFasta->new(
98             fasta_files => [],
99             );
100             $obj->filtered_fasta_files();
101              
102             =head1 AUTHOR
103              
104             Andrew J. Page <ap13@sanger.ac.uk>
105              
106             =head1 COPYRIGHT AND LICENSE
107              
108             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
109              
110             This is free software, licensed under:
111              
112             The GNU General Public License, Version 3, June 2007
113              
114             =cut