File Coverage

lib/Bio/Roary/FilterFullClusters.pm
Criterion Covered Total %
statement 52 55 94.5
branch 7 10 70.0
condition 3 6 50.0
subroutine 10 10 100.0
pod 0 2 0.0
total 72 83 86.7


line stmt bran cond sub pod time code
1             package Bio::Roary::FilterFullClusters;
2             $Bio::Roary::FilterFullClusters::VERSION = '3.10.2';
3             # ABSTRACT: Take an a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
4              
5              
6 2     2   85663 use Moose;
  2         397155  
  2         15  
7 2     2   12473 use Bio::SeqIO;
  2         50892  
  2         1121  
8             with 'Bio::Roary::ClustersRole';
9              
10             has 'number_of_input_files' => ( is => 'ro', isa => 'Int', required => 1 );
11             has 'fasta_file' => ( is => 'ro', isa => 'Str', required => 1 );
12             has 'output_file' => ( is => 'ro', isa => 'Str', required => 1 );
13             has '_greater_than_or_equal' => ( is => 'ro', isa => 'Bool', default => 0 );
14             has 'cdhit_input_fasta_file' => ( is => 'ro', isa => 'Str', required => 1 );
15             has 'cdhit_output_fasta_file' => ( is => 'ro', isa => 'Str', required => 1 );
16              
17             has 'output_groups_file' => ( is => 'ro', isa => 'Str', required => 1 );
18              
19             has '_full_cluster_gene_names' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__full_cluster_gene_names' );
20             has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' );
21             has '_output_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__output_seqio' );
22              
23             has '_all_full_cluster_genes' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__all_full_cluster_genes' );
24              
25             sub _build__full_cluster_gene_names
26             {
27 1     1   3 my($self) = @_;
28            
29 1         2 my %full_cluster_gene_names ;
30            
31 1         2 for my $gene_name (keys %{$self->_clustered_genes})
  1         23  
32             {
33            
34 10 50       184 if($self->_greater_than_or_equal == 0)
35             {
36 0 0 0     0 if(defined($self->_clustered_genes->{$gene_name}) && @{$self->_clustered_genes->{$gene_name}} >= ($self->number_of_input_files -1))
  0         0  
37             {
38 0         0 $full_cluster_gene_names{$gene_name}++;
39             }
40             }
41             else
42             {
43 10 100 100     203 if(defined($self->_clustered_genes->{$gene_name}) && @{$self->_clustered_genes->{$gene_name}} == ($self->number_of_input_files -1))
  2         33  
44             {
45 1         3 $full_cluster_gene_names{$gene_name}++;
46             }
47             }
48             }
49            
50 1         21 return \%full_cluster_gene_names;
51             }
52              
53             sub _build__input_seqio {
54 1     1   3 my ($self) = @_;
55 1         23 return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' );
56             }
57              
58             sub _build__output_seqio {
59 1     1   2 my ( $self, $chunk_number ) = @_;
60 1         23 return Bio::SeqIO->new( -file => ">".$self->output_file, -format => 'Fasta' );
61             }
62              
63             sub _build__all_full_cluster_genes
64             {
65 1     1   3 my ($self) = @_;
66 1         1 my %full_cluster_genes;
67            
68 1         2 for my $gene_name (keys %{$self->_full_cluster_gene_names})
  1         21  
69             {
70 1         3 $full_cluster_genes{$gene_name}++;
71 1         2 for my $cluster_gene_name (@{$self->_clustered_genes->{$gene_name}})
  1         20  
72             {
73 5         17 $full_cluster_genes{$cluster_gene_name}++;
74             }
75             }
76 1         22 return \%full_cluster_genes;
77             }
78              
79              
80             sub _create_groups_file
81             {
82 1     1   3 my ($self) = @_;
83 1         34 open(my $out_fh, '>>', $self->output_groups_file);
84            
85 1         4 for my $gene_name (keys %{$self->_full_cluster_gene_names})
  1         31  
86             {
87 1         4 print {$out_fh} $gene_name."\t". join("\t", @{$self->_clustered_genes->{$gene_name}}). "\n";
  1         4  
  1         23  
88             }
89 1         37 close($out_fh);
90             }
91              
92              
93              
94             sub filter_complete_cluster_from_original_fasta
95             {
96 1     1 0 3 my ($self) = @_;
97              
98 1         29 my $input_seq_io = Bio::SeqIO->new( -file => $self->cdhit_input_fasta_file, -format => 'Fasta' );
99 1         629 my $output_seq_io = Bio::SeqIO->new( -file => ">".$self->cdhit_output_fasta_file, -format => 'Fasta' );
100            
101 1         547 while ( my $input_seq = $input_seq_io->next_seq() )
102             {
103 30 100       8603 unless(defined($self->_all_full_cluster_genes->{$input_seq->display_id}))
104             {
105 24         269 $output_seq_io->write_seq($input_seq);
106             }
107             }
108            
109 1         175 $self->_create_groups_file;
110 1         10 return $self;
111             }
112              
113             sub filter_full_clusters_from_fasta
114             {
115 1     1 0 3 my ($self) = @_;
116            
117 1         53 while ( my $input_seq = $self->_input_seqio->next_seq() ) {
118 10 100       1599 unless(defined($self->_full_cluster_gene_names->{$input_seq->display_id}))
119             {
120 9         279 $self->_output_seqio->write_seq($input_seq);
121             }
122             }
123 1         32 return $self;
124             }
125              
126 2     2   15 no Moose;
  2         4  
  2         14  
127             __PACKAGE__->meta->make_immutable;
128              
129             1;
130              
131             __END__
132              
133             =pod
134              
135             =encoding UTF-8
136              
137             =head1 NAME
138              
139             Bio::Roary::FilterFullClusters - Take an a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
140              
141             =head1 VERSION
142              
143             version 3.10.2
144              
145             =head1 SYNOPSIS
146              
147             Take an a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
148             use Bio::Roary::FilterFullClusters;
149              
150             my $obj = Bio::Roary::FilterFullClusters->new(
151             clusters_filename => $cluster_file,
152             fasta_file => $fasta_file,
153             number_of_input_files => 10,
154             output_file => 'filtered_file'
155             );
156             $obj->filter_full_clusters_from_fasta();
157              
158             =head1 AUTHOR
159              
160             Andrew J. Page <ap13@sanger.ac.uk>
161              
162             =head1 COPYRIGHT AND LICENSE
163              
164             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
165              
166             This is free software, licensed under:
167              
168             The GNU General Public License, Version 3, June 2007
169              
170             =cut