File Coverage

lib/Bio/Roary/FilterFullClusters.pm
Criterion Covered Total %
statement 52 55 94.5
branch 7 10 70.0
condition 3 6 50.0
subroutine 10 10 100.0
pod 0 2 0.0
total 72 83 86.7


line stmt bran cond sub pod time code
1             package Bio::Roary::FilterFullClusters;
2             $Bio::Roary::FilterFullClusters::VERSION = '3.11.0';
3             # ABSTRACT: Take a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
4              
5              
6 2     2   88336 use Moose;
  2         367601  
  2         12  
7 2     2   12369 use Bio::SeqIO;
  2         51680  
  2         1068  
8             with 'Bio::Roary::ClustersRole';
9              
# --- Required constructor arguments ---------------------------------------

# Number of input files; a cluster is compared against this value minus one
# when deciding whether it is "full".
has 'number_of_input_files' => (
    is       => 'ro',
    isa      => 'Int',
    required => 1,
);

# FASTA file read by filter_full_clusters_from_fasta.
has 'fasta_file' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# FASTA file written by filter_full_clusters_from_fasta.
has 'output_file' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# Selects how cluster size is compared against the threshold when building
# the set of full clusters.
has '_greater_than_or_equal' => (
    is      => 'ro',
    isa     => 'Bool',
    default => 0,
);

# FASTA file read by filter_complete_cluster_from_original_fasta.
has 'cdhit_input_fasta_file' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# FASTA file written by filter_complete_cluster_from_original_fasta.
has 'cdhit_output_fasta_file' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# Tab-separated groups file that full clusters are appended to.
has 'output_groups_file' => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# --- Lazily built internal state ------------------------------------------

# Hashref keyed by the gene name of every full cluster.
has '_full_cluster_gene_names' => (
    is      => 'ro',
    isa     => 'HashRef',
    lazy    => 1,
    builder => '_build__full_cluster_gene_names',
);

# Reader over fasta_file.
has '_input_seqio' => (
    is      => 'ro',
    isa     => 'Bio::SeqIO',
    lazy    => 1,
    builder => '_build__input_seqio',
);

# Writer onto output_file.
has '_output_seqio' => (
    is      => 'ro',
    isa     => 'Bio::SeqIO',
    lazy    => 1,
    builder => '_build__output_seqio',
);

# Hashref keyed by every gene that belongs to a full cluster (the cluster's
# key gene plus all of its listed members).
has '_all_full_cluster_genes' => (
    is      => 'ro',
    isa     => 'HashRef',
    lazy    => 1,
    builder => '_build__all_full_cluster_genes',
);
24              
# Builder for the lazy '_full_cluster_gene_names' attribute.
#
# Walks every cluster in $self->_clustered_genes (gene name => arrayref of
# member gene names) and keeps the clusters whose member count meets the
# threshold of number_of_input_files - 1:
#   * _greater_than_or_equal true  => member count >= threshold
#   * _greater_than_or_equal false => member count == threshold (default)
#
# Returns a hashref keyed by the gene name of each full cluster.
sub _build__full_cluster_gene_names {
    my ($self) = @_;

    # These values do not change while iterating, so fetch them once.
    my $clustered_genes = $self->_clustered_genes;
    my $threshold       = $self->number_of_input_files - 1;
    my $allow_larger    = $self->_greater_than_or_equal;

    my %full_cluster_gene_names;
    for my $gene_name ( keys %{$clustered_genes} ) {
        my $members = $clustered_genes->{$gene_name};
        next unless defined $members;

        my $member_count = scalar @{$members};

        # Test the flag for truth rather than comparing it numerically: a
        # Moose Bool may be '' or undef as well as 0.
        my $is_full =
              $allow_larger
            ? $member_count >= $threshold
            : $member_count == $threshold;

        $full_cluster_gene_names{$gene_name}++ if $is_full;
    }

    return \%full_cluster_gene_names;
}
52              
# Builder for the lazy '_input_seqio' attribute: opens fasta_file for
# reading as a Fasta-format Bio::SeqIO stream.
sub _build__input_seqio {
    my $self = shift;

    my @reader_args = (
        -file   => $self->fasta_file,
        -format => 'Fasta',
    );
    return Bio::SeqIO->new(@reader_args);
}
57              
# Builder for the lazy '_output_seqio' attribute: opens output_file for
# writing (the leading '>' selects write mode) as a Fasta-format
# Bio::SeqIO stream.
sub _build__output_seqio {
    my $self = shift;

    my $write_target = ">" . $self->output_file;
    my @writer_args  = (
        -file   => $write_target,
        -format => 'Fasta',
    );
    return Bio::SeqIO->new(@writer_args);
}
62              
# Builder for the lazy '_all_full_cluster_genes' attribute.
#
# For every full cluster, records the cluster's key gene and each gene in
# its member list. Returns a hashref keyed by gene name; values count how
# many times the name was encountered.
sub _build__all_full_cluster_genes {
    my $self = shift;

    my %seen_in_full_cluster;
    for my $key_gene ( keys %{ $self->_full_cluster_gene_names } ) {
        $seen_in_full_cluster{$key_gene}++;
        $seen_in_full_cluster{$_}++
            for @{ $self->_clustered_genes->{$key_gene} };
    }

    return \%seen_in_full_cluster;
}
78              
79              
# Appends one line per full cluster to output_groups_file.
#
# Each line is the cluster's key gene name followed by its member gene
# names, tab-separated and newline-terminated. The file is opened in
# append mode, so existing content is kept.
#
# Dies if the file cannot be opened or if closing it reports an error
# (buffered write failures surface at close). Returns true on success.
sub _create_groups_file {
    my ($self) = @_;

    my $groups_file = $self->output_groups_file;
    open( my $out_fh, '>>', $groups_file )
        or die "Couldn't open groups file '$groups_file' for appending: $!\n";

    for my $gene_name ( keys %{ $self->_full_cluster_gene_names } ) {
        my @cluster_members = @{ $self->_clustered_genes->{$gene_name} };
        print {$out_fh} join( "\t", $gene_name, @cluster_members ) . "\n";
    }

    close($out_fh)
        or die "Couldn't close groups file '$groups_file': $!\n";

    return 1;
}
91              
92              
93              
# Copies cdhit_input_fasta_file to cdhit_output_fasta_file, leaving out
# every sequence whose display id is a key of _all_full_cluster_genes,
# then appends the full clusters to the groups file.
#
# Returns $self.
sub filter_complete_cluster_from_original_fasta {
    my $self = shift;

    my $reader = Bio::SeqIO->new(
        -file   => $self->cdhit_input_fasta_file,
        -format => 'Fasta',
    );
    my $writer = Bio::SeqIO->new(
        -file   => ">" . $self->cdhit_output_fasta_file,
        -format => 'Fasta',
    );

    while ( my $record = $reader->next_seq() ) {
        # Sequences already assigned to a full cluster are dropped.
        next if defined $self->_all_full_cluster_genes->{ $record->display_id };
        $writer->write_seq($record);
    }

    $self->_create_groups_file;
    return $self;
}
112              
# Streams the sequences of the input FASTA and writes to the output FASTA
# every sequence whose display id is not a key of _full_cluster_gene_names.
#
# Returns $self.
sub filter_full_clusters_from_fasta {
    my $self = shift;

    my $reader = $self->_input_seqio;
    while ( my $record = $reader->next_seq() ) {
        next if defined $self->_full_cluster_gene_names->{ $record->display_id };

        # The writer is fetched here, not before the loop, so the lazy
        # attribute is only built once there is something to write.
        $self->_output_seqio->write_seq($record);
    }

    return $self;
}
125              
126 2     2   16 no Moose;
  2         3  
  2         12  
127             __PACKAGE__->meta->make_immutable;
128              
129             1;
130              
131             __END__
132              
133             =pod
134              
135             =encoding UTF-8
136              
137             =head1 NAME
138              
139             Bio::Roary::FilterFullClusters - Take a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
140              
141             =head1 VERSION
142              
143             version 3.11.0
144              
145             =head1 SYNOPSIS
146              
147             Take a clusters file from CD-hit and the fasta file and output a fasta file without full clusters
148             use Bio::Roary::FilterFullClusters;
149              
150             my $obj = Bio::Roary::FilterFullClusters->new(
151             clusters_filename => $cluster_file,
152             fasta_file => $fasta_file,
153             number_of_input_files => 10,
154             output_file => 'filtered_file'
155             );
156             $obj->filter_full_clusters_from_fasta();
157              
158             =head1 AUTHOR
159              
160             Andrew J. Page <ap13@sanger.ac.uk>
161              
162             =head1 COPYRIGHT AND LICENSE
163              
164             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
165              
166             This is free software, licensed under:
167              
168             The GNU General Public License, Version 3, June 2007
169              
170             =cut