File Coverage

lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm
Criterion Covered Total %
statement 83 83 100.0
branch 18 20 90.0
condition 6 12 50.0
subroutine 14 14 100.0
pod n/a
total 121 129 93.8


line stmt bran cond sub pod time code
1             package Bio::Roary::ExtractCoreGenesFromSpreadsheet;
2             $Bio::Roary::ExtractCoreGenesFromSpreadsheet::VERSION = '3.10.2';
3             # ABSTRACT: Take in a spreadsheet produced by the pipeline and identify the core genes.
4              
5              
6 4     4   90405 use Moose;
  4         395407  
  4         26  
7 4     4   25959 use Text::CSV;
  4         40646  
  4         171  
8 4     4   809 use Bio::Roary::GroupStatistics;
  4         20  
  4         173  
9 4     4   34 use POSIX;
  4         9  
  4         36  
10              
11             has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 );
12             has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV', lazy => 1, builder => '_build__csv_parser' );
13             has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' );
14             has 'ordered_core_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_core_genes' );
15             has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 );
16             has 'sample_names' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
17             has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
18             has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 );
19              
20             has '_number_of_isolates' => ( is => 'rw', isa => 'Int' );
21             has '_gene_column' => ( is => 'rw', isa => 'Int' );
22             has '_num_isolates_column' => ( is => 'rw', isa => 'Int' );
23             has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int' );
24             has '_genome_fragement_column' => ( is => 'rw', isa => 'Int' );
25             has '_order_within_fragement_column' => ( is => 'rw', isa => 'Int' );
26             has '_min_no_isolates_for_core' => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' );
27              
28             sub _build__min_no_isolates_for_core {
29 5     5   12 my ($self) = @_;
30 5         149 my $threshold = $self->_number_of_isolates * $self->core_definition;
31              
32 5         117 return $threshold;
33             }
34              
35             sub _build__csv_parser {
36 5     5   13 my ($self) = @_;
37 5         60 return Text::CSV->new( { binary => 1, always_quote => 1 } );
38             }
39              
40             sub _build__input_spreadsheet_fh {
41 5     5   11 my ($self) = @_;
42 5         182 open( my $fh, $self->spreadsheet );
43 5         143 return $fh;
44             }
45              
46             sub _update_number_of_isolates {
47 5     5   14 my ( $self, $header_row ) = @_;
48 5         8 my $number_of_isolates = @{$header_row} - @{ Bio::Roary::GroupStatistics->fixed_headers };
  5         10  
  5         46  
49 5         157 $self->_number_of_isolates($number_of_isolates);
50             }
51              
52             sub _setup_column_mappings {
53 5     5   18 my ( $self, $header_row ) = @_;
54              
55             # current ordering
56 5         38 my %columns_of_interest_mappings = (
57             'Gene' => 0,
58             'No. isolates' => 3,
59             'Avg sequences per isolate' => 5,
60             'Genome Fragment' => 6,
61             'Order within Fragment' => 7,
62             'QC' => 10,
63             );
64              
65             # Dynamically overwrite the default ordering
66 5         14 for ( my $i = 0 ; $i < @{$header_row} ; $i++ ) {
  90         138  
67 85         140 for my $col_name (%columns_of_interest_mappings) {
68 840 100       1167 if ( $header_row->[$i] eq $col_name ) {
69 30         35 $columns_of_interest_mappings{$col_name} = $i;
70 30         44 last;
71             }
72             }
73             }
74 5         161 $self->_gene_column( $columns_of_interest_mappings{'Gene'} );
75 5         134 $self->_num_isolates_column( $columns_of_interest_mappings{'No. isolates'} );
76 5         167 $self->_avg_sequences_per_isolate_column( $columns_of_interest_mappings{'Avg sequences per isolate'} );
77 5         137 $self->_genome_fragement_column( $columns_of_interest_mappings{'Genome Fragment'} );
78 5         133 $self->_order_within_fragement_column( $columns_of_interest_mappings{'Order within Fragment'} );
79 5         20 $self->_update_number_of_isolates($header_row);
80              
81             # Get the sample_names
82 5         9 my @sample_names;
83 5         18 for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$header_row} ; $i++ ) {
  20         36  
84 15         29 push( @sample_names, $header_row->[$i] );
85             }
86 5         162 $self->sample_names( \@sample_names );
87             }
88              
89             sub _length_of_fixed_headers {
90 388     388   507 my ($self) = @_;
91 388         365 return @{ Bio::Roary::GroupStatistics->fixed_headers() };
  388         585  
92             }
93              
94             sub _populate_sample_to_gene_lookup_with_row {
95 61     61   90 my ( $self, $row ) = @_;
96              
97 61         101 for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$row} ; $i++ ) {
  384         664  
98 323 100 66     885 if ( defined( $row->[$i] ) && $row->[$i] ne "" ) {
99 322         7012 my $sample_name = $self->sample_names->[ $i - $self->_length_of_fixed_headers() ];
100              
101 322         7499 $self->sample_names_to_genes->{$sample_name}->{ $row->[$i] } = 1;
102             }
103             }
104 61         1373 return 1;
105             }
106              
107             sub _ordered_core_genes {
108 5     5   11 my ($self) = @_;
109 5         6 my %ordered_genes;
110 5         113 while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) {
111 77 100       2061 next if ( @{$row} < 12 ); # no genes in group
  77         162  
112 76 50 33     1755 next if ( !defined( $row->[ $self->_gene_column ] ) || $row->[ $self->_gene_column ] eq '' ); # no gene name
113             next
114 76 50 33     1857 if ( !defined( $row->[ $self->_avg_sequences_per_isolate_column ] ) || $row->[ $self->_avg_sequences_per_isolate_column ] eq '' )
115             ; # no average
116             next
117 76 100 66     2045 if ( !defined( $row->[ $self->_genome_fragement_column ] ) || $row->[ $self->_genome_fragement_column ] eq '' )
118             ; # fragment not defined
119              
120             # next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates
121 74 100       1683 next if ( $row->[ $self->_num_isolates_column ] < $self->_min_no_isolates_for_core );
122              
123 64 100       1390 if ( $self->allow_paralogs ) {
124             # should never happen
125 5 100       117 next if ( $row->[ $self->_avg_sequences_per_isolate_column ] < 1 );
126             }
127             else {
128 59 100       1440 next if ( $row->[ $self->_avg_sequences_per_isolate_column ] != 1 );
129             }
130              
131 61         1315 $ordered_genes{ $row->[ $self->_genome_fragement_column ] }{ $row->[ $self->_order_within_fragement_column ] } =
132             $row->[ $self->_gene_column ];
133 61         125 $self->_populate_sample_to_gene_lookup_with_row($row);
134             }
135              
136 5         142 my @ordered_core_genes;
137 5         32 for my $fragment_key ( sort { $a <=> $b } keys %ordered_genes ) {
  2         7  
138 7         10 for my $order_within_fragement ( sort { $a <=> $b } keys %{ $ordered_genes{$fragment_key} } ) {
  234         222  
  7         39  
139 61         96 push( @ordered_core_genes, $ordered_genes{$fragment_key}{$order_within_fragement} );
140             }
141             }
142 5         151 return \@ordered_core_genes;
143             }
144              
145             sub _build_ordered_core_genes {
146 5     5   11 my ($self) = @_;
147 5         131 my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh );
148 5         270 $self->_setup_column_mappings($header_row);
149              
150 5         16 return $self->_ordered_core_genes();
151             }
152              
153 4     4   10732 no Moose;
  4         9  
  4         28  
154             __PACKAGE__->meta->make_immutable;
155              
156             1;
157              
158             __END__
159              
160             =pod
161              
162             =encoding UTF-8
163              
164             =head1 NAME
165              
166             Bio::Roary::ExtractCoreGenesFromSpreadsheet - Take in a spreadsheet produced by the pipeline and identify the core genes.
167              
168             =head1 VERSION
169              
170             version 3.10.2
171              
172             =head1 SYNOPSIS
173              
174             Take in a spreadsheet produced by the pipeline and identify the core genes.
175             use Bio::Roary::ExtractCoreGenesFromSpreadsheet;
176              
177             my $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new(
178             spreadsheet => 'group_statistics.csv',
179             );
180             $obj->ordered_core_genes();
181              
182             =head1 AUTHOR
183              
184             Andrew J. Page <ap13@sanger.ac.uk>
185              
186             =head1 COPYRIGHT AND LICENSE
187              
188             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
189              
190             This is free software, licensed under:
191              
192             The GNU General Public License, Version 3, June 2007
193              
194             =cut