File Coverage

lib/Bio/Roary/UniqueGenesPerSample.pm
Criterion Covered Total %
statement 35 36 97.2
branch 7 12 58.3
condition n/a
subroutine 6 6 100.0
pod 0 1 0.0
total 48 55 87.2


line stmt bran cond sub pod time code
1             package Bio::Roary::UniqueGenesPerSample;
2             $Bio::Roary::UniqueGenesPerSample::VERSION = '3.10.2';
3             # ABSTRACT: Take in the clustered file and produce a sorted file with the frequency of each sample's unique genes
4              
5              
6 1     1   151841 use Moose;
  1         512461  
  1         7  
7 1     1   8960 use Bio::Roary::Exceptions;
  1         3  
  1         543  
8              
9             has 'clustered_proteins' => ( is => 'rw', isa => 'Str', default => 'clustered_proteins' );
10             has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'unique_genes_per_sample.tsv' );
11              
12             has '_output_fh' => ( is => 'ro', lazy => 1, builder => '_build__output_fh' );
13              
14             sub _build__output_fh {
               # Builder for the lazy '_output_fh' attribute: opens the file named by
               # output_filename for writing ('>' mode) and returns the filehandle.
               # Throws Bio::Roary::Exceptions::CouldntWriteToFile if the open fails.
15 1     1   4 my ($self) = @_;
16 1 50       49 open( my $fh, '>', $self->output_filename )
17             or Bio::Roary::Exceptions::CouldntWriteToFile->throw( error => "Couldnt write output file:" . $self->output_filename );
18 1         45 return $fh;
19             }
20              
21             # Example input line (a cluster holding a single gene): group_17585: 14520_6#21_00645
22             sub _sample_to_gene_freq {
               # Reads the clustered_proteins file line by line and counts, per sample,
               # the lines whose text after the ': ' separator is a single
               # whitespace-free gene id. The sample name is the gene id with its
               # trailing '_<digits>' suffix removed.
               # Returns a hash reference mapping sample name => count.
               # Throws Bio::Roary::Exceptions::FileNotFound if the file cannot be opened.
               # NOTE(review): the open below uses the two-argument form, so the mode is
               # taken from the filename itself; the three-argument form with '<' would
               # be safer. The input handle is also never explicitly closed.
23 2     2   7 my ($self) = @_;
24              
25 2 50       159 open( my $input_fh, $self->clustered_proteins )
26             or Bio::Roary::Exceptions::FileNotFound->throw( error => "Couldnt read input file:" . $self->clustered_proteins );
27              
28 2         9 my %sample_to_gene_freq;
29 2         36 while (<$input_fh>) {
30 12         27 chomp;
31 12         22 my $line = $_;
               # skip lines shorter than 6 characters
32 12 50       30 next if ( length( $line ) < 6 );
               # capture the whitespace-free token that ends the line after a ': ' separator
33 12 100       71 if ( $line =~ /^.+: ([^\s]+)$/ ) {
34 10         31 my $gene_id = $1;
               # the sample name is everything before the final '_<digits>' suffix
35 10 50       45 if ( $gene_id =~ /^(.+)_[\d]+$/ ) {
36 10         26 my $sample_name = $1;
37 10         61 $sample_to_gene_freq{$sample_name}++;
38             }
39             else {
40             # the gene id does not end in '_<digits>', so it may not be valid; ignore it
41 0         0 next;
42             }
43             }
44             else {
45             # it's either an invalid line or there's more than 1 gene in the cluster
46 2         10 next;
47             }
48             }
49              
50 2         49 return \%sample_to_gene_freq;
51             }
52              
53             sub write_unique_frequency {
               # Writes one "<sample>\t<count>\n" line per sample to the _output_fh
               # handle, using the counts returned by _sample_to_gene_freq. Samples are
               # ordered by count, highest first, with ties broken by sample name in
               # ascending string order. Closes the handle afterwards and returns 1.
               # NOTE(review): the return value of close() is not checked, so a failed
               # flush of buffered output would go unnoticed.
54 1     1 0 5 my ($self) = @_;
55              
56 1         3 my %sample_to_gene_freq = %{$self->_sample_to_gene_freq};
  1         4  
57            
               # numeric descending on the count, then string ascending on the sample name
58 1 50       14 for my $sample ( sort { $sample_to_gene_freq{$b} <=> $sample_to_gene_freq{$a} || $a cmp $b } keys %sample_to_gene_freq ) {
  5         24  
59 4         10 print { $self->_output_fh } $sample . "\t" . $sample_to_gene_freq{$sample} . "\n";
  4         143  
60             }
61 1         38 close($self->_output_fh);
62 1         14 return 1;
63             }
64              
65 1     1   8 no Moose;
  1         2  
  1         7  
66             __PACKAGE__->meta->make_immutable;
67              
68             1;
69              
70             __END__
71              
72             =pod
73              
74             =encoding UTF-8
75              
76             =head1 NAME
77              
78             Bio::Roary::UniqueGenesPerSample - Take in the clustered file and produce a sorted file with the frequency of each sample's unique genes
79              
80             =head1 VERSION
81              
82             version 3.10.2
83              
84             =head1 SYNOPSIS
85              
86             Take in the clustered file and produce a sorted file with the frequency of each sample's unique genes
87             use Bio::Roary::UniqueGenesPerSample;
88              
89             my $obj = Bio::Roary::UniqueGenesPerSample->new(
90             clustered_proteins => 'clustered_proteins',
91             output_filename => 'output_filename',
92             );
93             $obj->write_unique_frequency;
94              
95             =head1 AUTHOR
96              
97             Andrew J. Page <ap13@sanger.ac.uk>
98              
99             =head1 COPYRIGHT AND LICENSE
100              
101             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
102              
103             This is free software, licensed under:
104              
105             The GNU General Public License, Version 3, June 2007
106              
107             =cut