File Coverage

lib/Bio/Roary/Output/GroupsMultifastaNucleotide.pm
Criterion Covered Total %
statement 71 108 65.7
branch 5 18 27.7
condition 0 3 0.0
subroutine 20 23 86.9
pod 0 1 0.0
total 96 153 62.7


line stmt bran cond sub pod time code
1             package Bio::Roary::Output::GroupsMultifastaNucleotide;
2             $Bio::Roary::Output::GroupsMultifastaNucleotide::VERSION = '3.10.2';
3             # ABSTRACT: Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences.
4              
5              
6 3     3   19 use Moose;
  3         16  
  3         16  
7 3     3   16966 use Bio::SeqIO;
  3         8655  
  3         86  
8 3     3   17 use File::Path qw(make_path);
  3         7  
  3         131  
9 3     3   15 use File::Basename;
  3         7  
  3         153  
10 3     3   20 use File::Copy;
  3         6  
  3         121  
11 3     3   14 use File::Temp qw/ tempfile /;
  3         7  
  3         151  
12 3     3   16 use Bio::Roary::Exceptions;
  3         34  
  3         55  
13 3     3   14 use Bio::Roary::AnalyseGroups;
  3         7  
  3         49  
14 3     3   12 use Bio::Tools::GFF;
  3         5  
  3         2904  
15             with 'Bio::Roary::BedFromGFFRole';
16              
17             has 'gff_file' => ( is => 'ro', isa => 'Str', required => 1 );
18             has 'group_names' => ( is => 'ro', isa => 'ArrayRef', required => 0 );
19             has 'output_directory' => ( is => 'ro', isa => 'Str', required => 1 );
20             has 'pan_reference_groups_seen' => ( is => 'rw', isa => 'HashRef', required => 1 );
21             has 'number_of_gff_files' => ( is => 'ro', isa => 'Int', required => 1 );
22             has 'pan_reference_filename' => ( is => 'ro', isa => 'Str',default => 'pan_genome_reference.fa' );
23             has 'dont_delete_files' => ( is => 'ro', isa => 'Bool',default => 0 );
24             has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
25              
26             has 'annotate_groups' => ( is => 'ro', isa => 'Bio::Roary::AnnotateGroups', required => 1 );
27             has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
28              
29             has 'fasta_file' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_fasta_file' );
30             has '_input_seqio' => ( is => 'ro', isa => 'Bio::SeqIO', lazy => 1, builder => '_build__input_seqio' );
31              
32             has 'output_filename' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build_output_filename' );
33              
34             sub _build_output_filename
35             {
36 1     1   2 my ($self) = @_;
37 1         27 my ( $filename, $directories, $suffix ) = fileparse($self->gff_file);
38 1         27 return join('/',($self->output_directory, $filename.'.tmp_nuc_sequences.fa' ));
39             }
40              
41             sub _build__input_seqio {
42 1     1   2 my ($self) = @_;
43 1         29 return Bio::SeqIO->new( -file => $self->fasta_file, -format => 'Fasta' );
44             }
45              
46             sub _bed_output_filename {
47 3     3   8 my ($self) = @_;
48 3         105 return join( '.', ( $self->output_filename, 'intermediate.bed' ) );
49             }
50              
51             sub populate_files {
52 1     1 0 3 my ($self) = @_;
53 1         28 while ( my $input_seq = $self->_input_seqio->next_seq() )
54             {
55 0 0       0 if ( $self->annotate_groups->_ids_to_groups->{$input_seq->display_id} )
56             {
57 0         0 my $current_group = $self->annotate_groups->_ids_to_groups->{$input_seq->display_id};
58 0         0 my $gene_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$current_group};
59              
60 0 0       0 if(! defined($self->pan_reference_groups_seen->{$current_group}))
61             {
62 0         0 my $pan_output_seq = $self->_pan_genome_reference_io_obj($current_group);
63 0 0       0 $pan_output_seq->write_seq(Bio::Seq->new( -display_id => $input_seq->display_id, -desc => ($gene_name ? $gene_name : $current_group), -seq => $input_seq->seq ) );
64 0         0 $self->pan_reference_groups_seen->{$current_group} = 1;
65             }
66              
67 0         0 my $number_of_genes = @{$self->annotate_groups->_groups_to_id_names->{$current_group}};
  0         0  
68             # Theres no need to align noncore files
69 0 0 0     0 next if($self->dont_delete_files == 0 && $number_of_genes < ($self->core_definition * $self->number_of_gff_files ));
70            
71 0         0 my $output_seq = $self->_group_seq_io_obj($current_group,$number_of_genes);
72 0         0 $output_seq->write_seq($input_seq);
73             }
74             }
75              
76 0         0 unlink($self->fasta_file);
77 0         0 1;
78             }
79              
80             sub _group_file_name
81             {
82 0     0   0 my ($self,$group_name,$num_group_genes) = @_;
83 0         0 my $annotated_group_name = $self->annotate_groups->_groups_to_consensus_gene_names->{$group_name};
84 0         0 $annotated_group_name =~ s!\W!_!gi;
85 0         0 my $filename = $annotated_group_name.'.fa';
86 0         0 my $group_file_name = join('/',($self->output_directory, $filename ));
87 0         0 return $group_file_name;
88             }
89              
90              
91             sub _pan_genome_reference_io_obj
92             {
93 0     0   0 my ($self) = @_;
94 0         0 return Bio::SeqIO->new( -file => ">>".$self->pan_reference_filename, -format => 'Fasta' );
95             }
96              
97              
98             sub _group_seq_io_obj
99             {
100 0     0   0 my ($self,$group_name,$num_group_genes) = @_;
101 0         0 my $filename = $self->_group_file_name($group_name,$num_group_genes);
102 0         0 return Bio::SeqIO->new( -file => ">>".$filename, -format => 'Fasta' );
103             }
104              
105              
106             sub _extracted_nucleotide_fasta_file_from_bed_filename {
107 2     2   5 my ($self) = @_;
108 2         48 return join( '.', ( $self->output_filename, 'intermediate.extracted.fa' ) );
109             }
110              
111             sub _create_nucleotide_fasta_file_from_gff {
112 1     1   2 my ($self) = @_;
113            
114 1         28 open(my $input_fh, $self->gff_file);
115 1         7 open(my $output_fh, '>', $self->_nucleotide_fasta_file_from_gff_filename);
116 1         7 my $at_sequence = 0;
117 1         16 while(<$input_fh>)
118             {
119 271         327 my $line = $_;
120 271 100       380 if($line =~/^>/)
121             {
122 1         8 $at_sequence = 1;
123             }
124            
125 271 100       380 if($at_sequence == 1)
126             {
127 252         251 print {$output_fh} $line;
  252         522  
128             }
129             }
130 1         7 close($input_fh);
131 1         23 close($output_fh);
132             }
133              
134             sub _nucleotide_fasta_file_from_gff_filename {
135 4     4   13 my ($self) = @_;
136 4         182 return join( '.', ( $self->output_filename, 'intermediate.fa' ) );
137             }
138              
139             sub _extract_nucleotide_regions {
140 1     1   2 my ($self) = @_;
141              
142 1         4 $self->_create_nucleotide_fasta_file_from_gff;
143 1         6 $self->_create_bed_file_from_gff;
144              
145 1         182 my $cmd =
146             'bedtools getfasta -s -fi '
147             . $self->_nucleotide_fasta_file_from_gff_filename
148             . ' -bed '
149             . $self->_bed_output_filename . ' -fo '
150             . $self->_extracted_nucleotide_fasta_file_from_bed_filename
151             . ' -name > /dev/null 2>&1';
152 1         4932 system($cmd);
153 1         24 unlink( $self->_nucleotide_fasta_file_from_gff_filename );
154 1         15 unlink( $self->_bed_output_filename );
155 1         8 unlink( $self->_nucleotide_fasta_file_from_gff_filename . '.fai' );
156 1         8 return $self->_extracted_nucleotide_fasta_file_from_bed_filename;
157             }
158              
159             sub _cleanup_fasta {
160 1     1   4 my ($self,$infile) = @_;
161            
162 1         90 my($fh, $outfile) = tempfile();
163 1 50       615 return unless ( -e $infile );
164              
165 0         0 open( my $in, '<', $infile );
166 0         0 open( my $out, '>', $outfile );
167 0         0 while ( my $line = <$in> ) {
168 0         0 chomp $line;
169 0 0       0 $line =~ s/"//g if ( $line =~ /^>/ );
170            
171 0 0       0 if($line =~ /^(>[^:]+)/)
172             {
173 0         0 $line = $1;
174             }
175 0         0 print $out "$line\n";
176             }
177 0         0 close $in;
178 0         0 close $out;
179            
180 0         0 move( $outfile, $infile);
181 0         0 return $infile;
182             }
183              
184              
185             sub _build_fasta_file {
186 1     1   2 my ($self) = @_;
187 1         3 my $fasta_filename = $self->_extract_nucleotide_regions;
188 1         7 return $self->_cleanup_fasta($fasta_filename);
189             }
190              
191 3     3   23 no Moose;
  3         5  
  3         20  
192             __PACKAGE__->meta->make_immutable;
193              
194             1;
195              
196             __END__
197              
198             =pod
199              
200             =encoding UTF-8
201              
202             =head1 NAME
203              
204             Bio::Roary::Output::GroupsMultifastaNucleotide - Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences.
205              
206             =head1 VERSION
207              
208             version 3.10.2
209              
210             =head1 SYNOPSIS
211              
212             Take in a GFF files and a groups file and output one multifasta file per group with nucleotide sequences.
213             use Bio::Roary::Output::GroupsMultifastas;
214              
215             my $obj = Bio::Roary::Output::GroupsMultifastasNucleotide->new(
216             group_names => ['aaa','bbb'],
217             );
218             $obj->populate_files();
219              
220             =head1 AUTHOR
221              
222             Andrew J. Page <ap13@sanger.ac.uk>
223              
224             =head1 COPYRIGHT AND LICENSE
225              
226             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
227              
228             This is free software, licensed under:
229              
230             The GNU General Public License, Version 3, June 2007
231              
232             =cut