|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Bio::Roary::MergeMultifastaAlignments;  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 $Bio::Roary::MergeMultifastaAlignments::VERSION = '3.10.1';  | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # ABSTRACT: Merge multifasta alignment files with equal numbers of sequences.  | 
| 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
6
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
9
 | 
 use Moose;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
    | 
| 
7
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
7893
 | 
 use Bio::SeqIO;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
21
 | 
    | 
| 
8
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
346
 | 
 use Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
570
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'multifasta_files'       => ( is => 'ro', isa => 'ArrayRef',   required => 1 );  | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'sample_names'           => ( is => 'ro', isa => 'ArrayRef',   required => 1 );  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'sample_names_to_genes'  => ( is => 'rw', isa => 'HashRef',    required => 1 );  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'output_filename'        => ( is => 'ro', isa => 'Str',        default  => 'core_alignment.aln' );  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'output_header_filename' => ( is => 'ro', isa => 'Str',        default  => 'core_alignment_header.embl' );  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has '_output_seqio_obj'      => ( is => 'ro', isa => 'Bio::SeqIO', lazy     => 1, builder => '_build__output_seqio_obj' );  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has '_gene_lengths'          => ( is => 'rw', isa => 'HashRef',    lazy     => 1, builder => '_build__gene_lengths' );  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has '_gene_to_sequence' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has '_sorted_multifasta_files' => ( is => 'rw', isa => 'ArrayRef', lazy => 1, builder => '_build__sorted_multifasta_files' );  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub BUILD {  | 
| 
21
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
0
  
 | 
5
 | 
     my ($self) = @_;  | 
| 
22
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
53
 | 
     $self->_gene_lengths;  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _input_seq_io_obj {  | 
| 
26
 | 
4
 | 
 
 | 
 
 | 
  
4
  
 | 
 
 | 
8
 | 
     my ( $self, $filename ) = @_;  | 
| 
27
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
33
 | 
     return Bio::SeqIO->new( -file => $filename, -format => 'Fasta' );  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _build__output_seqio_obj {  | 
| 
31
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
5
 | 
     my ($self) = @_;  | 
| 
32
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
48
 | 
     return Bio::SeqIO->new( -file => ">" . $self->output_filename, -format => 'Fasta' );  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _build__gene_lengths {  | 
| 
36
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
5
 | 
     my ($self) = @_;  | 
| 
37
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     my %gene_lengths;  | 
| 
38
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
     for my $filename ( @{ $self->_sorted_multifasta_files } ) {  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
59
 | 
    | 
| 
39
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
242
 | 
         my $seq_io = $self->_input_seq_io_obj($filename);  | 
| 
40
 | 
4
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
4953
 | 
         next unless ( defined($seq_io) );  | 
| 
41
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
         while ( my $seq_record = $seq_io->next_seq ) {  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # Save all of the gene sequences to memory, massive speedup but a bit naughty.  | 
| 
44
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1354
 | 
             $self->_gene_to_sequence->{$filename}->{ $seq_record->display_id } = $seq_record->seq;  | 
| 
45
 | 
8
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
110
 | 
             $gene_lengths{$filename} = $seq_record->length() if ( !defined( $gene_lengths{$filename} ) );  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
49
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
250
 | 
     return \%gene_lengths;  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _build__sorted_multifasta_files {  | 
| 
53
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
 
 | 
5
 | 
     my ($self) = @_;  | 
| 
54
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
     my @sorted_gene_files = sort @{ $self->multifasta_files };  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
47
 | 
    | 
| 
55
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
49
 | 
     return \@sorted_gene_files;  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _sequence_for_sample_from_gene_file {  | 
| 
59
 | 
9
 | 
 
 | 
 
 | 
  
9
  
 | 
 
 | 
15
 | 
     my ( $self, $sample_name, $gene_file ) = @_;  | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # loop over this to get the geneIDs  | 
| 
62
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
     for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) {  | 
| 
 
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
211
 | 
    | 
| 
63
 | 
14
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
347
 | 
         if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) {  | 
| 
64
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
177
 | 
             return $self->_gene_to_sequence->{$gene_file}->{$gene_id};  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
67
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     return $self->_padded_string_for_gene_file($gene_file);  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _padded_string_for_gene_file {  | 
| 
71
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
2
 | 
     my ( $self, $gene_file ) = @_;  | 
| 
72
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
22
 | 
     return '' unless ( defined( $self->_gene_lengths->{$gene_file} ) );  | 
| 
73
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
21
 | 
     return 'N' x ( $self->_gene_lengths->{$gene_file} );  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _create_merged_sequence_for_sample {  | 
| 
77
 | 
5
 | 
 
 | 
 
 | 
  
5
  
 | 
 
 | 
9
 | 
     my ( $self, $sample_name ) = @_;  | 
| 
78
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
     my $merged_sequence = '';  | 
| 
79
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     for my $gene_file ( @{ $self->_sorted_multifasta_files } ) {  | 
| 
 
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
127
 | 
    | 
| 
80
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
21
 | 
         $merged_sequence .= $self->_sequence_for_sample_from_gene_file( $sample_name, $gene_file );  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
82
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
     return $merged_sequence;  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub merge_files {  | 
| 
86
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
0
  
 | 
4
 | 
     my ($self) = @_;  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
88
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
     for my $sample_name ( @{ $self->sample_names } ) {  | 
| 
 
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
48
 | 
    | 
| 
89
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
627
 | 
         my $sequence = $self->_create_merged_sequence_for_sample($sample_name);  | 
| 
90
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
27
 | 
         my $seq_io = Bio::Seq->new( -display_id => $sample_name, -seq => $sequence );  | 
| 
91
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1295
 | 
         $self->_output_seqio_obj->write_seq($seq_io);  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # Create a header file which gives the coordinates of each gene in the multifasta  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     Bio::Roary::Output::CoreGeneAlignmentCoordinatesEMBL->new(  | 
| 
96
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
395
 | 
         multifasta_files => $self->_sorted_multifasta_files,  | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         gene_lengths     => $self->_gene_lengths,  | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         output_filename  => $self->output_header_filename  | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     )->create_file();  | 
| 
100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
101
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
64
 | 
     return 1;  | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
104
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
8
 | 
 no Moose;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
    | 
| 
105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	__PACKAGE__->meta->make_immutable;  | 
| 
106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
107
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =pod  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =encoding UTF-8  | 
| 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
115
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 NAME  | 
| 
116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Bio::Roary::MergeMultifastaAlignments - Merge multifasta alignment files with equal numbers of sequences.  | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 VERSION  | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 version 3.10.1  | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 SYNOPSIS  | 
| 
124
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Merge multifasta alignment files with equal numbers of sequences.So each sequence in each file gets concatenated together.  It is assumed the   | 
| 
126
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sequences are in the correct order.  | 
| 
127
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    use Bio::Roary::MergeMultifastaAlignments;  | 
| 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    my $obj = Bio::Roary::MergeMultifastaAlignments->new(  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      multifasta_files => [],  | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
      output_filename  => 'output_merged.aln'  | 
| 
132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    );  | 
| 
133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    $obj->merge_files;  | 
| 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 AUTHOR  | 
| 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
137
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Andrew J. Page <ap13@sanger.ac.uk>  | 
| 
138
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
139
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 COPYRIGHT AND LICENSE  | 
| 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.  | 
| 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
143
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This is free software, licensed under:  | 
| 
144
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   The GNU General Public License, Version 3, June 2007  | 
| 
146
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
147
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  |