File Coverage

blib/lib/FASTAid.pm
Criterion Covered Total %
statement 61 62 98.3
branch 20 26 76.9
condition 1 3 33.3
subroutine 6 6 100.0
pod 2 2 100.0
total 90 99 90.9


line stmt bran cond sub pod time code
1             package FASTAid;
2              
3 1     1   41399 use version; $VERSION = qv('0.0.4');
  1         2686  
  1         7  
4              
5             =head1 NAME
6              
7             FASTAid - index a FASTA sequence database for fast random-access sequence retrieval
8              
9             =head1 VERSION
10              
11             This document describes FASTAid version 0.0.3
12              
13              
14             =head1 SYNOPSIS
15              
16             use FASTAid qw( create_index retrieve_entry );
17              
18             my $FASTA_file = 'lots_of_seqs.fa';
19              
20             # index the file of FASTA seqs
21             create_index($FASTA_file);
22              
23             # retrieve one or more FASTA seqs from the file
24             my $seq_array_ref = retrieve_entry($FASTA_file, 'NM_00204', 'AA23456');
25              
26             foreach my $seq ( @{$seq_array_ref} ) {
27              
28             ...do something with each FASTA sequence...
29             }
30              
31              
32             =head1 DESCRIPTION
33              
34             FASTAid indexes files containing FASTA sequence records and allows quick
35             random-access retrieval of one or more FASTA sequences.
36              
37             FASTAid writes the index to a file with the suffix '.fec'.
38              
39              
40             =head1 DIAGNOSTICS
41              
42             =over
43              
44             =item C<< could not open FASTA file >>
45              
46             A file could not be opened. Probably the path you supplied is incorrect or the permissions
47             are incorrect.
48              
49             =item C<< There is already an entry ID >>
50              
51             The same identifier appears more than once in the FASTA file you supplied. This is a fatal
52             error because FASTAid uses the identifier to index the position of the sequence.
53              
54             =item C<< Cannot write FASTAid index >>
55              
56             The index could not be written. This is a file system error, so probably you don't have
57             permissions to write in the directory.
58              
59             =item C<< Must supply at least one ID >>
60              
61             No identifiers were supplied as arguments to retrieve_entry. Since FASTAid uses the
62             identifier as the lookup, it can't retrieve an entry without an identifier.
63              
64             =item C<< Entry ID = not found! >>
65              
66             An identifier could not be found in the index. This is a warning, not a fatal error,
67             because if other identifiers are supplied to retrieve_entry, those sequences will be
68             returned even if others fail.
69              
70             There are two common causes for this error: either the index is out of date and the
71             identifier doesn't exist in the index, or the identifier was misspelled when attempting
72             the lookup.
73              
74             =back
75              
76             =head1 DEPENDENCIES
77              
78             L
79              
80              
81             =head1 INCOMPATIBILITIES
82              
83             None reported.
84              
85              
86             =head1 BUGS AND LIMITATIONS
87              
88             No bugs have been reported.
89              
90             Please report any bugs or feature requests to
91             C, or through the web interface at
92             L.
93              
94              
95             =head1 AUTHOR
96              
97             Jarret Glasscock C<< >>
98             Dave Messina C<< >>
99              
100              
101             =head1 ACKNOWLEDGMENTS
102              
103             This software was developed at the Genome Sequencing Center at Washington
104             University, St. Louis, MO.
105              
106              
107             =head1 COPYRIGHT
108              
109             Copyright (C) 2004-6 Glasscock, Messina. All Rights Reserved.
110              
111              
112             =head1 DISCLAIMER
113              
114             This software is provided "as is" without warranty of any kind.
115              
116              
117             =cut
118              
119              
120             # PRAGMAS
121 1     1   121 use strict;
  1         1  
  1         29  
122 1     1   6 use warnings;
  1         6  
  1         130  
123              
124             # INCLUDES
125 1     1   6 use Carp;
  1         2  
  1         970  
126              
127              
128             =head2 create_index
129              
130             Usage : create_index(my_fasta_file) or die "index was not created";
131             Function : creates a byte index file representing positions of FASTA formatted entries.
132             Returns : returns true upon success of index creation, false upon failure
133             Args : a single argument, the path to a FASTA file
134              
135             =cut
136              
137             sub create_index {
138 5     5 1 3016 my ($fasta) = @_;
139 5         19 my $index = $fasta . '.fec';
140              
141 5         7 my ( %data, $begin, $id );
142 5 50       335 open( DB, $fasta )
143             or croak( qq{could not open FASTA file $fasta:\n}, qq{$!\n} );
144              
145             # record offsets of records in perl database
146 5         93 while () {
147 22 100       96 if ( /^>(\S+)/) {
148 7         18 $id = $1;
149 7         16 $begin = tell(DB) - length($_);
150 7 100       21 if ( defined $data{$id} ) {
151 1         19 croak "There is already an entry ID = $id\n";
152             }
153 6         34 else { $data{$id} = "$begin" }
154             }
155             }
156 4         38 close DB;
157              
158             # test for empty index
159 4 100       21 return 0 if (scalar (keys %data) == 0);
160              
161             # Write out index
162 3 50       366 open( OUT, ">$index" )
163             or croak( qq{Cannot write FASTAid index $index:\n}, qq{$!\n} );
164 3         18 foreach my $key ( sort { $a cmp $b } keys %data ) {
  2         7  
165 5         26 print OUT $key, " ", $data{$key}, "\n";
166             }
167 3         157 close OUT;
168              
169 3         25 return 1;
170             }
171              
172             =head2 retrieve_entry
173              
174             Usage : my $array_ref = retrieve_entry(FASTA_file_path, identifier1, identifier2, ..);
175             Function : retrieves FASTA sequence given index and identifier(s).
176             Returns : returns a reference to an array of FASTA entries
177             Args : FASTA_file_path, identifier1, identifier2, ..
178              
179             =cut
180              
181             sub retrieve_entry {
182 4     4 1 782 my ( $fasta, @ids ) = @_;
183              
184             # make sure we have IDs to retrieve
185 4 100       40 croak "Must supply at least one ID" if scalar @ids == 0;
186              
187 3         5 my @seqs; # where we store the FASTAs we're returning
188              
189             # Get the INDEX into an array so we can easily work with it
190 3         4 my ( @IND1, @IND2 );
191 3         8 my $index = $fasta . '.fec';
192              
193 3 50       105 open( INDEX, $index )
194             or croak( qq{cannot open FASTAid index $index:\n}, qq{$!\n} );
195 3         41 while () {
196 5 50       30 if ( $_ =~ /^(\S+)\s+(\S+)$/ ) {
197 5         13 push @IND1, $1;
198 5         31 push @IND2, $2;
199             }
200             }
201 3         29 close INDEX;
202              
203             # Get the FASTA file into an index before each $id is looked for
204 3 50       81 open( DB, "$fasta" )
205             or croak( qq{cannot open FASTA file $fasta:\n}, qq{$!\n} );
206              
207 3         8 foreach my $id (@ids) {
208 5         9 my ( $low, $high ) = ( 0, @IND1 - 1 );
209 5         9 my $seq; # the FASTA we're returning
210              
211             my $try;
212             TRY:
213 5         12 while ( $low <= $high ) {
214 7         18 $try = int( ( $low + $high ) / 2 ); # middle
215 7 100       19 $low = $try + 1, next TRY if $IND1[$try] lt $id;
216 6 100       18 $high = $try - 1, next TRY if $IND1[$try] gt $id;
217 5         6 last; # found it!!!
218             }
219              
220             # translate this back to the corresponding value
221             # in the byte index portion of lookup
222 5         10 my $trans_try = $IND2[$try];
223              
224             # go to the offset in the FASTA file
225 5 50 33     32 if ( $trans_try >= 0 && $IND1[$try] eq $id ) {
226 5         32 seek DB, $trans_try, 0; # position the file pointer there
227 5         51 my $def = ; # defline is first line
228 5         11 $seq = $def; # put the defline into our
229             # return string
230              
231             ENTRY:
232 5         18 while () { # save the other lines
233 14 100       32 last ENTRY if $_ =~ /^>/;
234 12         45 $seq .= $_;
235             }
236              
237             # add the seq to the array we're returning
238 5         17 push @seqs, $seq;
239             }
240             else {
241 0         0 carp "Entry ID = $id not found!\n";
242             }
243             }
244              
245             # send back an array of FASTA sequences
246 3         16 return \@seqs;
247             }
248              
249             1;