File Coverage

Bio/DB/IndexedBase.pm

Criterion	Covered	Total	%
statement	286	323	88.5
branch	90	136	66.1
condition	36	57	63.1
subroutine	64	75	85.3
pod	20	20	100.0
total	496	611	81.1

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::DB::IndexedBase
3							#
4							# You may distribute this module under the same terms as perl itself
5							#
6
7							=head1 NAME
8
9							Bio::DB::IndexedBase - Base class for modules using indexed sequence files
10
11							=head1 SYNOPSIS
12
13							use Bio::DB::XXX; # a made-up class that uses Bio::IndexedBase
14
15							# 1/ Bio::SeqIO-style access
16
17							# Index some sequence files
18							my $db = Bio::DB::XXX->new('/path/to/file'); # from a single file
19							my $db = Bio::DB::XXX->new(['file1', 'file2']); # from multiple files
20							my $db = Bio::DB::XXX->new('/path/to/files/'); # from a directory
21
22							# Get IDs of all the sequences in the database
23							my @ids = $db->get_all_primary_ids;
24
25							# Get a specific sequence
26							my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
27
28							# Loop through all sequences
29							my $stream = $db->get_PrimarySeq_stream;
30							while (my $seq = $stream->next_seq) {
31							# Do something...
32							}
33
34
35							# 2/ Access via filehandle
36							my $fh = Bio::DB::XXX->newFh('/path/to/file');
37							while (my $seq = <$fh>) {
38							# Do something...
39							}
40
41
42							# 3/ Tied-hash access
43							tie %sequences, 'Bio::DB::XXX', '/path/to/file';
44							print $sequences{'CHROMOSOME_I:1,20000'};
45
46							=head1 DESCRIPTION
47
48							Bio::DB::IndexedBase provides a base class for modules that want to index
49							and read sequence files and provides persistent, random access to each sequence
50							entry, without bringing the entire file into memory. This module is compliant
51							with the Bio::SeqI interface and both. Bio::DB::Fasta and Bio::DB::Qual both use
52							Bio::DB::IndexedBase.
53
54							When you initialize the module, you point it at a single file, several files, or
55							a directory of files. The first time it is run, the module generates an index
56							of the content of the files using the AnyDBM_File module (BerkeleyDB preferred,
57							followed by GDBM_File, NDBM_File, and SDBM_File). Subsequently, it uses the
58							index file to find the sequence file and offset for any requested sequence. If
59							one of the source files is updated, the module reindexes just that one file. You
60							can also force reindexing manually at any time. For improved performance, the
61							module keeps a cache of open filehandles, closing less-recently used ones when
62							the cache is full.
63
64							Entries may have any line length up to 65,536 characters, and different line
65							lengths are allowed in the same file. However, within a sequence entry, all
66							lines must be the same length except for the last. An error will be thrown if
67							this is not the case!
68
69							This module was developed for use with the C. elegans and human genomes, and has
70							been tested with sequence segments as large as 20 megabases. Indexing the C.
71							elegans genome (100 megabases of genomic sequence plus 100,000 ESTs) takes ~5
72							minutes on my 300 MHz pentium laptop. On the same system, average access time
73							for any 200-mer within the C. elegans genome was E0.02s.
74
75							=head1 DATABASE CREATION AND INDEXING
76
77							The two constructors for this class are new() and newFh(). The former creates a
78							Bio::DB::IndexedBase object which is accessed via method calls. The latter
79							creates a tied filehandle which can be used Bio::SeqIO style to fetch sequence
80							objects in a stream fashion. There is also a tied hash interface.
81
82							=over
83
84							=item $db = Bio::DB::IndexedBase-Enew($path [,%options])
85
86							Create a new Bio::DB::IndexedBase object from the files designated by $path
87							$path may be a single file, an arrayref of files, or a directory containing
88							such files.
89
90							After the database is created, you can use methods like get_all_primary_ids()
91							and get_Seq_by_id() to retrieve sequence objects.
92
93							=item $fh = Bio::DB::IndexedBase-EnewFh($path [,%options])
94
95							Create a tied filehandle opened on a Bio::DB::IndexedBase object. Reading
96							from this filehandle with EE will return a stream of sequence objects,
97							Bio::SeqIO style. The path and the options should be specified as for new().
98
99							=item $obj = tie %db,'Bio::DB::IndexedBase', '/path/to/file' [,@args]
100
101							Create a tied-hash by tieing %db to Bio::DB::IndexedBase using the indicated
102							path to the files. The optional @args list is the same set used by new(). If
103							successful, tie() returns the tied object, undef otherwise.
104
105							Once tied, you can use the hash to retrieve an individual sequence by
106							its ID, like this:
107
108							my $seq = $db{CHROMOSOME_I};
109
110							The keys() and values() functions will return the sequence IDs and their
111							sequences, respectively. In addition, each() can be used to iterate over the
112							entire data set:
113
114							while (my ($id,$sequence) = each %db) {
115							print "$id => $sequence\n";
116							}
117
118
119							When dealing with very large sequences, you can avoid bringing them into memory
120							by calling each() in a scalar context. This returns the key only. You can then
121							use tied(%db) to recover the Bio::DB::IndexedBase object and call its methods.
122
123							while (my $id = each %db) {
124							print "$id: $db{$sequence:1,100}\n";
125							print "$id: ".tied(%db)->length($id)."\n";
126							}
127
128							In addition, you may invoke the FIRSTKEY and NEXTKEY tied hash methods directly
129							to retrieve the first and next ID in the database, respectively. This allows one to
130							write the following iterative loop using just the object-oriented interface:
131
132							my $db = Bio::DB::IndexedBase->new('/path/to/file');
133							for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
134							# do something with sequence
135							}
136
137							=back
138
139							=head1 INDEX CONTENT
140
141							Several attributes of each sequence are stored in the index file. Given a
142							sequence ID, these attributes can be retrieved using the following methods:
143
144							=over
145
146							=item offset($id)
147
148							Get the offset of the indicated sequence from the beginning of the file in which
149							it is located. The offset points to the beginning of the sequence, not the
150							beginning of the header line.
151
152							=item strlen($id)
153
154							Get the number of characters in the sequence string.
155
156							=item length($id)
157
158							Get the number of residues of the sequence.
159
160							=item linelen($id)
161
162							Get the length of the line for this sequence. If the sequence is wrapped, then
163							linelen() is likely to be much shorter than strlen().
164
165							=item headerlen($id)
166
167							Get the length of the header line for the indicated sequence.
168
169							=item header_offset
170
171							Get the offset of the header line for the indicated sequence from the beginning
172							of the file in which it is located. This attribute is not stored. It is
173							calculated from offset() and headerlen().
174
175							=item alphabet($id)
176
177							Get the molecular type (alphabet) of the indicated sequence. This method handles
178							residues according to the IUPAC convention.
179
180							=item file($id)
181
182							Get the the name of the file in which the indicated sequence can be found.
183
184							=back
185
186							=head1 INTERFACE COMPLIANCE NOTES
187
188							Bio::DB::IndexedBase is compliant with the Bio::DB::SeqI and hence with the
189							Bio::RandomAccessI interfaces.
190
191							Database do not necessarily provide any meaningful internal primary ID for the
192							sequences they store. However, Bio::DB::IndexedBase's internal primary IDs are
193							the IDs of the sequences. This means that the same ID passed to get_Seq_by_id()
194							and get_Seq_by_primary_id() will return the same sequence.
195
196							Since this database index has no notion of sequence version or namespace, the
197							get_Seq_by_id(), get_Seq_by_acc() and get_Seq_by_version() are identical.
198
199							=head1 BUGS
200
201							When a sequence is deleted from one of the files, this deletion is not detected
202							by the module and removed from the index. As a result, a "ghost" entry will
203							remain in the index and will return garbage results if accessed.
204
205							Also, if you are indexing a directory, it is wise to not add or remove files
206							from it.
207
208							In case you have changed the files in a directory, or the sequences in a file,
209							you can to rebuild the entire index, either by deleting it manually, or by
210							passing -reindex=E1 to new() when initializing the module.
211
212							=head1 SEE ALSO
213
214							L
215
216							L
217
218							L
219
220							=head1 AUTHOR
221
222							Lincoln Stein Elstein@cshl.orgE.
223
224							Copyright (c) 2001 Cold Spring Harbor Laboratory.
225
226							Florent Angly (for the modularization)
227
228							This library is free software; you can redistribute it and/or modify
229							it under the same terms as Perl itself. See DISCLAIMER.txt for
230							disclaimers of warranty.
231
232							=head1 APPENDIX
233
234							The rest of the documentation details each of the object
235							methods. Internal methods are usually preceded with a _
236
237							=cut
238
239
240							package Bio::DB::IndexedBase;
241
242							BEGIN {
243							@AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
244	2	50		2		83	if(!$INC{'AnyDBM_File.pm'});
245							}
246
247	2			2		10	use strict;
	2					3
	2					36
248	2			2		9	use warnings;
	2					4
	2					49
249	2			2		7	use IO::File;
	2					3
	2					291
250	2			2		570	use AnyDBM_File;
	2					4373
	2					80
251	2			2		12	use Fcntl;
	2					3
	2					484
252	2			2		14	use File::Spec;
	2					3
	2					42
253	2			2		9	use File::Basename qw(basename dirname);
	2					2
	2					118
254	2			2		649	use Bio::PrimarySeq;
	2					4
	2					59
255
256	2			2		11	use base qw(Bio::DB::SeqI);
	2					4
	2					476
257
258							# Store offset, strlen, linelen, headerlen, type and fileno
259	2			2		11	use constant STRUCT => 'NNNnnCa*'; # 32-bit file offset and seq length
	2					4
	2					116
260	2			2		15	use constant STRUCTBIG => 'QQQnnCa*'; # 64-bit
	2					3
	2					75
261
262	2			2		8	use constant NA => 0;
	2					4
	2					67
263	2			2		10	use constant DNA => 1;
	2					4
	2					62
264	2			2		11	use constant RNA => 2;
	2					3
	2					64
265	2			2		9	use constant PROTEIN => 3;
	2					4
	2					94
266
267							# You can avoid dying if you want but you may get incorrect results
268	2			2		10	use constant DIE_ON_MISSMATCHED_LINES => 1;
	2					3
	2					1334
269
270							# Remove carriage returns (\r) and newlines (\n) from a string. When
271							# called from subseq, this can take a signficiant portion of time, in
272							# Variant Effect Prediction. Therefore we compile the match portion.
273							sub _strip_crnl {
274	2			2		164	eval 'require Inline::C';
275	2	50				41581	if ( $INC{'Inline/C.pm'} ) {
276							# C can do _strip_crnl much faster. But this requires the
277							# Inline::C module which we don't require people to have. So we make
278							# this optional by wrapping the C code in an eval. If the eval works,
279							# the Perl strip_crnl() function is overwritten.
280	2					13	Inline->bind(
281							C => q(
282							/*
283							Strip all newlines (\n) and carriage returns (\r) from the string
284							*/
285							char* _strip_crnl(char* str) {
286							char *s;
287							char *s2 = str;
288							for (s = str; s; s++) {
289							if (s != '\n' && s != '\r') {
290							s2++ = s;
291							}
292							}
293							*s2 = '\0';
294							return str;
295							}
296							)
297							);
298							} else {
299							# "tr" is much faster than the regex, with "s"
300							*Bio::DB::IndexedBase::_strip_crnl = sub {
301	0			0		0	my $str = shift;
302	0					0	$str =~ tr/\n\r//d;
303	0					0	return $str;
304	0					0	};
305							}
306
307	2					1057515	return _strip_crnl(@_);
308							}
309
310							=head2 new
311
312							Title : new
313							Usage : my $db = Bio::DB::IndexedBase->new($path, -reindex => 1);
314							Function: Initialize a new database object
315							Returns : A Bio::DB::IndexedBase object
316							Args : A single file, or path to dir, or arrayref of files
317							Optional arguments:
318
319							Option Description Default
320							----------- ----------- -------
321							-glob Glob expression to search for files in directories *
322							-makeid A code subroutine for transforming IDs None
323							-maxopen Maximum size of filehandle cache 32
324							-debug Turn on status messages 0
325							-reindex Force the index to be rebuilt 0
326							-dbmargs Additional arguments to pass to the DBM routine None
327							-index_name Name of the file that will hold the indices
328							-clean Remove the index file when finished 0
329
330							The -dbmargs option can be used to control the format of the index. For example,
331							you can pass $DB_BTREE to this argument so as to force the IDs to be sorted and
332							retrieved alphabetically. Note that you must use the same arguments every time
333							you open the index!
334
335							The -makeid option gives you a chance to modify sequence IDs during indexing.
336							For example, you may wish to extract a portion of the gi\|gb\|abc\|xyz nonsense
337							that GenBank Fasta files use. The original header line can be recovered later.
338							The option value for -makeid should be a code reference that takes a scalar
339							argument (the full header line) and returns a scalar or an array of scalars (the
340							ID or IDs you want to assign). For example:
341
342							$db = Bio::DB::IndexedBase->new('file.fa', -makeid => \&extract_gi);
343
344							sub extract_gi {
345							# Extract GI from GenBank
346							my $header = shift;
347							my ($id) = ($header =~ /gi\\|(\d+)/m);
348							return $id \|\| '';
349							}
350
351							extract_gi() will be called with the full header line, e.g. a Fasta line would
352							include the "E", the ID and the description:
353
354							>gi\|352962132\|ref\|NG_030353.1\| Homo sapiens sal-like 3 (Drosophila) (SALL3)
355
356							In the database, this sequence can now be retrieved by its GI instead of its
357							complete ID:
358
359							my $seq = $db->get_Seq_by_id(352962132);
360
361							The -makeid option is ignored after the index is constructed.
362
363							=cut
364
365							sub new {
366	20			20	1	865	my ($class, $path, %opts) = @_;
367
368							my $self = bless {
369							debug => $opts{-debug} \|\| 0,
370							makeid => $opts{-makeid},
371							glob => $opts{-glob} \|\| eval '$'.$class.'::file_glob' \|\| '*',
372							maxopen => $opts{-maxopen} \|\| 32,
373							clean => $opts{-clean} \|\| 0,
374							dbmargs => $opts{-dbmargs} \|\| undef,
375							fhcache => {},
376							cacheseq => {},
377							curopen => 0,
378							openseq => 1,
379							dirname => undef,
380							offsets => undef,
381							index_name => $opts{-index_name},
382							obj_class => eval '$'.$class.'::obj_class',
383	20		50			1170	offset_meth => \&{$class.'::_calculate_offsets'},
	20		50			292
			50
			100
			50
384							fileno2path => [],
385							filepath2no => {},
386							}, $class;
387
388	20					74	my ($offsets, $dirname);
389	20		100			93	my $ref = ref $path \|\| '';
390	20	100				51	if ( $ref eq 'ARRAY' ) {
391	1					9	$offsets = $self->index_files($path, $opts{-reindex});
392	1					7	require Cwd;
393	1					5	$dirname = Cwd::getcwd();
394							} else {
395	19		66			133	$self->{index_name} \|\|= $self->_default_index_name($path);
396	19	100				131	if (-d $path) {
		50
397							# because Win32 glob() is broken with respect to long file names
398							# that contain whitespace.
399	9	50	33			58	$path = Win32::GetShortPathName($path)
400							if $^O =~ /^MSWin/i && eval 'use Win32; 1';
401	9					49	$offsets = $self->index_dir($path, $opts{-reindex});
402	8					26	$dirname = $path;
403							} elsif (-f _) {
404	10					49	$offsets = $self->index_file($path, $opts{-reindex});
405	9					355	$dirname = dirname($path);
406							} else {
407	0					0	$self->throw( "No file or directory called '$path'");
408							}
409							}
410	18					39	@{$self}{qw(dirname offsets)} = ($dirname, $offsets);
	18					53
411
412	18					155	return $self;
413							}
414
415
416							=head2 newFh
417
418							Title : newFh
419							Usage : my $fh = Bio::DB::IndexedBase->newFh('/path/to/files/', %options);
420							Function: Index and get a new Fh for a single file, several files or a directory
421							Returns : Filehandle object
422							Args : Same as new()
423
424							=cut
425
426							sub newFh {
427	1			1	1	71	my ($class, @args) = @_;
428	1					6	my $self = $class->new(@args);
429	1					6	require Symbol;
430	1					3	my $fh = Symbol::gensym;
431	1	50				59	tie $$fh, 'Bio::DB::Indexed::Stream', $self
432							or $self->throw("Could not tie filehandle: $!");
433	1					5	return $fh;
434							}
435
436
437							=head2 dbmargs
438
439							Title : dbmargs
440							Usage : my @args = $db->dbmargs;
441							Function: Get stored dbm arguments
442							Returns : Array
443							Args : None
444
445							=cut
446
447							sub dbmargs {
448	38			38	1	51	my $self = shift;
449	38	50				127	my $args = $self->{dbmargs} or return;
450	0	0				0	return ref($args) eq 'ARRAY' ? @$args : $args;
451							}
452
453
454							=head2 glob
455
456							Title : glob
457							Usage : my $glob = $db->glob;
458							Function: Get the expression used to match files in directories
459							Returns : String
460							Args : None
461
462							=cut
463
464							sub glob {
465	2			2	1	4	my $self = shift;
466	2					11	return $self->{glob};
467							}
468
469
470							=head2 index_dir
471
472							Title : index_dir
473							Usage : $db->index_dir($dir);
474							Function: Index the files that match -glob in the given directory
475							Returns : Hashref of offsets
476							Args : Dirname
477							Boolean to force a reindexing the directory
478
479							=cut
480
481							sub index_dir {
482	9			9	1	31	my ($self, $dir, $force_reindex) = @_;
483	9					1789	my @files = glob( File::Spec->catfile($dir, $self->{glob}) );
484	9	50				48	return if scalar @files == 0;
485	9		33			31	$self->{index_name} \|\|= $self->_default_index_name($dir);
486	9					45	my $offsets = $self->_index_files(\@files, $force_reindex);
487	8					39	return $offsets;
488							}
489
490
491							=head2 get_all_primary_ids
492
493							Title : get_all_primary_ids, get_all_ids, ids
494							Usage : my @ids = $db->get_all_primary_ids;
495							Function: Get the IDs stored in all indexes. This is a Bio::DB::SeqI method
496							implementation. Note that in this implementation, the internal
497							database primary IDs are also the sequence IDs.
498							Returns : List of ids
499							Args : None
500
501							=cut
502
503							sub get_all_primary_ids {
504	6			6	1	13	return keys %{shift->{offsets}};
	6					216
505							}
506
507							{
508	2			2		18	no warnings 'once';
	2					3
	2					731
509							ids = get_all_ids = \&get_all_primary_ids;
510							}
511
512
513							=head2 index_file
514
515							Title : index_file
516							Usage : $db->index_file($filename);
517							Function: Index the given file
518							Returns : Hashref of offsets
519							Args : Filename
520							Boolean to force reindexing the file
521
522							=cut
523
524							sub index_file {
525	10			10	1	30	my ($self, $file, $force_reindex) = @_;
526	10		33			31	$self->{index_name} \|\|= $self->_default_index_name($file);
527	10					32	my $offsets = $self->_index_files([$file], $force_reindex);
528	9					40	return $offsets;
529							}
530
531							sub _default_index_name {
532	18			18		42	my ($self,$path) = @_;
533	18	100				393	return File::Spec->catfile($path,'directory.index') if -d $path;
534	9					41	return "$path.index";
535							}
536
537							=head2 index_files
538
539							Title : index_files
540							Usage : $db->index_files(\@files);
541							Function: Index the given files
542							Returns : Hashref of offsets
543							Args : Arrayref of filenames
544							Boolean to force reindexing the files
545
546							=cut
547
548							sub index_files {
549	1			1	1	3	my ($self, $files, $force_reindex) = @_;
550	1					5	my @paths = map { File::Spec->rel2abs($_) } @$files;
	2					66
551	1					8	require Digest::MD5;
552	1					11	my $digest = Digest::MD5::md5_hex( join('', sort @paths) );
553	1		33			8	$self->{index_name} \|\|= "fileset_$digest.index"; # unique name for the given files
554	1					4	my $offsets = $self->_index_files($files, $force_reindex);
555	1					3	return $offsets;
556							}
557
558
559							=head2 index_name
560
561							Title : index_name
562							Usage : my $indexname = $db->index_name($path);
563							Function: Get the full name of the index file
564							Returns : String
565							Args : None
566
567							=cut
568
569							sub index_name {
570	38			38	1	1244	return shift->{index_name};
571							}
572
573
574							=head2 path
575
576							Title : path
577							Usage : my $path = $db->path($path);
578							Function: When a single file or a directory of files is indexed, this returns
579							the file directory. When indexing an arbitrary list of files, the
580							return value is the path of the current working directory.
581							Returns : String
582							Args : None
583
584							=cut
585
586							sub path {
587	0			0	1	0	return shift->{dirname};
588							}
589
590
591							=head2 get_PrimarySeq_stream
592
593							Title : get_PrimarySeq_stream
594							Usage : my $stream = $db->get_PrimarySeq_stream();
595							Function: Get a SeqIO-like stream of sequence objects. The stream supports a
596							single method, next_seq(). Each call to next_seq() returns a new
597							PrimarySeqI compliant sequence object, until no more sequences remain.
598							This is a Bio::DB::SeqI method implementation.
599							Returns : A Bio::DB::Indexed::Stream object
600							Args : None
601
602							=cut
603
604							sub get_PrimarySeq_stream {
605	3			3	1	6	my $self = shift;
606	3					21	return Bio::DB::Indexed::Stream->new($self);
607							}
608
609
610							=head2 get_Seq_by_id
611
612							Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id
613							Usage : my $seq = $db->get_Seq_by_id($id);
614							Function: Given an ID, fetch the corresponding sequence from the database.
615							This is a Bio::DB::SeqI and Bio::DB::RandomAccessI method implementation.
616							Returns : A sequence object
617							Args : ID
618
619							=cut
620
621							sub get_Seq_by_id {
622	22			22	1	640	my ($self, $id) = @_;
623	22	50				48	$self->throw('Need to provide a sequence ID') if not defined $id;
624	22	100				233	return if not exists $self->{offsets}{$id};
625	18					120	return $self->{obj_class}->new($self, $id);
626							}
627
628							{
629	2			2		13	no warnings 'once';
	2					6
	2					4151
630							get_Seq_by_version = get_Seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
631							}
632
633
634							=head2 _calculate_offsets
635
636							Title : _calculate_offsets
637							Usage : $db->_calculate_offsets($filename, $offsets);
638							Function: This method calculates the sequence offsets in a file based on ID and
639							should be implemented by classes that use Bio::DB::IndexedBase.
640							Returns : Hash of offsets
641							Args : File to process
642							Hashref of file offsets keyed by IDs.
643
644							=cut
645
646							sub _calculate_offsets {
647	0			0		0	my $self = shift;
648	0					0	$self->throw_not_implemented();
649							}
650
651
652							sub _index_files {
653							# Do the indexing of the given files using the index file on record
654	20			20		50	my ($self, $files, $force_reindex) = @_;
655
656	20					77	$self->_set_pack_method( @$files );
657
658							# Get name of index file
659	20					73	my $index = $self->index_name;
660
661							# If caller has requested reindexing, unlink the index file.
662	20	100				50	if ($force_reindex) {
663							# Tied-hash in Strawberry Perl creates "$file.index"
664	12	50				134	unlink $index if -e $index;
665							# Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
666	12	100				231	unlink "$index.dir" if -e "$index.dir";
667	12	100				162	unlink "$index.pag" if -e "$index.pag";
668							}
669
670							# Get the modification time of the index
671	20		50			224	my $indextime = (stat $index)[9] \|\| 0;
672
673							# Register files and find if there has been any update
674	20					39	my $modtime = 0;
675	20					44	my @updated;
676	20					54	for my $file (@$files) {
677							# Register file
678	63					1560	$self->_path2fileno(basename($file));
679							# Any update?
680	63		50			499	my $m = (stat $file)[9] \|\| 0;
681	63	100				142	if ($m > $modtime) {
682	20					32	$modtime = $m;
683							}
684	63	50				109	if ($m > $indextime) {
685	63					125	push @updated, $file;
686							}
687							}
688
689							# Get termination length from first file
690	20					82	$self->{termination_length} = $self->_calc_termination_length( $files->[0] );
691
692							# Reindex contents of changed files if needed
693	20		66			76	my $reindex = $force_reindex \|\| (scalar @updated > 0);
694	20	50				77	$self->{offsets} = $self->_open_index($index, $reindex) or return;
695	20	50				58	if ($reindex) {
696	20					37	$self->{indexing} = $index;
697	20					42	for my $file (@updated) {
698	62					3011	my $fileno = $self->_path2fileno(basename($file));
699	62					172	&{$self->{offset_meth}}($self, $fileno, $file, $self->{offsets});
	62					499
700							}
701	18					56	delete $self->{indexing};
702							}
703
704							# Closing and reopening might help corrupted index file problem on Windows
705	18					100	$self->_close_index($self->{offsets});
706
707	18					57	return $self->{offsets} = $self->_open_index($index);
708							}
709
710
711							sub _open_index {
712							# Open index file in read-only or write mode
713	38			38		82	my ($self, $index_file, $write) = @_;
714	38					53	my %offsets;
715	38	100				74	my $flags = $write ? O_CREAT\|O_RDWR : O_RDONLY;
716	38					111	my @dbmargs = $self->dbmargs;
717	38	50				1539	tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs
718							or $self->throw( "Could not open index file $index_file: $!");
719	38					204	return \%offsets;
720							}
721
722
723							sub _close_index {
724							# Close index file
725	45			45		81	my ($self, $index) = @_;
726	45					367	untie %$index;
727	45					75	return 1;
728							}
729
730							# Compiling the below regular expression speeds up _parse_compound_id
731							my $compound_id = qr/^ (.+?) (?:\:([\d_]+)(?:,\|-\|\.\.)([\d_]+))? (?:\/(.+))? $/x;
732
733							sub _parse_compound_id {
734							# Handle compound IDs:
735							# $db->seq($id)
736							# $db->seq($id, $start, $stop, $strand)
737							# $db->seq("$id:$start,$stop")
738							# $db->seq("$id:$start..$stop")
739							# $db->seq("$id:$start-$stop")
740							# $db->seq("$id:$start,$stop/$strand")
741							# $db->seq("$id:$start..$stop/$strand")
742							# $db->seq("$id:$start-$stop/$strand")
743							# $db->seq("$id/$strand")
744	45			45		91	my ($self, $id, $start, $stop, $strand) = @_;
745
746	45	100	33			402	if ( (not defined $start ) &&
			66
747							(not defined $stop ) &&
748							(not defined $strand) &&
749							($id =~ m{$compound_id}) ) {
750							# Start, stop and strand not provided and ID looks like a compound ID
751	20					88	($id, $start, $stop, $strand) = ($1, $2, $3, $4);
752							}
753
754							# Start, stop and strand defaults
755	45		100			107	$stop \|\|= $self->length($id) \|\| 0; # 0 if sequence not found in database
			100
756	45	100	100			97	$start \|\|= ($stop > 0) ? 1 : 0;
757	45		100			117	$strand \|\|= 1;
758
759							# Convert numbers such as 1_000_000 to 1000000
760	45					88	$start =~ s/_//g;
761	45					70	$stop =~ s/_//g;
762
763	45	100				86	if ($start > $stop) {
764							# Change the strand
765	8					22	($start, $stop) = ($stop, $start);
766	8					16	$strand *= -1;
767							}
768
769	45					153	return $id, $start, $stop, $strand;
770							}
771
772
773							sub _guess_alphabet {
774							# Determine the molecular type of the given sequence string:
775							# 'dna', 'rna', 'protein' or '' (unknown/empty)
776	2997			2997		4162	my ($self, $string) = @_;
777							# Handle IUPAC residues like PrimarySeq does
778	2997					5398	my $alphabet = Bio::PrimarySeq::_guess_alphabet_from_string($self, $string, 1);
779	2997	50				8303	return $alphabet eq 'dna' ? DNA
		100
		100
780							: $alphabet eq 'rna' ? RNA
781							: $alphabet eq 'protein' ? PROTEIN
782							: NA;
783							}
784
785
786							sub _makeid {
787							# Process the header line by applying any transformation given in -makeid
788	3053			3053		5120	my ($self, $header_line) = @_;
789	3053	100				10989	return ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($header_line) : $1;
790							}
791
792
793							sub _check_linelength {
794							# Check that the line length is valid. Generate an error otherwise.
795	105			105		212	my ($self, $linelength) = @_;
796	105	100				233	return if not defined $linelength;
797	96	50				284	$self->throw(
798							"Each line of the file must be less than 65,536 characters. Line ".
799							"$. is $linelength chars."
800							) if $linelength > 65535;
801							}
802
803
804							sub _calc_termination_length {
805							# Try the beginning of the file to determine termination length
806							# Account for crlf-terminated Windows and Mac files
807	20			20		50	my ($self, $file) = @_;
808	20	50				134	my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
809
810							# In Windows, text files have '\r\n' as line separator, but when reading in
811							# text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
812							# "length $_" will report 4 although the line is 5 bytes in length.
813							# We assume that all lines have the same line separator and only read current line.
814	20					1411	my $init_pos = tell($fh);
815	20					218	my $curr_line = <$fh>;
816	20					50	my $pos_diff = tell($fh) - $init_pos;
817	20					44	my $correction = $pos_diff - length $curr_line;
818	20					106	close $fh;
819
820	20	50				87	$self->{termination_length} = ($curr_line =~ /\r\n$/) ? 2 : 1+$correction;
821	20					69	return $self->{termination_length};
822							}
823
824
825							sub _calc_offset {
826							# Get the offset of the n-th residue of the sequence with the given ID
827							# and termination length (tl)
828	88			88		141	my ($self, $id, $n) = @_;
829	88					102	my $tl = $self->{termination_length};
830	88					93	$n--;
831	88					171	my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
	88					112
832	88	100				226	$n = 0 if $n < 0;
833	88	100				126	$n = $seqlen-1 if $n >= $seqlen;
834	88					254	return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl);
835							}
836
837
838							sub _fh {
839							# Given a sequence ID, return the filehandle on which to find this sequence
840	49			49		76	my ($self, $id) = @_;
841	49	50				92	$self->throw('Need to provide a sequence ID') if not defined $id;
842	49	100				100	my $file = $self->file($id) or return;
843	48	0				84	return eval {
844	48					528	$self->_fhcache( File::Spec->catfile($self->{dirname}, $file));
845							} or $self->throw( "Can't open file $file" );
846							}
847
848
849							sub _fhcache {
850	48			48		94	my ($self, $path) = @_;
851	48	100				102	if (!$self->{fhcache}{$path}) {
852	9	50				29	if ($self->{curopen} >= $self->{maxopen}) {
853	0					0	my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
854	0					0	keys %{$self->{fhcache}};
	0					0
855	0					0	splice(@lru, $self->{maxopen} / 3);
856	0					0	$self->{curopen} -= @lru;
857	0					0	for (@lru) {
858	0					0	delete $self->{fhcache}{$_};
859							}
860							}
861	9		50			50	$self->{fhcache}{$path} = IO::File->new($path) \|\| return;
862	9					648	binmode $self->{fhcache}{$path};
863	9					21	$self->{curopen}++;
864							}
865	48					85	$self->{cacheseq}{$path}++;
866	48					181	return $self->{fhcache}{$path};
867							}
868
869
870							#-------------------------------------------------------------
871							# Methods to store and retrieve data from indexed file
872							#
873
874							=head2 offset
875
876							Title : offset
877							Usage : my $offset = $db->offset($id);
878							Function: Get the offset of the indicated sequence from the beginning of the
879							file in which it is located. The offset points to the beginning of
880							the sequence, not the beginning of the header line.
881							Returns : String
882							Args : ID of sequence
883
884							=cut
885
886							sub offset {
887	1			1	1	2	my ($self, $id) = @_;
888	1	50				5	$self->throw('Need to provide a sequence ID') if not defined $id;
889	1	50				18	my $offset = $self->{offsets}{$id} or return;
890	1					3	return (&{$self->{unpackmeth}}($offset))[0];
	1					4
891							}
892
893
894							=head2 strlen
895
896							Title : strlen
897							Usage : my $length = $db->strlen($id);
898							Function: Get the number of characters in the sequence string.
899							Returns : Integer
900							Args : ID of sequence
901
902							=cut
903
904							sub strlen {
905	16			16	1	19	my ($self, $id) = @_;
906	16	50				25	$self->throw('Need to provide a sequence ID') if not defined $id;
907	16	50				83	my $offset = $self->{offsets}{$id} or return;
908	16					30	return (&{$self->{unpackmeth}}($offset))[1];
	16					24
909							}
910
911
912							=head2 length
913
914							Title : length
915							Usage : my $length = $db->length($id);
916							Function: Get the number of residues of the sequence.
917							Returns : Integer
918							Args : ID of sequence
919
920							=cut
921
922							sub length {
923	30			30	1	65	my ($self, $id) = @_;
924	30	50				70	$self->throw('Need to provide a sequence ID') if not defined $id;
925	30	100				273	my $offset = $self->{offsets}{$id} or return;
926	29					61	return (&{$self->{unpackmeth}}($offset))[2];
	29					59
927							}
928
929
930							=head2 linelen
931
932							Title : linelen
933							Usage : my $linelen = $db->linelen($id);
934							Function: Get the length of the line for this sequence.
935							Returns : Integer
936							Args : ID of sequence
937
938							=cut
939
940							sub linelen {
941	0			0	1	0	my ($self, $id) = @_;
942	0	0				0	$self->throw('Need to provide a sequence ID') if not defined $id;
943	0	0				0	my $offset = $self->{offsets}{$id} or return;
944	0					0	return (&{$self->{unpackmeth}}($offset))[3];
	0					0
945							}
946
947
948							=head2 headerlen
949
950							Title : headerlen
951							Usage : my $length = $db->headerlen($id);
952							Function: Get the length of the header line for the indicated sequence.
953							Returns : Integer
954							Args : ID of sequence
955
956							=cut
957
958							sub headerlen {
959	0			0	1	0	my ($self, $id) = @_;
960	0	0				0	$self->throw('Need to provide a sequence ID') if not defined $id;
961	0	0				0	my $offset = $self->{offsets}{$id} or return;
962	0					0	return (&{$self->{unpackmeth}}($offset))[4];
	0					0
963							}
964
965
966							=head2 header_offset
967
968							Title : header_offset
969							Usage : my $offset = $db->header_offset($id);
970							Function: Get the offset of the header line for the indicated sequence from
971							the beginning of the file in which it is located.
972							Returns : String
973							Args : ID of sequence
974
975							=cut
976
977							sub header_offset {
978	0			0	1	0	my ($self, $id) = @_;
979	0	0				0	$self->throw('Need to provide a sequence ID') if not defined $id;
980	0	0				0	return if not $self->{offsets}{$id};
981	0					0	return $self->offset($id) - $self->headerlen($id);
982							}
983
984
985							=head2 alphabet
986
987							Title : alphabet
988							Usage : my $alphabet = $db->alphabet($id);
989							Function: Get the molecular type of the indicated sequence: dna, rna or protein
990							Returns : String
991							Args : ID of sequence
992
993							=cut
994
995							sub alphabet {
996	13			13	1	30	my ($self, $id) = @_;
997	13	50				31	$self->throw('Need to provide a sequence ID') if not defined $id;
998	13	50				85	my $offset = $self->{offsets}{$id} or return;
999	13					29	my $alphabet = (&{$self->{unpackmeth}}($offset))[5];
	13					28
1000	13	100				89	return : $alphabet == Bio::DB::IndexedBase::DNA ? 'dna'
		100
		100
1001							: $alphabet == Bio::DB::IndexedBase::RNA ? 'rna'
1002							: $alphabet == Bio::DB::IndexedBase::PROTEIN ? 'protein'
1003							: '';
1004							}
1005
1006
1007							=head2 file
1008
1009							Title : file
1010							Usage : my $file = $db->file($id);
1011							Function: Get the the name of the file in which the indicated sequence can be
1012							found.
1013							Returns : String
1014							Args : ID of sequence
1015
1016							=cut
1017
1018							sub file {
1019	59			59	1	111	my ($self, $id) = @_;
1020	59	50				123	$self->throw('Need to provide a sequence ID') if not defined $id;
1021	59	100				424	my $offset = $self->{offsets}{$id} or return;
1022	58					131	return $self->_fileno2path((&{$self->{unpackmeth}}($offset))[6]);
	58					110
1023							}
1024
1025
1026							sub _fileno2path {
1027	58			58		103	my ($self, $fileno) = @_;
1028	58					225	return $self->{fileno2path}->[$fileno];
1029							}
1030
1031
1032							sub _path2fileno {
1033	125			125		319	my ($self, $path) = @_;
1034	125	100				409	if ( not exists $self->{filepath2no}->{$path} ) {
1035	63					184	my $fileno = ($self->{filepath2no}->{$path} = 0+ $self->{fileno}++);
1036	63					111	$self->{fileno2path}->[$fileno] = $path; # Save path
1037							}
1038	125					247	return $self->{filepath2no}->{$path};
1039
1040							}
1041
1042
1043							sub _packSmall {
1044	3052			3052		11033	return pack STRUCT, @_;
1045							}
1046
1047
1048							sub _packBig {
1049	0			0		0	return pack STRUCTBIG, @_;
1050							}
1051
1052
1053							sub _unpackSmall {
1054	209			209		1014	return unpack STRUCT, shift;
1055							}
1056
1057
1058							sub _unpackBig {
1059	0			0		0	return unpack STRUCTBIG, shift;
1060							}
1061
1062
1063							sub _set_pack_method {
1064							# Determine whether to use 32 or 64 bit integers for the given files.
1065	20			20		35	my $self = shift;
1066							# Find the maximum file size:
1067	20					49	my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
	88					106
	63					368
1068	20					35	my $fourGB = (2 ** 32) - 1;
1069
1070	20	50				46	if ($maxsize > $fourGB) {
1071							# At least one file exceeds 4Gb - we will need to use 64 bit ints
1072	0					0	$self->{packmeth} = \&_packBig;
1073	0					0	$self->{unpackmeth} = \&_unpackBig;
1074							} else {
1075	20					50	$self->{packmeth} = \&_packSmall;
1076	20					50	$self->{unpackmeth} = \&_unpackSmall;
1077							}
1078	20					35	return 1;
1079							}
1080
1081
1082							#-------------------------------------------------------------
1083							# Tied hash logic
1084							#
1085
1086							sub TIEHASH {
1087	2			2		42	return shift->new(@_);
1088							}
1089
1090
1091							sub FETCH {
1092	3			3		64	return shift->subseq(@_);
1093							}
1094
1095
1096							sub STORE {
1097	0			0		0	shift->throw("Read-only database");
1098							}
1099
1100
1101							sub DELETE {
1102	0			0		0	shift->throw("Read-only database");
1103							}
1104
1105
1106							sub CLEAR {
1107	0			0		0	shift->throw("Read-only database");
1108							}
1109
1110
1111							sub EXISTS {
1112	1			1		8	return defined shift->offset(@_);
1113							}
1114
1115
1116							sub FIRSTKEY {
1117	4			4		6	return tied(%{shift->{offsets}})->FIRSTKEY(@_);
	4					41
1118							}
1119
1120
1121							sub NEXTKEY {
1122	9			9		11	return tied(%{shift->{offsets}})->NEXTKEY(@_);
	9					44
1123							}
1124
1125
1126							sub DESTROY {
1127	27			27		2143	my $self = shift;
1128
1129							# Close filehandles
1130	27					48	while (my ($file, $fh) = each %{ $self->{fhcache} }) {
	37					260
1131	10	50				24	if (defined $fh) {
1132	10					62	$fh->close;
1133							}
1134							}
1135	27					98	$self->_close_index($self->{offsets});
1136
1137	27	100	100			115	if ( $self->{clean} \|\| $self->{indexing} ) {
1138							# Indexing aborted or cleaning requested. Delete the index file.
1139	6					10	my $index = $self->{index_name};
1140
1141							# Tied-hash in Strawberry Perl creates "$file.index"
1142	6	50				42	unlink $index if -e $index;
1143							# Tied-hash in ActivePerl creates "$file.index.pag" and "$file.index.dir"
1144	6	100				171	unlink "$index.dir" if -e "$index.dir";
1145	6	100				157	unlink "$index.pag" if -e "$index.pag";
1146							}
1147	27					262	return 1;
1148							}
1149
1150
1151							#-------------------------------------------------------------
1152							# stream-based access to the database
1153							#
1154
1155							package Bio::DB::Indexed::Stream;
1156	2			2		15	use base qw(Tie::Handle Bio::DB::SeqI);
	2					4
	2					630
1157
1158
1159							sub new {
1160	4			4		7	my ($class, $db) = @_;
1161	4					15	my $key = $db->FIRSTKEY;
1162	4					26	return bless {
1163							db => $db,
1164							key => $key
1165							}, $class;
1166							}
1167
1168							sub next_seq {
1169	10			10		17	my $self = shift;
1170	10					13	my ($key, $db) = @{$self}{'key', 'db'};
	10					21
1171	10	100				24	return if not defined $key;
1172	9					18	my $value = $db->get_Seq_by_id($key);
1173	9					29	$self->{key} = $db->NEXTKEY($key);
1174	9					25	return $value;
1175							}
1176
1177							sub TIEHANDLE {
1178	1			1		4	my ($class, $db) = @_;
1179	1					3	return $class->new($db);
1180							}
1181
1182							sub READLINE {
1183	1			1		2	my $self = shift;
1184	1		50			3	return $self->next_seq \|\| undef;
1185							}
1186
1187
1188							1;