File Coverage

blib/lib/Sort/Merge.pm
Criterion Covered Total %
statement 46 46 100.0
branch 7 8 87.5
condition 2 2 100.0
subroutine 6 6 100.0
pod 1 3 33.3
total 62 65 95.3


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2             # Copyright 2003, James Mastros,
3             # Released under the same terms as perl itself.
4             # See bottom for full copyright info.
5             package Sort::Merge;
6              
7 3     3   118953 use warnings;
  3         8  
  3         137  
8 3     3   16 use strict;
  3         5  
  3         98  
9 3     3   14443 use Data::Dumper;
  3         48580  
  3         1517  
10              
11             our $VERSION = 0.01;
12              
13             # "Big" data structure is sources.
14             # This is an array(ref) of arrayrefs.
15             # Each inner array has three elements.
16             # The first is a coderef, which should return the next two elements when called.
17             # The second is the sort key for the next data point.
18             # The third is the data point itself.
19              
20             # Given a sources array, makes sure that each has it's next datapoint loaded,
21             # and returns the new sources list.
22             sub prime {
23 51     51 0 50 my @sources=@{shift @_};
  51         86  
24 51         52 my $indexes=shift;
25 51   100     106 $indexes ||= [0..$#sources];
26 51         62 my @indexes=@$indexes;
27            
28             # print "In prime: ", Dumper(\@sources);
29 51         71 foreach my $source (@sources[@$indexes]) {
30 55 50       166 if (not defined $source->[1]) {
31             # print "Priming.\n";
32 55         106 my ($key, $data) = $source->[0]->();
33 55 100       261 if (!defined $key) {
34             # print "Source dead.\n";
35             # We can't just delete it here, so mark it undef, and filter them out later.
36 8         12 $source=undef;
37             }
38             # print "Key: $key, Data: $data\n";
39 55         62 $source->[1] = $key;
40 55         170 $source->[2] = $data;
41             }
42             }
43            
44             # print "Before purge: ", Dumper(\@sources);
45 51         68 @sources = grep {defined $_->[0]} @sources;
  131         226  
46             # print "After purge: ", Dumper(\@sources);
47              
48 51         118 return @sources;
49             }
50              
51             # Given a sources arrayref, and a coderef to call with each data point, sort.
52             sub sort_inner {
53 5     5 0 8 my $sources=shift;
54 5         10 my $output=shift;
55              
56 5         11 my @sources=@$sources;
57 5         7 my $lastsource;
58 5         21 while (@sources) {
59 51         88 @sources=prime(\@sources, $lastsource);
60 51 100       106 last if !@sources;
61              
62             # print Dumper \@sources;
63            
64 47         52 my($firstsource,$earliesttime)=(0,~0);
65            
66 47         62 foreach (0..$#sources) {
67 123 100       218 if ($earliesttime > $sources[$_][1]) {
68 69         60 $earliesttime=$sources[$_][1];
69 69         77 $firstsource=$_;
70             }
71             }
72 47         94 $output->($sources[$firstsource]);
73 47         148 $sources[$firstsource][1] = undef;
74 47         52 $sources[$firstsource][2] = undef;
75 47         115 $lastsource=[$firstsource];
76             }
77             }
78              
79             # Given an arrayref of input coderefs, and a single output coderef, sort.
80             sub sort_coderefs {
81 5     5 1 3028 my $inputs=shift;
82 5         10 my $output=shift;
83 5         7 my @sources;
84              
85 5         15 @sources = map {[$_]} @$inputs;
  8         26  
86 5         20 return sort_inner(\@sources, $output);
87             }
88              
89             =head1 NAME
90              
91             Sort::Merge - general merge sort
92              
93             =head1 SYNOPSIS
94              
95             use Sort::Merge;
96             use IO::File;
97             my @output;
98             Sort::Merge::sort_coderefs([sub{our $i=0; return ($i++ x 2)},
99             sub{our $j=0; return ($j++*5 x 2)}],
100             sub{print shift->[1});
101              
102             =head1 DESCRIPTION
103              
104             Merge sorting is a technique for merging together multiple input
105             streams, each of which is sorted, into one larger output stream, still
106             sorted. This module implements that algorithm.
107              
108             Note the large caveat above: the inputs must already be sorted. If
109             the inputs aren't sorted, you'll need to sort them yourself first, in
110             which case a merge sort may not be the right algorithm for the job.
111              
112             Sort::Merge was designed to give you, the programmer, control over
113             where your data comes from, and where it goes -- it's simply supposed
114             to sort it, and not fetch it from files, parse out sort keys, etc,
115             etc. There's at least one merge-sorting module already on CPAN,
116             File::MergeSort, but I wanted one that I could use to sort together
117             multiple files that were of different formats, and not line-oriented
118             formats either.
119              
120             The module I got out doesn't even require that the inputs are files at
121             all, or that the output gets printed. It also streams the output, so
122             you needn't wait until the sort is complete before printing the first
123             piece of output. (The fact that this is possible at all is one of
124             more useful properties of merge sorts vs. other sorts.)
125              
126             Most (OK, at the moment all) of the available interfaces require you to
127             write your input and output handlers in terms of coderefs. If this is
128             too much work for you, I encourage you to not use this module, or,
129             alternatively, to propose another interface to me. I'm actively
130             looking for more and better interfaces.
131              
132             =head2 Sources
133              
134             There are two types of coderefs Sort::Merge makes you write, sources
135             and the output.
136              
137             Sources are sources of input data. They are called with no
138             parameters. They are expected to return a two-element list if they
139             have more data.
140              
141             The first element should be a sort key, which will be compared
142             lexographicly (using cmp). If you wish to think in terms of numerical
143             sort keys, simply run it through chr, pack, or sprintf to get a
144             representation that will sort properly. (Passing it to chr requires
145             that the number is a nonnegative integer, and may warn for some
146             values, but is quite fast.)
147              
148             The second element may be any arbitrary scalar. This is your
149             datapoint. It is passed to the output subroutine without being
150             examined. It can be a line of text, an arrayref or hashref, or even
151             an object, or undef for all Sort::Merge cares.
152              
153             If the source has run out of data, it should return nothing (an empty
154             list, not C). If it wishes to signal an error and abort
155             processing, it can simply C; the error will be propagated.
156              
157             =head2 Output
158              
159             Because I'm lazy, the output callback gets passed the source
160             structure. That is, it gets an arrayref of three values. The first
161             is the coderef, the second is the key, the third is the data.
162              
163             To put it another way, it gets an arrayref of the coderef, followed by
164             what the coderef has returned.
165              
166             =head2 C
167              
168             C takes an array-ref of source coderefs, and a single output coderef.
169              
170             The output coderef is called for each element, in sorted order, and
171             the function returns empty-list.
172              
173             =head1 SEE ALSO
174              
175             L, the source code.
176              
177             =head1 COPYRIGHT AND DISCLAIMERS
178              
179             Copyright (c) 2003 James M. Mastros. All rights reserved.
180              
181             This library is free software; you can redistribute it and/or modify it
182             under the same terms as Perl itself.
183              
184             This program is distributed in the hope that it will be useful, but
185             without any warranty; without even the implied warranty of
186             merchantability or fitness for a particular purpose.
187              
188             =head1 AUTHOR
189              
190             James Mastros Ejames@mastros.bizE, L
191              
192             =cut