File Coverage

blib/lib/Parse/ExuberantCTags/Merge.pm
Criterion Covered Total %
statement 188 191 98.4
branch 41 58 70.6
condition 7 11 63.6
subroutine 20 20 100.0
pod 1 2 50.0
total 257 282 91.1


line stmt bran cond sub pod time code
1             package Parse::ExuberantCTags::Merge;
2              
3 2     2   47439 use 5.006001;
  2         6  
  2         63  
4 2     2   11 use strict;
  2         14  
  2         64  
5 2     2   9 use warnings;
  2         21  
  2         115  
6              
7             our $VERSION = '1.01';
8 2     2   10 use constant DEBUG => 0;
  2         4  
  2         174  
9              
10 2     2   10 use constant SMALL_DEFAULT => 2**22;
  2         3  
  2         84  
11 2     2   10 use constant SUPER_SMALL_DEFAULT => 2**17;
  2         4  
  2         87  
12              
13 2     2   11 use constant FILENAME => 0;
  2         4  
  2         90  
14 2     2   7 use constant SORTED => 1;
  2         3  
  2         152  
15              
16 2     2   8 use constant MRG_LINE => 0;
  2         3  
  2         92  
17 2     2   20 use constant MRG_FH => 1;
  2         2  
  2         133  
18              
19             use Class::XSAccessor
20 2         18 constructor => 'new',
21             accessors => {
22             small_size_threshold => 'small_size_threshold',
23             super_small_size_threshold => 'super_small_size_threshold',
24             tempdir => 'tempdir',
25 2     2   1669 };
  2         5819  
26              
27 2     2   568 use Carp ();
  2         4  
  2         29  
28 2     2   2421 use File::Temp ();
  2         54941  
  2         58  
29 2     2   37 use File::Spec ();
  2         4  
  2         33  
30 2     2   2314 use Parse::ExuberantCTags::Merge::SimpleScopeGuard;
  2         5  
  2         3785  
31              
32             sub add_file {
33 21     21 1 14360 my $self = shift;
34 21         38 my $file = shift;
35 21 50       55 Carp::croak("Need file argument")
36             if not defined $file;
37 21 50       319 Carp::croak("Input file '$file' does not exist")
38             if not -f $file;
39            
40 21         91 my %opts = @_;
41 21         34 my $sorted = $opts{sorted};
42              
43 21   100     93 $self->{files} ||= [];
44 21         28 push @{$self->{files}}, [$file, $sorted];
  21         58  
45 21         103 return();
46             }
47              
48              
49             sub write {
50 12     12 0 696 my $self = shift;
51 12         18 my $outfile = shift;
52 12 50       29 Carp::croak("Need output file argument")
53             if not defined $outfile;
54              
55             # determine temporary directory
56 12         30 my $tmpdir = $self->tempdir;
57 12 50 33     39 if (not defined $tmpdir or not -d $tmpdir) {
58 12         188 $tmpdir = File::Spec->tmpdir();
59 12         34 $self->tempdir($tmpdir);
60             }
61              
62 12         53 my $total_size = 0;
63 12         17 my $sorted_size = 0;
64 12         16 my $unsorted_size = 0;
65 12         18 my @sorted;
66             my @unsorted;
67              
68 12         26 my $files = $self->{files};
69 12 50 33     61 Carp::croak("Need input files")
70             if not defined $files or @$files == 0;
71              
72             # only one sorted input file => copy
73 12 100 100     53 if (@$files == 1 and $files->[0][SORTED]) {
74 1         2 warn "Only one sorted input file => copying" if DEBUG;
75 1         3 my $infile = $files->[0][FILENAME];
76 1 50       43 open my $fh, '<', $infile
77             or die "Opening input file '$infile' for reading failed: $!";
78 1 50       65 open my $ofh, '>', $outfile
79             or die "Opening output file '$outfile' for writing failed: $!";
80              
81 1         16 print $ofh "!_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted/\n";
82              
83 1         5 local $/ = \1000000;
84 1         60 while (<$fh>) {
85 1         10 print $ofh $_;
86             }
87 1         20 close $fh;
88 1         92 close $ofh;
89 1         10 return(1);
90             }
91            
92             # calculate the file sizes
93 11         26 foreach my $file (@$files) {
94 20         28 my $fname = $file->[FILENAME];
95 20         228 my $s = -s $fname;
96 20         26 $total_size += $s;
97 20 100       46 if ($file->[SORTED]) {
98 9         14 $sorted_size += $s;
99 9         26 push @sorted, $fname;
100             }
101             else {
102 11         13 $unsorted_size += $s;
103 11         36 push @unsorted, $fname;
104             }
105             }
106              
107             # get size thresholds
108 11         27 my $threshold_super_small = $self->super_small_size_threshold();
109 11 100       27 $threshold_super_small = SUPER_SMALL_DEFAULT if not defined $threshold_super_small;
110 11         26 my $threshold_small = $self->small_size_threshold();
111 11 100       26 $threshold_small = SMALL_DEFAULT if not defined $threshold_small;
112 11         12 warn "Thresholds: tiny=$threshold_super_small small=$threshold_small" if DEBUG > 1;
113              
114             # storage of temporary files and guard to clean them up on scope exit
115 11         15 my @tmpfiles;
116 11         128 my $guard = Parse::ExuberantCTags::Merge::SimpleScopeGuard->new(files => \@tmpfiles);
117              
118             # select sort strategy
119              
120             # everything small, sort all in memory regardless
121 11 100       26 if ($total_size < $threshold_super_small) {
122 5         8 warn "Total size < super-small-threshold => memory sort" if DEBUG;
123 5 50       305 open my $ofh, '>', $outfile
124             or die "Could not open output file '$outfile' for writing: $!";
125 5         18 return $self->_memory_sort($ofh, @sorted, @unsorted);
126             }
127            
128             # This must handle the unsorted files
129 6 100       16 if (@unsorted) {
130 4         6 warn "There are unsorted files..." if DEBUG;
131 4 100       15 if ($unsorted_size < $threshold_small) {
    100          
132             # unsorted files are small and will be sorted in memory
133 2         3 warn "Unsorted files small => memory sort" if DEBUG;
134 2         2 my ($tfh, $tmpfile);
135 2 100       5 if (@sorted) { # if there are sorted files (must be largish), use a tempfile
136 1         5 ($tfh, $tmpfile) = File::Temp::tempfile(
137             "ctagsSortXXXXXX", UNLINK => 0, DIR => $tmpdir
138             );
139 1         365 push @tmpfiles, $tmpfile;
140             }
141             else { # unsorted only => use real output file
142 1 50       50 open $tfh, '>', $outfile
143             or die "Could not open output file '$outfile' for writing: $!";
144             }
145 2         8 $self->_memory_sort($tfh, @unsorted);
146 2         71 close $tfh;
147 2 100       7 if (not @sorted) { # only unsorted data => done!
148 1         5 return 1;
149             }
150 1         3 push @sorted, $tmpfile;
151 1         18 $sorted_size += -s $tmpfile;
152             }
153             elsif ($sorted_size < $threshold_small) {
154             # handle everything with Sort::External
155             # don't bother with merge-sorting the small sorted files
156 1         2 warn "Sorted files small or not existant => external sort for all" if DEBUG;
157 1 50       46 open my $ofh, '>', $outfile
158             or die "Could not open output file '$outfile' for writing: $!";
159 1         75 return $self->_external_sort($ofh, @unsorted, @sorted);
160             }
161             else {
162             # both are large. First do an external sort on the unsorted files,
163             # then do a merge sort
164 1         3 warn "potentially large files => external sort for unsorted files" if DEBUG;
165 1         6 my ($tfh, $tmpfile) = File::Temp::tempfile(
166             "ctagsSortXXXXXX", UNLINK => 0, DIR => $tmpdir
167             );
168 1         383 push @tmpfiles, $tmpfile;
169 1         5 $self->_external_sort($tfh, @unsorted);
170 1         300 close $tfh;
171 1         3 push @sorted, $tmpfile;
172 1         20 $sorted_size += -s $tmpfile;
173             }
174             } # end if there is unsorted data
175              
176             # at this point, there should be only sorted files
177             # left => merge sort
178 4         7 warn "running merge sort" if DEBUG;
179 4 50       343 open my $ofh, '>', $outfile
180             or die "Could not open output file '$outfile' for writing: $!";
181              
182 4         14 return $self->_merge_sort($ofh, @sorted);
183             }
184              
185              
186             sub _merge_sort {
187 4     4   6 warn "running _merge_sort" if DEBUG;
188 4         7 my $self = shift;
189 4         4 my $ofh = shift;
190 4         11 my @infiles = @_;
191              
192 4         20 print $ofh "!_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted/\n";
193              
194 4         16 local $/ = "\n";
195              
196             # get the first lines and create a list of simple structs for sorting
197 8 50       242 my @files =
198             map {
199 4         8 open my $fh, '<', $_ or die "Can't open input file '$_' for reading: $!";
200 8         138 my $first = <$fh>;
201 8 100       28 $first = <$fh> if $first =~ /^!_TAG_FILE_SORTED\t/; # skip magic line
202 8         31 [$first, $fh]
203             }
204             @infiles;
205              
206             # initial sort of the first lines
207 4         20 @files = sort {$a->[MRG_LINE] cmp $b->[MRG_LINE]} @files;
  4         13  
208            
209             # keep sorting until all sources run out
210 4         9 while (@files) {
211             # first file in the list always has the next "lowest" line
212 24         25 my $next = $files[0];
213 24         36 print $ofh $next->[MRG_LINE];
214              
215             # fetch a new line for this file handle
216 24         30 my $fh = $next->[MRG_FH];
217 24         67 $next->[MRG_LINE] = <$fh>;
218 24 100       43 if (not defined $next->[MRG_LINE]) {
219             # eof, lose the file
220 8         13 splice(@files, 0, 1);
221 8         141 next;
222             }
223              
224             # one pass of bubble sort to propagate the new line to its place
225 16         47 for (my $i = 1; $i < @files; ++$i) {
226 8 50       20 if (($files[$i-1][MRG_LINE] cmp $files[$i][MRG_LINE]) == 1) {
227 0         0 my $tmp = $files[$i-1];
228 0         0 $files[$i-1] = $files[$i];
229 0         0 $files[$i] = $tmp;
230             } else {
231 8         21 last;
232             }
233             }
234             } # end while there are files
235              
236 4         278 return(1);
237             }
238              
239              
240              
241             sub _external_sort {
242 2     2   4 warn "running _external_sort" if DEBUG;
243 2         4 my $self = shift;
244 2         3 my $ofh = shift;
245 2         6 my @infiles = @_;
246              
247 2         11 print $ofh "!_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted/\n";
248              
249 2         921 require Sort::External;
250 2         3308 my $exsort = Sort::External->new(
251             mem_threshold => 1024**2 * 32, # todo: configuration
252             );
253              
254 2         1040 local $/ = "\n";
255 2         5 foreach my $infile (@infiles) {
256 3 50       96 open my $fh, '<', $infile
257             or die "Could not open input file '$infile' for reading: $!";
258 3         51 my $first_line = <$fh>;
259 3 50       24 $exsort->feed($first_line) if $first_line !~ /^!_TAG_FILE_SORTED/;
260 3         10 while (<$fh>) {
261 6         41 $exsort->feed($_);
262             }
263 3         36 close $fh;
264             }
265 2         10 $exsort->finish();
266 2         47 while (defined($_ = $exsort->fetch)) {
267 9         35 print $ofh $_;
268             }
269              
270 2         44 return(1);
271             }
272              
273              
274             sub _memory_sort {
275 7     7   8 warn "running _memory_sort" if DEBUG;
276 7         11 my $self = shift;
277 7         9 my $ofh = shift;
278 7         18 my @infiles = @_;
279              
280 7         80 local $/ = "\n";
281 7         10 my @records;
282 7         13 foreach my $infile (@infiles) {
283 11 50       353 open my $fh, '<', $infile
284             or die "Could not open input file '$infile' for reading: $!";
285 11         144 my $first_line = <$fh>;
286 11 50       79 push @records, $first_line if $first_line !~ /^!_TAG_FILE_SORTED/;
287 11         88 push @records, <$fh>;
288 11         148 close $fh;
289             }
290 7         33 @records = sort @records; # check fast inplace sort
291            
292 7         81 print $ofh "!_TAG_FILE_SORTED 1 /0=unsorted, 1=sorted/\n";
293 7         15 print $ofh @records;
294              
295 7         320 return(1);
296             }
297              
298             1;
299             __END__