File Coverage

blib/lib/File/Shuffle.pm
Criterion Covered Total %
statement 98 105 93.3
branch 19 34 55.8
condition 7 18 38.8
subroutine 14 14 100.0
pod 1 5 20.0
total 139 176 78.9


line stmt bran cond sub pod time code
1             package File::Shuffle;
2              
3 1     1   35499 use strict;
  1         2  
  1         38  
4 1     1   6 use warnings;
  1         3  
  1         34  
5 1     1   6137 use File::Temp qw(tempdir);
  1         35136  
  1         58  
6 1     1   910 use Sort::External;
  1         3154  
  1         40  
7 1     1   947 use Data::Dump qw(dump);
  1         6811  
  1         90  
8              
9             BEGIN
10             {
11 1     1   12 use Exporter ();
  1         2  
  1         21  
12 1     1   5 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  1         1  
  1         113  
13 1     1   2 $VERSION = '0.10';
14 1         109 @ISA = qw(Exporter);
15 1         3 @EXPORT = qw();
16 1         3 @EXPORT_OK = qw(fileShuffle);
17 1         603 %EXPORT_TAGS = ();
18             }
19              
20             #01234567890123456789012345678901234567891234
21             #Randomly shuffle the lines in a file.
22              
23             =head1 NAME
24              
25             C - Randomly shuffle the lines in a file.
26              
27             =head1 SYNOPSIS
28              
29             use File::Temp qw(tempfile);
30             use File::Shuffle qw(fileShuffle);
31             use Data::Dump qw(dump);
32             my ($handle, $inputFile) = tempfile();
33             print $handle join("\n", 0..9, '');
34             close $handle;
35             fileShuffle (inputFile => $inputFile);
36             open ($handle, '<', $inputFile);
37             my @lines = <$handle>;
38             close $handle;
39             print @lines;
40              
41             =head1 DESCRIPTION
42              
43             C provides the routine C to randomly shuffle the lines
44             in a file.
45              
46             =head1 SUBROUTINES
47              
48             =head2 C
49              
50             The subroutine C randomly shuffles the lines in a file with
51             the following parameters:
52              
53             =over
54              
55             =item C
56              
57             inputFile => '...'
58              
59             C holds the path to the file whose lines are to be shuffled; if it does
60             not exist or if it is not a file an exception is thrown.
61              
62             =item C
63              
64             outputFile => '...'
65              
66             C is the file the shuffled lines are to be written to, it may equal
67             C; the default is C.
68              
69             =item C
70              
71             tempDirectory => File::Temp::tempdir()
72              
73             C is a temporary directory that intermediate files are written to if the C
74             is too large to shuffle using only internal memory; the default
75             is set using L.
76              
77             =item C
78              
79             encoding => ''
80              
81             C is the encoding to used when openning the input and output files; the default is the
82             the system default of the Perl C function.
83              
84             =item C
85              
86             fileSizeBound => 1000000
87              
88             If the input file contains less than C bytes, the file will be shuffled entirely using
89             internal memory, otherwise L is used to shuffle the lines in the file.
90              
91             =back
92              
93             =cut
94              
95             sub fileShuffle
96             {
97 2     2 1 5183 my (%Parameters) = @_;
98              
99             # make sure the input file was defined.
100 2 50 33     21 unless (exists($Parameters{inputFile}) && defined($Parameters{inputFile}))
101             {
102 0         0 die("error: inputFile parameter is undefined.\n");
103             }
104 2         4 my $InputFile = $Parameters{inputFile};
105              
106             # make sure the input files was defined.
107 2 50       7 unless (defined $InputFile)
108             {
109 0         0 die("error: input file undefined.\n");
110             }
111              
112             # make sure the input file exists.
113 2 50       39 unless (-e $InputFile)
114             {
115 0         0 die("error: input file '$InputFile' does not exist.\n");
116             }
117              
118             # make sure the input file is a file.
119 2 50       32 unless (-f $InputFile)
120             {
121 0         0 die("error: input file '$InputFile' is not a file.\n");
122             }
123              
124             # set the default encoding to utf8.
125 2         5 my $Encoding = '';
126 2 50 33     9 $Encoding = $Parameters{encoding} if (exists($Parameters{encoding}) && defined($Parameters{encoding}));
127              
128             # ensure the encoding is prefixed with a colon.
129 2 50 33     9 $Encoding = ':' . $Encoding if (length($Encoding) && (substr($Encoding, 0, 1) ne ':'));
130              
131             # set the default file size bound.
132 2         5 my $FileSizeBound = 1000000;
133 2 100 66     14 $FileSizeBound = int abs $Parameters{fileSizeBound} if (exists($Parameters{fileSizeBound}) && defined($Parameters{fileSizeBound}));
134              
135             # set the temp directory if defined.
136 2         3 my $TempDirectory;
137 2 50 33     7 $TempDirectory = $Parameters{tempDirectory} if (exists($Parameters{tempDirectory}) && defined($Parameters{tempDirectory}));
138              
139             # set the temp directory if defined.
140 2         3 my $OutputFile = $InputFile;
141 2 50 33     15 $OutputFile = $Parameters{outputFile} if (exists($Parameters{outputFile}) && defined($Parameters{outputFile}));
142              
143             # open the input file for reading only.
144 2         2 my $inputFileHandle;
145 2 50       76 unless (open($inputFileHandle, "<$Encoding", $InputFile))
146             {
147 0         0 die("could not open file '$InputFile' for reading: $!\n");
148             }
149              
150             # get the size of the file.
151 2         30 my $inputFileBytes = -s $InputFile;
152              
153             # small files are shuffled like an internal array.
154 2 100       74 if ($inputFileBytes <= $FileSizeBound)
155             {
156              
157             # the file is small enough to read into memory and shuffle.
158 1         4 shuffleInternal($inputFileHandle, $OutputFile, $Encoding);
159             }
160             else
161             {
162             # the file is too large to read in, so shuffle via a random prefix and sort.
163              
164             # create and set the temporary directory if needed.
165 1 50       9 $TempDirectory = tempdir(CLEANUP => 1) unless defined $TempDirectory;
166              
167             # shuffle the file via sorting.
168 1         707 shuffleExternal($inputFileHandle, $OutputFile, $Encoding, $TempDirectory, $inputFileBytes);
169             }
170              
171 2         18 return undef;
172             }
173              
174             sub shuffleInternal
175             {
176 1     1 0 3 my ($InputHandle, $OutputFile, $Encoding) = @_;
177              
178             # read in all the lines of the file.
179 1         857 my @linesInFile = <$InputHandle>;
180              
181             # close the input file.
182 1         43 close $InputHandle;
183              
184             # shuffle the lines.
185 1         2 my $totalLines = @linesInFile;
186 1         5 for (my $i = 0 ; $i < $totalLines ; $i++)
187             {
188              
189             # select a random line to swap $i with.
190 1001         1340 my $j = int rand $totalLines;
191              
192             # save the line at $j.
193 1001         1272 my $lineAtj = $linesInFile[$j];
194              
195             # replace line at $j with $i.
196 1001         1326 $linesInFile[$j] = $linesInFile[$i];
197              
198             # replace line at $i with $j.
199 1001         2543 $linesInFile[$i] = $lineAtj;
200             }
201              
202             # open the output file for writing only.
203 1         3 my $outputFileHandle;
204 1 50       120 unless (open($outputFileHandle, ">$Encoding", $OutputFile))
205             {
206 0         0 die("could not open file '$OutputFile' for writing: $!\n");
207             }
208              
209             # output the shuffled lines.
210 1         7 for (my $i = 0 ; $i < $totalLines ; $i++)
211             {
212 1001         1159 print $outputFileHandle $linesInFile[$i];
213 1001         2388 $linesInFile[$i] = undef;
214             }
215              
216             # close the output file.
217 1         101 close $outputFileHandle;
218              
219 1         79 return undef;
220             }
221              
222             sub shuffleExternal
223             {
224 1     1   1471 use bytes;
  1         11  
  1         6  
225              
226 1     1 0 6 my ($InputHandle, $OutputFile, $Encoding, $TempDirectory, $InputFileBytes) = @_;
227              
228             # set the prefix size.
229 1         33 my $prefixByteSize = getPrefixByteSize($InputFileBytes);
230              
231             # create the sorter.
232 1         15 my $sorter = Sort::External->new(mem_threshold => 64 * 1024 * 1024, working_dir => $TempDirectory);
233              
234             # add each line to the sorter prefixed with a random string of $prefixByteSize bytes.
235 1         749 while (defined(my $line = <$InputHandle>))
236             {
237              
238             # feed the prefix and line to the sorter.
239 1001         2504 $sorter->feed(getRandomString($prefixByteSize) . $line);
240             }
241              
242             # close the input file.
243 1         26 close $InputHandle;
244              
245             # finish the sorting.
246 1         12 $sorter->finish();
247              
248             # open the output file for writing only.
249 1         795 my $outputFileHandle;
250 1 50       200 unless (open($outputFileHandle, ">$Encoding", $OutputFile))
251             {
252 0         0 die("could not open file '$OutputFile' for writing: $!\n");
253             }
254              
255 1         10 while (defined(my $prefixedLine = $sorter->fetch))
256             {
257              
258             # write the original line to the output file.
259 1001         4454 print $outputFileHandle substr($prefixedLine, $prefixByteSize);
260             }
261 1         34 $sorter = undef;
262              
263             # close the output file.
264 1         17 close $outputFileHandle;
265              
266 1         525 return undef;
267             }
268              
269             sub getPrefixByteSize
270             {
271 1     1 0 3 my ($BytesInFile) = @_;
272              
273             # make sure the total bytes is a non-negative integer.
274 1         278 $BytesInFile = int abs $BytesInFile;
275              
276             # if less than two, return zero.
277 1 50       8 return 0 if $BytesInFile < 2;
278              
279             # assume each line in the file is at least two bytes;
280             # compute the number of bits needed to represent the
281             # maximum possible number of lines.
282 1         23 my $maxPrefixBits = log(abs($BytesInFile) + 1) / log(2) - 1;
283 1 50       7 $maxPrefixBits = 1 if $maxPrefixBits < 1;
284 1         3 $maxPrefixBits = int $maxPrefixBits;
285              
286             # compute the number of bytes needed for the prefixes.
287 1         3 my $bytesInPrefix = int(($maxPrefixBits + 7) / 8);
288              
289 1         3 return $bytesInPrefix;
290             }
291              
292             sub getRandomString
293             {
294              
295             # get the number of bytes in the string.
296 1001     1001 0 1031 my $totalBytes = $_[0];
297              
298             # get the number of shorts in the string.
299 1001         1024 my $totalShorts = $totalBytes >> 1;
300              
301             # generate the shorts.
302 1001         916 my @listOfShorts;
303 1001         1756 while ($totalShorts > 0)
304             {
305 1001         2281 push @listOfShorts, pack('S', int rand(1 << 16));
306 1001         2015 --$totalShorts;
307             }
308              
309             # if totalBytes is odd, add one more random byte.
310 1001 50       1798 push @listOfShorts, pack('C', int rand(1 << 8)) if $totalBytes & 1;
311              
312             # return the string.
313 1001         6487 return join('', @listOfShorts);
314             }
315              
316             =head1 INSTALLATION
317              
318             Use L to install the module and all its prerequisites:
319              
320             perl -MCPAN -e shell
321             cpan[1]> install File::Shuffle
322              
323             =head1 BUGS
324              
325             Please email bugs reports or feature requests to C, or through
326             the web interface at L. The author
327             will be notified and you can be automatically notified of progress on the bug fix or feature request.
328              
329             =head1 AUTHOR
330              
331             Jeff Kubina
332              
333             =head1 COPYRIGHT
334              
335             Copyright (c) 2009 Jeff Kubina. All rights reserved.
336             This program is free software; you can redistribute
337             it and/or modify it under the same terms as Perl itself.
338              
339             The full text of the license can be found in the
340             LICENSE file included with this module.
341              
342             =head1 KEYWORDS
343              
344             file, permute, randomize, shuffle
345              
346             =head1 SEE ALSO
347              
348             L
349              
350             =cut
351              
352             1;
353              
354             # The preceding line will help the module return a true value