File Coverage

blib/lib/Text/Parts.pm
Criterion Covered Total %
statement 152 172 88.3
branch 67 96 69.7
condition 16 22 72.7
subroutine 23 29 79.3
pod 7 7 100.0
total 265 326 81.2


line stmt bran cond sub pod time code
1             package Text::Parts;
2              
3 10     10   281646 use warnings;
  10         24  
  10         311  
4 10     10   54 use strict;
  10         17  
  10         280  
5 10     10   52 use Carp ();
  10         25  
  10         155  
6 10     10   50 use File::Spec ();
  10         26  
  10         18462  
7              
8             sub new {
9 52     52 1 286515 my ($class, %args) = @_;
10 52   33     434 $args{eol} ||= $/;
11 52 100       2222 $args{file} = File::Spec->rel2abs($args{file}) if $args{file};
12 52   50     344 $args{parser_method} ||= 'getline';
13 52         224 bless \%args, $class;
14             }
15              
16             sub eol {
17 1     1 1 5 my $self = shift;
18 1 50       12 $self->{eol} = shift if @_;
19 1         2 $self->{eol};
20             }
21              
22             sub file {
23 81     81 1 122 my $self = shift;
24 81 50       257 $self->{file} = File::Spec->rel2abs(shift) if @_;
25 81         1199 $self->{file};
26             }
27              
28             sub parser {
29 0     0 1 0 my $self = shift;
30 0 0       0 $self->{parser} = shift if @_;
31 0         0 $self->{parser};
32             }
33              
34             sub parser_method {
35 0     0 1 0 my $self = shift;
36 0 0       0 $self->{parser_method} = shift if @_;
37 0         0 $self->{parser_method};
38             }
39              
40             sub _size {
41 65     65   106 my ($self) = @_;
42 65         3558 return -s $self->{file};
43             }
44              
45             sub split {
46 61     61 1 353 my ($self, %opt) = @_;
47 61 50 66     211 Carp::croak("num or size is required.") if not $opt{num} and not $opt{size};
48              
49 61 100       213 my $num = $opt{num} ? $opt{num} : int($self->_size / $opt{size});
50 61 100       167 my $max_num = $opt{max_num} ? $opt{max_num} : $num;
51              
52 61 50       162 Carp::croak('num must be grater than 1.') if $num <= 1;
53              
54 61         179 my $file = $self->file;
55 61         174 my $file_size = $self->_size;
56 61         182 my $chunk_size = int $file_size / $num;
57 61         92 my @parts;
58 61 50       3082 open my $fh, '<', $file or Carp::croak "$!: $file";
59 61 50       301 binmode($fh) if $^O =~m{MSWin};
60 61         278 local $/ = $self->{eol};
61 61         130 my $eol_len = length($/);
62 61         86 my $start = 0;
63 61         340 seek $fh, 0, 0;
64 61 50       170 my $getline_method = $self->{parser} ? '_getline_parser' : '_getline';
65 61 100       169 $getline_method .= '_restrict' if $self->{check_line_start};
66 61         87 my $cnt = 1;
67 61         219 while ($num-- > 0) {
68 10302 100       18777 last if $cnt++ > $max_num;
69 10293 100       18665 $chunk_size = $file_size - $start if $start + $chunk_size > $file_size;
70 10293 100       21104 last unless $chunk_size;
71              
72 10280         78425 seek $fh, $chunk_size - $eol_len, 1;
73 10280         23243 $self->$getline_method($fh);
74 10280         19823 my $end = tell($fh);
75 10280 100       53909 my %args = (%$self, (exists $opt{no_open} ? (no_open => $opt{no_open}) : ()));
76 10280         36916 push @parts, Text::Parts::Part->new(%args, start => $start, end => $end - $eol_len);
77 10280         14837 $start = $end;
78 10280 100 100     44154 if (($num > 1) and $chunk_size > $eol_len + 1) {
79 9974         23696 $chunk_size = int(($file_size - $end) / $num);
80 9974 50       40450 $chunk_size = $eol_len + 1 if $chunk_size < $eol_len + 1;
81             }
82             }
83 61         803 close $fh;
84 61         2969 return @parts;
85             }
86              
87             sub write_files {
88 20     20 1 18252 my ($self, $filename, %opt) = @_;
89 20         109 local $/ = $self->{eol};
90              
91 20 50       137 $filename or Carp::croak("file is needed as first argument.");
92 20 100       110 my $code = ref $opt{code} eq 'CODE' ? delete $opt{code} : undef;
93 20         33 my @filename;
94 20 100       70 my $n = defined $opt{start_number} ? delete $opt{start_number} : 1;
95              
96 20 100       143 my @parts = $self->split(%opt, no_open => 1, ($opt{last_number} ? (max_num => $opt{last_number} - $n + 1) : ()));
97              
98 20 50       297 open my $fh, '<', $self->file or Carp::croak "cannot open file($!): " . $self->file;
99 20 50       115 binmode($fh) if $^O =~m{MSWin};
100 20         109 seek $fh, 0, 0;
101 20         44 my $eol_len = length($/);
102 20         45 foreach my $part (@parts) {
103 6107         6476136 my $buf;
104 6107         45708 read $fh, $buf, $part->{end} - $part->{start} + $eol_len;
105 6107         44714 $buf =~s{$/\z}{}s;
106 6107         26314 push @filename, sprintf $filename, $n++;
107 6107 50       486846 open my $fh_w, '>', $filename[-1] or Carp::croak("cannot open file($!): " . $filename[-1]);
108 6107 50       24690 binmode($fh_w) if $^O =~m{MSWin};
109 6107         52184 seek $fh_w, 0, 0;
110 6107         30949 print $fh_w $buf;
111 6107         262056 close $fh_w;
112 6107 100       33631 $code and $code->($filename[-1]);
113             }
114 20         16079 return @filename;
115             }
116              
117             sub _getline {
118 10280     10280   14020 my ($self, $fh) = @_;
119 10280         84384 <$fh>;
120             }
121              
122             sub _getline_parser {
123 0     0   0 my ($self, $fh) = @_;
124 0         0 my $method = $self->{parser_method};
125 0         0 $self->{parser}->$method($fh);
126             }
127              
128             sub _getline_restrict {
129 30     30   47 my ($self, $fh) = @_;
130 30         53 $self->_move_line_start($fh);
131 30         56 $self->_getline($fh);
132             }
133              
134             sub _getline_parser_restrict {
135 0     0   0 my ($self, $fh) = @_;
136 0         0 $self->_move_line_start($fh);
137 0         0 $self->_getline_parser($fh);
138             }
139              
140             sub _move_line_start {
141 34     34   4381 my ($self, $fh) = @_;
142 34         45 my $current = tell $fh;
143 34         253 <$fh>;
144 34         57 my $end = tell $fh;
145 34 50       217 my $size = $current - 1024 < 0 ? int($current / 2) : 1024;
146 34         57 my $eol = $self->{eol};
147 34         50 my $eol_len = length $self->{eol};
148 34         36 my $check = 0;
149 34   66     152 while ($end - $current + $size > 0 and $current - $size > 0) {
150 40         234 seek $fh, $current - $size, 0;
151 40         204 read $fh, my $buffer, $end - $current + $size;
152 40         181 my @buffer = split /$eol/, $buffer;
153 40 100       82 if (@buffer > 1) {
154 30         35 $check = 1;
155 30         44 $current = $end - (length($buffer[-1]) + $eol_len);
156 30         62 last;
157             } else {
158 10         58 $size += $size;
159             }
160             }
161 34 100       241 seek $fh, ($check ? $current : 0), 0;
162             }
163              
164             package
165             Text::Parts::Part;
166              
167 10     10   24444 use overload '<>' => \&getline;
  10         11351  
  10         78  
168             # sub {
169             # my $self = shift;
170             # if (wantarray) {
171             # my @lines;
172             # until ($self->eof) {
173             # push @lines, $self->getline;
174             # }
175             # return @lines;
176             # } else {
177             # return $self->getline;
178             # }
179             # };
180              
181             sub new {
182 10280     10280   37811 my ($class, %args) = @_;
183 10280         11590 my $fh;
184 10280         61852 my $self = bless {%args}, $class;
185 10280 100       25172 if (not $args{no_open}) {
186 127         439 $self->open_and_seek;
187             }
188 10280         28782 $self;
189             }
190              
191             sub eol {
192 0     0   0 my $self = shift;
193 0         0 $self->{eol};
194             }
195              
196             sub open_and_seek {
197 6171     6171   13607 my ($self) = @_;
198 6171 50       356257 open my $fh, '<', $self->{file} or Carp::croak("cannot read" . $self->{file} . ": $!");
199 6171         43519 seek $fh, $self->{start}, 0;
200 6171         19035 $self->{fh} = $fh;
201             }
202              
203             sub is_opened {
204 10090     10090   13712 my ($self) = @_;
205 10090 100       55558 return $self->{fh} ? 1 : 0;
206             }
207              
208             sub close {
209 6044     6044   9767 my ($self) = @_;
210 6044         95965 close $self->{fh};
211 6044         11789 undef $self->{fh};
212 6044         226311 $self->{_opend} = 0;
213             }
214              
215             sub all {
216 6104     6104   870926 my ($self, $buf) = @_;
217 6104         9014 my $buffer = '';
218 6104   100     26128 my $_buf = $buf || \$buffer;
219 6104 100 100     39161 if ($self->{no_open} and not $self->is_opened) {
220 1998         5298 $self->open_and_seek;
221 1998 50       11576 seek $self->fh, $self->{start}, 0 if $self->eof;
222 1998         4901 read $self->fh, $$_buf, $self->{end} - $self->{start};
223 1998         5052 $self->close;
224             } else {
225 4106 100       9502 seek $self->fh, $self->{start}, 0 if $self->eof;
226 4106         8986 read $self->fh, $$_buf, $self->{end} - $self->{start};
227             }
228 6104 100       46156 return $buf ? () : $buffer;
229             }
230              
231             sub write_file {
232 4061     4061   2448566 my ($self, $name) = @_;
233 4061 50       11295 $name or Carp::croak("file is needed.");
234 4061 100 66     19501 if ($self->{no_open} and not $self->is_opened) {
235 4046         8777 $self->open_and_seek;
236 4046 50       349064 open my $fh, '>', $name or Carp::croak("cannot write $name: $!");
237 4046 50       17668 binmode($fh) if $^O =~m{MSWin};
238 4046         12099 print $fh $self->all;
239 4046         12239 $self->close;
240             } else {
241 15 50       1736 open my $fh, '>', $name or Carp::croak("cannot write $name: $!");
242 15 50       71 binmode($fh) if $^O =~m{MSWin};
243 15         44 print $fh $self->all;
244             }
245             }
246              
247             sub getline {
248 214     214   989 my ($self) = @_;
249 214 100       396 return () if $self->eof;
250              
251 132         198 my $fh = $self->{fh};
252 132         1279 return <$fh>;
253             }
254              
255             sub getline_parser {
256 0     0   0 my ($self) = @_;
257 0 0       0 return () if $self->eof;
258              
259 0 0       0 if ($self->{parser}) {
260 0         0 my $method = $self->{parser_method};
261 0         0 $self->{parser}->$method($self->{fh});
262             } else {
263 0         0 Carp::croak("no parser object is given.");
264             }
265             }
266              
267 6119     6119   95857 sub fh { $_[0]->{fh} }
268              
269             sub eof {
270 6542     6542   51711 my ($self) = @_;
271 6542 100       34744 $self->{end} <= tell($self->{fh}) ? 1 : 0;
272             }
273              
274             our $VERSION = '0.16';
275              
276             =head1 NAME
277              
278             Text::Parts - split text file to some parts(from one line start to another/same line end)
279              
280             =head1 SYNOPSIS
281              
282             If you want to split a text file to some number of parts:
283              
284             use Text::Parts;
285            
286             my $splitter = Text::Parts->new(file => $file);
287             my (@parts) = $splitter->split(num => 4);
288              
289             foreach my $part (@parts) {
290             while(my $l = $part->getline) { # or <$part>
291             # ...
292             }
293             }
294              
295             If you want to split a text file by about specified size:
296              
297             use Text::Parts;
298            
299             my $splitter = Text::Parts->new(file => $file);
300             my (@parts) = $splitter->split(size => 10); # size of part will be more than 10.
301             # same as the previous example
302              
303             If you want to split CSV file:
304              
305             use Text::Parts;
306             use Text::CSV_XS; # don't work with Text::CSV_PP if you want to use {binary => 1} option
307             # I don't recommend to use it for CSV which has multiline lines in columns.
308            
309             my $csv = Text::CSV_XS->new();
310             my $splitter = Text::Parts->new(file => $file, parser => $csv);
311             my (@parts) = $splitter->split(num => 4);
312            
313             foreach my $part (@parts) {
314             while(my $col = $part->getline_parser) { # getline_parser returns parsed result
315             print join "\t", @$col;
316             # ...
317             }
318             }
319              
320             Write splitted parts to files:
321              
322             $splitter->write_files('file%d.csv', num => 4);
323            
324             my $i = 0;
325             foreach my $part ($splitter->slit(num => 4)) {
326             $part->write_file("file" . $i++ . '.csv');
327             }
328              
329             with Parallel::ForkManager:
330              
331             my $splitter = Text::Parts->new(file => $file);
332             my (@parts) = $splitter->split(num => 4);
333             my $pm = new Parallel::ForkManager(4);
334            
335             foreach my $part (@parts) {
336             $pm->start and next; # do the fork
337            
338             while (my $l = $part->getline) {
339             # ...
340             }
341             }
342            
343             $pm->wait_all_children;
344              
345             NOTE THAT: If the file is on the same disk, fork is no use.
346             Maybe, using fork makes sense when the file is on RAID (I haven't try it).
347              
348             =head1 DESCRIPTION
349              
350             This module splits file by specified number of part.
351             The range of each part is from one line start to another/same line end.
352             For example, file content is the following:
353              
354             1111
355             22222222222222222222
356             3333
357             4444
358              
359             If C<< $splitter->split(num => 3) >>, split like the following:
360              
361             1st part:
362             1111
363             22222222222222222222
364              
365             2nd part:
366             3333
367              
368             3rd part:
369             4444
370              
371             At first, C method tries to split by bytes of file size / 3,
372             Secondly, tries to split by bytes of rest file size / the number of rest part.
373             So that:
374              
375             1st part : 36 bytes / 3 = 12 byte + bytes to line end(if needed)
376             2nd part : (36 - 26 bytes) / 2 = 5 byte + bytes to line end(if needed)
377             last part: rest part of file
378              
379             =head1 METHODS
380              
381             =head2 new
382              
383             $s = Text::Parts->new(file => $filename);
384             $s = Text::Parts->new(file => $filename, parser => Text::CSV_XS->new({binary => 1}));
385              
386             Constructor. It can take following options:
387              
388             =head3 num
389              
390             number how many you want to split.
391              
392             =head3 size
393              
394             file size how much you want to split.
395             This value is used for calculating C.
396             If file size is 100 and this value is 25, C is 4.
397              
398             =head3 file
399              
400             target file which you want to split.
401              
402             =head3 parser
403              
404             Pass parser object(like Text::CSV_XS->new()).
405             The object must have method which takes filehandle and whose name is C as default.
406             If the object's method is different name, pass the name to C option.
407              
408             =head3 parser_method
409              
410             name of parser's method. default is C.
411              
412             =head3 check_line_start
413              
414             If this options is true, check line start and move to this position before C<< <$fh> >> or parser's C/C.
415             It may be useful when parser's C/C method doesn't work correctly when parsing wrong format.
416              
417             default value is 0.
418              
419             =head3 no_open
420              
421             If this option is true, don't open file on creating Text::Parts::Part object.
422             You need to call C method from the object when you read the file
423             (But, C and C checks this option, so you don't need to call C).
424              
425             This option is required when you pass too much number, which is more than OS's open file limit, to split method.
426              
427             =head2 file
428              
429             my $file = $s->file;
430             $s->file($filename);
431              
432             get/set target file.
433              
434             =head2 parser
435              
436             my $parser_object = $s->parser;
437             $s->parser($parser_object);
438              
439             get/set parser object.
440              
441             =head2 parser_method
442              
443             my $method = $s->parser_method;
444             $s->parser_method($method);
445              
446             get/set parser method.
447              
448              
449             =head2 split
450              
451             my @parts = $s->split(num => $num);
452             my @parts = $s->split(size => $size);
453             my @parts = $s->split(num => $num, max_num => 3);
454              
455             Try to split target file to C<$num> of parts. The returned value is array of Text::Parts::Part object.
456             If you pass C<< size => bytes >>, calculate C<$num> from file size / C<$size>.
457              
458             This method doesn't actually split file, only calculate the start and end position of parts.
459              
460             This returns array of Text::Parts::Part object.
461             See L.
462              
463             If you set max_num, only split number of max_num.
464              
465             my @parts = $s->split(num => 5, max_num => 2);
466              
467             This tries to split 5 parts, but only 2 parts are returned.
468             This is useful to try to test a few parts of too many parts.
469              
470             =head2 eol
471              
472             my $eol = $s->eol;
473             $s->eol($eol);
474              
475             get/set end of line string. default value is $/.
476              
477             =head2 write_files
478              
479             @filenames = $s->write_files('path/to/name%d.txt', num => 4);
480              
481             C is the format of filename. %d is replaced by number.
482             For example:
483              
484             path/to/name1.txt
485             path/to/name2.txt
486             path/to/name3.txt
487             path/to/name4.txt
488              
489             The rest of arguments are as same as C except the following 2 options.
490              
491             =over 4
492              
493             =item code
494              
495             C option takes code reference which would be done immediately after file had been written.
496             If you pass C option as the following:
497              
498             @filenames = $s->write_files('path/to/name%d.txt', num => 4, code => \&do_after_split)
499              
500             splitted file name is given to &do_after_split:
501              
502             sub do_after_split {
503             my $filename = shift; # 'path/to/name1.txt'
504             # ...
505             unlink $filename;
506             }
507              
508             =item start_number
509              
510             @filenames = $s->write_files('path/to/name%d.txt', num => 4, start_number => 0);
511             # $filenames[0] is 'path/to/name0.txt'
512              
513             This is used for filename.
514              
515             if start_number is 0.
516              
517             path/to/name0.txt
518             path/to/name1.txt
519             ...
520              
521             if start_number is 1 (default).
522              
523             path/to/name1.txt
524             path/to/name2.txt
525             ...
526              
527             if start_number is 2
528              
529             path/to/name2.txt
530             path/to/name3.txt
531             ...
532              
533             =item last_number
534              
535             If last_number is specified, stop to split file when number reaches last_number.
536             Note that this option override max_num.
537              
538             @filenames = $s->write_files('path/to/name%d.txt', num => 4, start_number => 0, last_number => 1);
539             # $filenames[0] is 'path/to/name0.txt'
540             # $filenames[1] is 'path/to/name1.txt'
541             # $filenames[2] doesn't exist
542              
543             =back
544              
545             =head1 Text::Parts::Part METHODS
546              
547             Text::Parts::Part objects are returned by C method.
548              
549             =head2 getline
550              
551             my $line = $part->getline;
552              
553             return 1 line.
554             You can use C<< <$part> >>, also.
555              
556             my $line = <$part>
557              
558             =head2 getline_parser
559              
560             my $parsed = $part->getline_parser;
561              
562             returns parsed result.
563              
564             =head2 all
565              
566             my $all = $part->all;
567             $part->all(\$all);
568              
569             return all of the part.
570             just C from start to end position.
571              
572             If scalar reference is passed as argument, the content of the part is into the passed scalar.
573              
574             This method checks no_open option.
575             If no_open is true, open file before writing file and close file after writing.
576              
577             =head2 eof
578              
579             $part->eof;
580              
581             If current position is the end of parts, return true.
582              
583             =head2 write_file
584              
585             $part->write_file($filename);
586              
587             Write the contents of the part to $filename.
588              
589             This method checks no_open option.
590             If no_open is true, open file before writing file and close file after writing.
591              
592             =head2 open_and_seek
593              
594             $part->open_and_seek;
595              
596             If the object is created with no_open true, you need to call this method before reading file.
597              
598             =head2 close
599              
600             $part->close;
601              
602             close file handle.
603              
604             =head2 is_opened
605              
606             $part->is_opened;
607              
608             If file handle is opened, return true.
609              
610             =head1 AUTHOR
611              
612             Ktat, C<< >>
613              
614             =head1 BUGS
615              
616             Please report any bugs or feature requests to C, or through
617             the web interface at L. I will be notified, and then you'll
618             automatically be notified of progress on your bug as I make changes.
619              
620             =head1 SUPPORT
621              
622             You can find documentation for this module with the perldoc command.
623              
624             perldoc Text::Parts
625              
626             You can also look for information at:
627              
628             =over 4
629              
630             =item * RT: CPAN's request tracker
631              
632             L
633              
634             =item * AnnoCPAN: Annotated CPAN documentation
635              
636             L
637              
638             =item * CPAN Ratings
639              
640             L
641              
642             =item * Search CPAN
643              
644             L
645              
646             =back
647              
648              
649             =head1 ACKNOWLEDGEMENTS
650              
651              
652             =head1 LICENSE AND COPYRIGHT
653              
654             Copyright 2011 Ktat.
655              
656             This program is free software; you can redistribute it and/or modify it
657             under the terms of either: the GNU General Public License as published
658             by the Free Software Foundation; or the Artistic License.
659              
660             See http://dev.perl.org/licenses/ for more information.
661              
662              
663             =cut
664              
665             1; # End of Text::Parts