File Coverage

blib/lib/Text/Tokenize/Indented.pm
Criterion Covered Total %
statement 21 66 31.8
branch 0 22 0.0
condition n/a
subroutine 7 11 63.6
pod 3 3 100.0
total 31 102 30.3


line stmt bran cond sub pod time code
1             package Text::Tokenize::Indented;
2            
3 1     1   33002 use 5.006;
  1         4  
  1         65  
4 1     1   7 use strict;
  1         2  
  1         43  
5 1     1   6 use warnings FATAL => 'all';
  1         13  
  1         51  
6 1     1   1694 use Iterator::Simple;
  1         8012  
  1         64  
7 1     1   1017 use Iterator::Simple::Lookahead;
  1         6291  
  1         34  
8 1     1   1254 use Data::Dumper;
  1         9491  
  1         81  
9 1     1   9 use Carp;
  1         3  
  1         740  
10            
11             =head1 NAME
12            
13             Text::Tokenize::Indented - tokenize indented lines in text
14            
15             =head1 VERSION
16            
17             Version 0.01
18            
19             =cut
20            
21             our $VERSION = '0.01';
22            
23            
24             =head1 SYNOPSIS
25            
26             As part of the Decl language project (the windmill I've been tilting at since 2010), I end up working with
27             text a lot that is structured by indentation. Finally, I think, this module provides a solid underpinning
28             to working with that kind of text, in that it provides as convenient a tokenizer as possible.
29            
30             It's based on L<Iterator::Simple::Lookahead>, meaning that it (1) does a lazy tokenization of a list passed
31             into it, and (2) provides a peek and unget so that you can easily chain tokenizers; if a given piece that has
32             already been identified turns out to break into multiple tokens, you simply tokenize it and push the subpieces
33             back into the stream for later retrieval as individual tokens.
34            
35             This allows very nice compartmentalization of the details of parsing, leaving you a lot less to debug when
36             parsing more difficult items.
37            
38             You use it like this:
39            
40             use Text::Tokenize::Indented;
41            
42             my $tok = Text::Tokenize::Indented ({tabs => 4}, <<EOF, {tabs => 8}, $trailing_iterator)
43             text
44             text
45             text
46             text
47            
48             text
49             EOF
50            
51             (For instance.) This then returns the following token stream:
52            
53             [0, 'text']
54             [0, 'text']
55             [3, 'text']
56             [3, 'text']
57             [-1]
58             [0, 'text']
59             (whatever the trailing iterator returns)
60            
61             We might then chain another tokenizer onto this one which would tokenize the individual lines into more meaningful things.
62             Note that blank lines officially have an indentation of -1.
63            
64             =head1 METHODS
65            
66             =head2 new
67            
68             Creates a new tokenizer, with or without input. Any parameters are passed to C<input>.
69             The defaults for parameters are as follows: tabs=4 (tabs are 4 spaces), blank=qr/\s+/, newline=qr/\n/.
70             Any parameter can be changed mid-stream by sending a hashref into the input.
71            
72             Returns the tokenizer object. Its Iterator::Simple::Lookahead iterator, which returns items from the input queue, is what C<input> and C<tokenize> return.
73            
74             =cut
75            
76             sub new {
               # Constructor. Blesses a hash holding the default parameters (tabs, blank,
               # newline) and an empty input queue, loads any arguments through input(),
               # then stores an Iterator::Simple::Lookahead built around the closure below
               # in $self->{iterator}. Returns the object itself, not the iterator.
               # The 'blank' and 'newline' entries are not read anywhere in this listing.
77 0     0 1   my $class = shift;
78 0           my $self = bless {
79             tabs => 4,
80             blank => qr/\s+/,
81             newline => qr/\n/,
82             queue => [],
83             }, $class;
84 0 0         $self->input(@_) if @_;
85             $self->{iterator} = Iterator::Simple::Lookahead->new (
86             sub {
               # Token generator: each call yields the next token, or undef once the
               # queue has been drained.
87 0           NEXT:
88             # End of input if the queue is empty.
89 0 0   0     return undef unless @{$self->{queue}};
90 0           my $next = $self->{queue}->[0];
91            
92             # Take care of parameter updates if the next thing is a hashref, start over.
               # Every key of the hashref is copied onto the object, so this can overwrite
               # any slot of $self, not only the documented parameters.
93 0 0         if (ref $next eq 'HASH') {
94 0           foreach my $key (keys(%$next)) {
95 0           $self->{$key} = $next->{$key};
96             }
97 0           shift @{$self->{queue}};
  0            
98 0           goto NEXT;
99             }
100            
101             # Get the next value in the queue.
                # The head of the queue is not a hashref here, so it is called as an iterator.
102             NEXTVAL:
103 0           my $nextval = $next->();
104            
105             # If the currently next iterator is finished, go to the next thing on the queue.
106 0 0         if (not defined $nextval) {
107 0           shift @{$self->{queue}};
  0            
108 0           goto NEXT;
109             }
110            
111             # If the next value itself is a hashref, we'll still get parameters out of it.
                # NOTE(review): the loop walks the keys of $nextval but reads the values from
                # $next, which is the iterator that was just called, not the hashref it
                # produced. This looks like it should read $nextval->{$key} -- confirm.
112 0 0         if (ref ($nextval) eq 'HASH') {
113 0           foreach my $key (keys(%$nextval)) {
114 0           $self->{$key} = $next->{$key};
115             }
116 0           goto NEXTVAL;
117             }
118            
119             # Return the value if it's an arrayref, as we have somehow presumably already
120             # tokenized it in an upstream tokenizer of some sort.
                # The test is on ref() alone, so any remaining reference type is passed through.
121 0 0         return $nextval if ref($nextval);
122            
123             # Oh! A string!
                # Strings with leading whitespace become [indent, text]; whitespace-only
                # strings become [-1]; everything else becomes [0, string].
                # NOTE(review): the pattern needs at least one whitespace character, so a
                # completely empty string falls through to [0, ''] below rather than [-1] --
                # confirm that is intended.
124 0 0         if ($nextval =~ /^(\s+)(.*)/) {
125 0           my ($white, $meat) = ($1, $2);
                # NOTE(review): truthiness test, so an indented line whose content is the
                # single character "0" is also reported as blank.
126 0 0         return [-1] unless $meat;
                # Each tab in the indent is expanded to $self->{tabs} spaces before measuring.
127 0           $white =~ s/\t/' ' x $self->{tabs}/ge;
  0            
128 0           return [length($white), $meat];
129             }
130 0           return [0, $nextval];
131 0           });
132             #print STDERR Dumper($self);
133 0           $self;
134             }
135            
136             =head2 tokenize (@input)
137            
138             Creates a tokenizer with input, but instead of returning the object, returns only
139             the iterator. No new input can be added to this tokenizer, but normally you don't
140             care.
141            
142             =cut
143            
144             sub tokenize {
                # Builds a tokenizer from the arguments and returns only its iterator
                # ($t->{iterator}); the tokenizer object itself is discarded.
                # new() is called here as a plain function with the full argument list, so
                # new() takes the first element of @_ as its class.
                # NOTE(review): presumably meant to be invoked as a class method, so that
                # the first element is the package name -- confirm against callers.
145 0     0 1   my $t = new(@_);
146 0           $t->{iterator};
147             }
148            
149             =head2 input
150            
151             Input is where text is loaded up into the tokenizer. It takes a list of items, each of which can be
152             either a hashref, which will be used to set values in the tokenizer that apply to coming data,
153             a string, which will be split into lines, or an iterable object, which will be passed through
154             to the tokenizer output.
155            
156             Returns the iterator for the object.
157            
158             =cut
159            
160             sub input {
                # Appends each argument to the tokenizer's queue and returns $self->{iterator}.
                #   - non-reference values are split into lines and queued as an iterator
                #     over those lines;
                #   - hashrefs are queued unchanged (the generator applies them as
                #     parameter updates when it reaches them);
                #   - anything else must pass Iterator::Simple::is_iterable, otherwise this
                #     croaks with "Non-iterable input supplied".
                # When called from new(), $self->{iterator} has not been assigned yet, so the
                # return value is undefined in that case.
161 0     0 1   my $self = shift;
162 0           foreach my $load (@_) {
163 0 0         if (ref $load eq '') { # String input.
    0          
                # The split uses a literal /\n/; the object's 'newline' parameter is not
                # consulted here.
164 0           my @lines = split /\n/, $load;
165 0           push @{$self->{queue}}, Iterator::Simple::iter(\@lines);
  0            
166             } elsif (ref $load eq 'HASH') { # Parameters.
167 0           push @{$self->{queue}}, $load;
  0            
168             } else {
169 0 0         croak "Non-iterable input supplied" unless Iterator::Simple::is_iterable($load);
170 0           push @{$self->{queue}}, Iterator::Simple::iter($load);
  0            
171             }
172             }
173 0           $self->{iterator};
174             }
175            
176             =head1 AUTHOR
177            
178             Michael Roberts, C<< >>
179            
180             =head1 BUGS
181            
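182             Please report any bugs or feature requests to C<bug-text-tokenize-indented at rt.cpan.org>, or through
183             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Text-Tokenize-Indented>. I will be notified, and then you'll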
184             automatically be notified of progress on your bug as I make changes.
185            
186            
187            
188            
189             =head1 SUPPORT
190            
191             You can find documentation for this module with the perldoc command.
192            
193             perldoc Text::Tokenize::Indented
194            
195            
196             You can also look for information at:
197            
198             =over 4
199            
200             =item * RT: CPAN's request tracker (report bugs here)
201            
202             L
203            
204             =item * AnnoCPAN: Annotated CPAN documentation
205            
206             L
207            
208             =item * CPAN Ratings
209            
210             L
211            
212             =item * Search CPAN
213            
214             L
215            
216             =back
217            
218            
219             =head1 ACKNOWLEDGEMENTS
220            
221            
222             =head1 LICENSE AND COPYRIGHT
223            
224             Copyright 2014 Michael Roberts.
225            
226             This program is free software; you can redistribute it and/or modify it
227             under the terms of the Artistic License (2.0). You may obtain a
228             copy of the full license at:
229            
230             L<http://www.perlfoundation.org/artistic_license_2_0>
231            
232             Any use, modification, and distribution of the Standard or Modified
233             Versions is governed by this Artistic License. By using, modifying or
234             distributing the Package, you accept this license. Do not use, modify,
235             or distribute the Package, if you do not accept this license.
236            
237             If your Modified Version has been derived from a Modified Version made
238             by someone other than you, you are nevertheless required to ensure that
239             your Modified Version complies with the requirements of this license.
240            
241             This license does not grant you the right to use any trademark, service
242             mark, tradename, or logo of the Copyright Holder.
243            
244             This license includes the non-exclusive, worldwide, free-of-charge
245             patent license to make, have made, use, offer to sell, sell, import and
246             otherwise transfer the Package with respect to any patent claims
247             licensable by the Copyright Holder that are necessarily infringed by the
248             Package. If you institute patent litigation (including a cross-claim or
249             counterclaim) against any party alleging that the Package constitutes
250             direct or contributory patent infringement, then this Artistic License
251             to you shall terminate on the date that such litigation is filed.
252            
253             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
254             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
255             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
256             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
257             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
258             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
259             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
260             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
261            
262            
263             =cut
264            
265             1; # End of Text::Tokenize::Indented