File Coverage

blib/lib/RTF/Tokenizer.pm
Criterion Covered Total %
statement 152 157 96.8
branch 89 100 89.0
condition 11 14 78.5
subroutine 19 19 100.0
pod 9 9 100.0
total 280 299 93.6


line stmt bran cond sub pod time code
1             #!perl
2             # RTF::Tokenizer - Peter Sergeant
3              
4             =head1 NAME
5              
6             RTF::Tokenizer - Tokenize RTF
7              
8             =head1 VERSION
9              
10             version 1.19
11              
12             =head1 DESCRIPTION
13              
14             Tokenizes RTF
15              
16             =head1 SYNOPSIS
17              
18             use RTF::Tokenizer;
19              
20             # Create a tokenizer object
21             my $tokenizer = RTF::Tokenizer->new();
22              
23             my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}' );
24             my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}', note_escapes => 1 );
25              
26             my $tokenizer = RTF::Tokenizer->new( file => \*STDIN );
27             my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf' );
28             my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf', sloppy => 1 );
29              
30             # Populate it from a file
31             $tokenizer->read_file('filename.txt');
32              
33             # Or a file handle
34             $tokenizer->read_file( \*STDIN );
35              
36             # Or a string
37             $tokenizer->read_string( '{\*\some rtf}' );
38              
39             # Get the first token
40             my ( $token_type, $argument, $parameter ) = $tokenizer->get_token();
41              
42             # Ooops, that was wrong...
43             $tokenizer->put_token( 'control', 'b', 1 );
44              
45             # Let's have the lot...
46             my @tokens = $tokenizer->get_all_tokens();
47              
48             =head1 INTRODUCTION
49              
50             This documentation assumes some basic knowledge of RTF.
51             If you lack that, go read The_RTF_Cookbook:
52              
53             L
54              
55             =cut
56              
57             require 5;
58              
59             package RTF::Tokenizer;
60             $RTF::Tokenizer::VERSION = '1.19';
61 15     15   269318 use vars qw($VERSION);
  15         32  
  15         946  
62              
63 15     15   84 use strict;
  15         23  
  15         546  
64 15     15   87 use warnings;
  15         22  
  15         501  
65 15     15   80 use Carp;
  15         27  
  15         1549  
66 15     15   9209 use IO::File;
  15         153385  
  15         13474  
67              
68             =head1 METHODS
69              
70             =head2 new()
71              
72             Instantiates an RTF::Tokenizer object.
73              
74             B:
75              
76             C - calls the C method with the value provided after instantiation
77              
78             C - calls the C method with the value provided after instantiation
79              
80             C - boolean - whether to give RTF Escapes a token type of C (true) or
81             C (false, default)
82              
83             C - boolean - whether or not to allow some illegal but common RTF sequences found
84             'in the wild'. As of C<1.08>, this currently only allows control words with a numeric
85             argument to have a text field right after with no delimiter, like:
86              
87             \control1Plaintext
88              
89             but this may change in future releases. Defaults false.
90              
91             C - boolean - ... the RTF specification tells you to strip whitespace
92             which comes after control words, and newlines at the beginning and ending of text areas.
93             One result of that is that you can't actually round-trip the output of the tokenization
94             process. Turning this on is probably a bad idea, but someone cared enough to send me a
95             patch for it, so why not. Defaults false, and you should leave it that way.
96              
97             =cut
98              
99             sub new {
100             # Get the real class name in the highly unlikely event we've been
101             # called from an object itself.
102 54     54 1 449930 my $proto = shift;
103 54   66     330 my $class = ref($proto) || $proto;
104              
105             # Read in the named parameters
106 54         189 my %config = @_;
107              
108 54         315 my $self = {
109             _BUFFER => '', # Stores read but unparsed RTF
110             _BINARY_DATA => '', # Temporary data store if we're reading a \bin
111             _FILEHANDLE => '', # Stores the active filehandle
112             _INITIAL_READ => 512
113             , # How many characters to read by default. 512 recommended by RTF spec
114             _UC => 1, # Default number of characters to count for \uc
115             };
116              
117 54         136 bless $self, $class;
118              
119             # Call the data-reading convenience methods if required
120 54 100       285 if ( $config{'file'} ) {
    100          
121 5         37 $self->read_file( $config{'file'} );
122             } elsif ( $config{'string'} ) {
123 38         102 $self->read_string( $config{'string'} );
124             }
125              
126             # Set up final config stuff
127 54         691 $self->{_NOTE_ESCAPES} = $config{'note_escapes'};
128 54         183 $self->{_SLOPPY} = $config{'sloppy'};
129 54         138 $self->{_WHITESPACE} = $config{'preserve_whitespace'};
130              
131 54         210 return $self;
132              
133             }
134              
135             =head2 read_string( STRING )
136              
137             Appends the string to the tokenizer-object's buffer
138             (earlier versions would over-write the buffer -
139             this version does not).
140              
141             =cut
142              
143             sub read_string {
144 48     48 1 1339 my $self = shift;
145 48         252 $self->{_BUFFER} .= shift;
146             }
147              
148             =head2 read_file( \*FILEHANDLE )
149              
150             =head2 read_file( $IO_File_object )
151              
152             =head2 read_file( 'filename' )
153              
154             Appends a chunk of data from the filehandle to the buffer,
155             and remembers the filehandle, so if you ask for a token,
156             and the buffer is empty, it'll try and read the next line
157             from the file (earlier versions would over-write the buffer -
158             this version does not).
159              
160             This chunk is 500 characters, and then whatever is left until
161             the next occurrence of the IRS (a newline character in this case).
162             If for whatever reason, you want to change that number to something
163             else, use C.
164              
165             =cut
166              
167             sub read_file {
168              
169 12     12 1 4736 my $self = shift;
170 12         20 my $file = shift;
171              
172             # Accept a filehandle referenced via a GLOB
173 12 100       44 if ( ref $file eq 'GLOB' ) {
    100          
    100          
    100          
174 2         18 $self->{_FILEHANDLE} = IO::File->new_from_fd( $file, '<' );
175 2 100       330 croak
176             "Couldn't create an IO::File object from the reference you specified"
177             unless $self->{_FILEHANDLE};
178              
179             # Accept IO::File and subclassed objects
180             } elsif (
181             eval {
182 10         149 $file->isa('IO::File');
183             } )
184             {
185 1         21 $self->{_FILEHANDLE} = $file;
186              
187             # This is undocumented, because you shouldn't use it. Don't rely on it.
188             } elsif ( ref $file eq 'IO::Scalar' ) {
189 4         17 $self->{_FILEHANDLE} = $file;
190              
191             # If it's not a reference, assume it's a filename
192             } elsif ( !ref $file ) {
193 4         28 $self->{_FILEHANDLE} = IO::File->new("< $file");
194 4 100       615 croak "Couldn't open '$file' for reading" unless $self->{_FILEHANDLE};
195              
196             # Complain if we get anything else
197             } else {
198 1         112 croak "You passed a reference to read_file of type " . ref($file) .
199             " which isn't an allowed type";
200             }
201              
202             # Check what our line-endings seem to be, then set $self->{_IRS} accordingly.
203             # This also reads in the first few lines as a side effect.
204 9         45 $self->_line_endings;
205             }
206              
207             # Reads a line from an IO:File'ish object
208             sub _get_line {
209 20     20   31 my $self = shift();
210              
211             # Localize the input record separator before changing it so
212             # we don't mess up any other part of the application running
213             # us that relies on it
214 20         86 local $/ = $self->{_IRS};
215              
216             # Read the line itself
217 20         231 my $line = $self->{_FILEHANDLE}->getline();
218 20 50       823 $self->{_BUFFER} .= $line if defined $line;
219             }
220              
221             # Determine what kind of line-endings the file uses
222              
223             sub _line_endings {
224 12     12   2245 my $self = shift();
225              
226 12         18 my $temp_buffer;
227 12         64 $self->{_FILEHANDLE}->read( $temp_buffer, $self->{_INITIAL_READ} );
228              
229             # This catches all allowed cases
230 12 50       354 if ( $temp_buffer =~ m/(\cM\cJ|\cM|\cJ)/ ) {
231 12         45 $self->{_IRS} = $1;
232              
233 12 100       50 $self->{_RS} = "Macintosh" if $self->{_IRS} eq "\cM";
234 12 100       45 $self->{_RS} = "Windows" if $self->{_IRS} eq "\cM\cJ";
235 12 100       39 $self->{_RS} = "UNIX" if $self->{_IRS} eq "\cJ";
236              
237             } else {
238 0         0 $self->{_RS} = "Unknown";
239             }
240              
241             # Add back to main buffer
242 12         41 $self->{_BUFFER} .= $temp_buffer;
243              
244             # Call C<_get_line> again so we're sure we're not only
245             # reading half a line
246 12         39 $self->_get_line;
247              
248             }
249              
250             =head2 get_token()
251              
252             Returns the next token as a three-item list: 'type', 'argument', 'parameter'.
253             Token is one of: C, C, C, C or C.
254              
255             If you turned on C, then you may get a forth item for
256             C tokens.
257              
258             =over
259              
260             =item C
261              
262             'type' is set to 'text'. 'argument' is set to the text itself. 'parameter'
263             is left blank. NOTE: C<\{>, C<\}>, and C<\\> are all returned as control words,
264             rather than rendered as text for you, as are C<\_>, C<\-> and friends.
265              
266             =item C
267              
268             'type' is 'control'. 'argument' is the control word or control symbol.
269             'parameter' is the control word's parameter if it has one - this will
270             be numeric, EXCEPT when 'argument' is a literal ', in which case it
271             will be a two-letter hex string.
272              
273             If you turned on C, you'll get a forth item,
274             which will be the whitespace or a defined empty string.
275              
276             =item C
277              
278             'type' is 'group'. If it's the beginning of an RTF group, then
279             'argument' is 1, else if it's the end, argument is 0. 'parameter'
280             is not set.
281              
282             =item C
283              
284             End of file reached. 'type' is 'eof'. 'argument' is 1. 'parameter' is
285             0.
286              
287             =item C
288              
289             If you specifically turn on this functionality, you'll get an
290             C type, which is identical to C, only, it's
291             only returned for escapes.
292              
293             =back
294              
295             =cut
296              
297             # Define a regular expression that matches characters which are 'text' -
298             # that is, they're not a backspace, a scoping brace, or discardable
299             # whitespace.
300             my $non_text_standard_re = qr/[^\\{}\r\n]/;
301             my $non_text_whitespace_re = qr/[^\\{}]/;
302              
303             sub get_token {
304 5925     5925 1 53920 my $self = shift;
305              
306             # If the last token we returned was \bin, we'll now have a
307             # big chunk of binary data waiting for the user, so send that
308             # back
309 5925 100       8676 if ( $self->{_BINARY_DATA} ) {
310 2         4 my $data = $self->{_BINARY_DATA};
311 2         5 $self->{_BINARY_DATA} = '';
312 2         9 return ( 'text', $data, '' );
313             }
314              
315             # We might have a cached token, and if we do, we'll want to
316             # return that first
317 5923 100       8062 if ( $self->{_PUT_TOKEN_CACHE_FLAG} ) {
318             # Take the value from the cache
319 3         4 my @return_values = @{ pop( @{ $self->{_PUT_TOKEN_CACHE} } ) };
  3         3  
  3         9  
320              
321             # Update the flag
322 3         3 $self->{_PUT_TOKEN_CACHE_FLAG} = @{ $self->{_PUT_TOKEN_CACHE} };
  3         5  
323              
324             # Give the user the token back
325 3         15 return @return_values;
326             }
327              
328 5920 100       7043 my $non_text_re =
329             $self->{_WHITESPACE} ? $non_text_whitespace_re : $non_text_standard_re;
330              
331             # Our main parsing loop
332 5920         4313 while (1) {
333              
334 5955         7277 my $start_character = substr( $self->{_BUFFER}, 0, 1, '' );
335              
336             # Most likely to be text, so we check for that first
337 5955 100       19726 if ( $start_character =~ $non_text_re ) {
    100          
    100          
    100          
    100          
338 15     15   151 no warnings 'uninitialized';
  15         31  
  15         23135  
339              
340             # We want to return text fields that have newlines in as one
341             # token, which requires a bit of work, as we read in one line
342             # at a time from out files...
343 234         255 my $temp_text = '';
344              
345 241         670 READTEXT:
346              
347             # Grab all the next 'text' characters
348             $self->{_BUFFER} =~ s/^([^\\{}]+)//s;
349 241 100       629 $temp_text .= $1 if defined $1;
350              
351             # If the buffer is empty, try reading in some more, and
352             # then go back to READTEXT to keep going. Now, the clever
353             # thing would be to assume that if the buffer *IS* empty
354             # then there MUST be more to read, which is true if we
355             # have well-formed input. We're going to assume that the
356             # input could well be a little broken.
357 241 100 100     628 if ( ( !$self->{_BUFFER} ) && ( $self->{_FILEHANDLE} ) ) {
358 7         67 $self->_get_line;
359 7 50       51 goto READTEXT if $self->{_BUFFER};
360             }
361              
362             # Make sure we're not including newlines in our output,
363             # as RTF spec says they're to be ignored...
364 234 100       386 unless ( $self->{_WHITESPACE} ) {
365 124         386 $temp_text =~ s/(\cM\cJ|\cM|\cJ)//g;
366             }
367              
368             # Give the user a shiny token back
369 234         993 return ( 'text', $start_character . $temp_text, '' );
370              
371             # Second most likely to be a control character
372             } elsif ( $start_character eq "\\" ) {
373 5200         6184 my @args = $self->_grab_control();
374              
375             # If the control word was an escape, and the user
376             # asked to be told separately about those, this
377             # will be set, so return an 'escape'. Otherwise,
378             # return the control word as a 'control'
379 5198 100       7682 if ( $self->{_TEMP_ESCAPE_FLAG} ) {
380 2         1 $self->{_TEMP_ESCAPE_FLAG} = 0;
381 2         12 return ( 'escape', @args );
382             } else {
383 5196         14302 return ( 'control', @args );
384             }
385              
386             # Probably a group then
387             } elsif ( $start_character eq '{' ) {
388 224         548 return ( 'group', 1, '' );
389             } elsif ( $start_character eq '}' ) {
390 225         618 return ( 'group', 0, '' );
391              
392             # No start character? Either we're at the end of our input,
393             # or we need some new input
394             } elsif ( !$start_character ) {
395             # If we were read from a string, we're all done
396 38 100       155 return ( 'eof', 1, 0 ) unless $self->{_FILEHANDLE};
397              
398             # If we were read from a file, try and get some more stuff
399             # in to the buffer, or return the 'eof' character
400 1 50       7 return ( 'eof', 1, 0 ) if $self->{_FILEHANDLE}->eof;
401 1         12 $self->_get_line;
402 1 50       3 return ( 'eof', 1, 0 ) unless $self->{_BUFFER};
403             }
404             }
405             }
406              
407             =head2 get_all_tokens
408              
409             As per C, but keeps calling C until it hits EOF. Returns
410             a list of arrayrefs.
411              
412             =cut
413              
414             sub get_all_tokens {
415 34     34 1 42 my $self = shift;
416 34         37 my @tokens;
417              
418 34         28 while (1) {
419 5822         7723 my $token = [ $self->get_token() ];
420 5822         5934 push( @tokens, $token );
421 5822 100       9079 last if $token->[0] eq 'eof';
422             }
423              
424 34         1212 return @tokens;
425             }
426              
427             =head2 put_token( type, token, argument )
428              
429             Adds an item to the token cache, so that the next time you
430             call get_token, the arguments you passed here will be returned.
431             We don't check any of the values, so use this carefully. This
432             is on a first in last out basis.
433              
434             =cut
435              
436             sub put_token {
437 3     3 1 371 my $self = shift;
438              
439 3         3 push( @{ $self->{_PUT_TOKEN_CACHE} }, [@_] );
  3         7  
440              
441             # No need to set this to the real value of the token cache, as
442             # it'll get set properly when we try and read a cached token.
443 3         4 $self->{_PUT_TOKEN_CACHE_FLAG} = 1;
444             }
445              
446             =head2 sloppy( [bool] )
447              
448             Decides whether we allow some types of broken RTF. See C's docs
449             for a little more explanation about this. Pass it 1 to turn it on, 0 to
450             turn it off. This will always return undef.
451              
452             =cut
453              
454             sub sloppy {
455 2     2 1 1868 my $self = shift;
456 2         5 my $bool = shift;
457              
458 2 100       9 if ($bool) {
459 1         4 $self->{_SLOPPY} = 1;
460             } else {
461 1         3 $self->{_SLOPPY} = 0;
462             }
463              
464 2         5 return;
465             }
466              
467             =head2 initial_read( [number] )
468              
469             Don't call this unless you actually have a good reason. When
470             the Tokenizer reads from a file, it first attempts to work out
471             what the correct input record-seperator should be, by reading
472             some characters from the file handle. This value starts off
473             as 512, which is twice the amount of characters that version 1.7
474             of the RTF specification says you should go before including a
475             line feed if you're writing RTF.
476              
477             Called with no argument, this returns the current value of the
478             number of characters we're going to read. Called with a numeric
479             argument, it sets the number of characters we'll read.
480              
481             You really don't need to use this method.
482              
483             =cut
484              
485             sub initial_read {
486 4     4 1 462 my $self = shift;
487 4 100       11 if (@_) { $self->{_INITIAL_READ} = shift }
  1         2  
488 4         16 return $self->{_INITIAL_READ};
489             }
490              
491             =head2 debug( [number] )
492              
493             Returns (non-destructively) the next 50 characters from the buffer,
494             OR, the number of characters you specify. Printing these to STDERR,
495             causing fatal errors, and the like, are left as an exercise to the
496             programmer.
497              
498             Note the part about 'from the buffer'. It really means that, which means
499             if there's nothing in the buffer, but still stuff we're reading from a
500             file it won't be shown. Chances are, if you're using this function, you're
501             debugging. There's an internal method called C<_get_line>, which is called
502             without arguments (C<$self->_get_line()>) that's how we get more stuff into
503             the buffer when we're reading from filehandles. There's no guarentee that'll
504             stay, or will always work that way, but, if you're debugging, that shouldn't
505             matter.
506              
507             =cut
508              
509             sub debug {
510 4     4 1 683 my $self = shift;
511 4   100     16 my $number = shift || 50;
512              
513 4         21 return substr( $self->{_BUFFER}, 0, $number );
514             }
515              
516             # Work with control characters
517              
518             # It's ugly to repeat myself here, but I believe having two literal re's
519             # here is going to offer a small performance benefit over a regex with
520             # a scalar in it.
521             my $control_word_standard_re = qr/
522             ^([a-z]{1,32}) # Lowercase word
523             (-?\d+)? # Optional signed number
524             (?:\s|(?=[^a-z0-9])) # Either whitespace, which we gobble or a
525             # non alpha-numeric, which we leave
526             /ix;
527             my $control_word_whitespace_re = qr/
528             ^([a-z]{1,32}) # Lowercase word
529             (-?\d+)? # Optional signed number
530             (\s*)? # Capture trailing whitespace
531             /ix;
532              
533             sub _grab_control {
534 5200     5200   3962 my $self = shift;
535              
536 5200 100       6127 my $whitespace_re =
537             $self->{_WHITESPACE} ? $control_word_whitespace_re :
538             $control_word_standard_re;
539              
540             # Check for a star here, as it simplifies our regex below,
541             # and it occurs pretty often
542 5200 100 66     26893 if ( $self->{_BUFFER} =~ s/^\*// ) {
    100          
    100          
    100          
    100          
    50          
    50          
    100          
    100          
543 59         117 return ( '*', '' );
544              
545             # A standard control word:
546             } elsif ( $self->{_BUFFER} =~ s/$whitespace_re// ) {
547             # Return the control word, unless it's a \bin
548 5118         4584 my $param = '';
549 5118 100       9209 $param = $2 if defined($2);
550              
551 5118         3370 my @whitespace;
552 5118 100       7241 if ( $self->{_WHITESPACE} ) {
553 2549 50       4767 push( @whitespace, defined $3 ? $3 : '' );
554             }
555              
556 5118 100       19549 return ( $1, $param, @whitespace ) unless $1 eq 'bin';
557              
558             # Pre-grab the binary data, and return the control word
559 4         6 my $byte_count = $2;
560 4         8 $self->_grab_bin($byte_count);
561 2         8 return ( 'bin', $byte_count, @whitespace );
562              
563             # hex-dec character (escape)
564             } elsif ( $self->{_BUFFER} =~ s/^'([0-9a-f]{2})//i ) {
565 10 100       31 $self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
566 10         38 return ( "'", $1 );
567              
568             # Control symbol (escape)
569             } elsif ( $self->{_BUFFER} =~ s/^([-_~:|{}'\\])// ) {
570 1 50       4 $self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
571 1         4 return ( $1, '' );
572              
573             # Escaped whitespace (ew, but allowed)
574             } elsif ( $self->{_BUFFER} =~ s/^[\r\n]// ) {
575 3         16 return ( 'par', '' );
576              
577             # Escaped tab (ew, but allowed)
578             } elsif ( $self->{_BUFFER} =~ s/^\t// ) {
579 0         0 return ( 'tab', '' );
580              
581             # Escaped semi-colon - this is WRONG
582             } elsif ( $self->{_BUFFER} =~ s/^\;// ) {
583 0         0 carp(
584             "Your RTF contains an escaped semi-colon. This isn't allowed, but we'll let you have it back as a literal for now. See the RTF spec."
585             );
586 0         0 return ( ';', '' );
587              
588             # Unicode characters
589             } elsif ( $self->{_BUFFER} =~ s/^u(\d+)// ) {
590 4         18 return ( 'u', $1 );
591              
592             # Allow incorrect control words
593             } elsif ( ( $self->{_SLOPPY} ) &&
594             ( $self->{_BUFFER} =~ s/^([a-z]{1,32})(-?\d+)//i ) )
595             {
596 2         5 my $param = '';
597 2 50       12 $param = $2 if defined($2);
598              
599 2         11 return ( $1, $param );
600             }
601              
602             # If we get here, something has gone wrong. First we'll create
603             # a human readable section of RTF to show the user.
604 3         13 my $die_string = substr( $self->{_BUFFER}, 0, 50 );
605 3         8 $die_string =~ s/\r/[R]/g;
606              
607             # Get angry with the user
608 3         80 carp
609             "Your RTF is broken, trying to recover to nearest group from '\\$die_string'\n";
610 3         2976 carp
611             "Chances are you have some RTF like \\control1plaintext. Which is illegal. But you can allow that by passing the 'sloppy' attribute to new() or using the sloppy() method. Please also write to and abuse the developer of the software which wrote your RTF :-)\n";
612              
613             # Kill everything until the next group
614 3         2767 $self->{_BUFFER} =~ s/^.+?([}{])/$1/;
615 3         14 return ( '', '' );
616             }
617              
618             # A first stab at grabbing binary data
619             sub _grab_bin {
620 4     4   6 my $self = shift;
621 4         6 my $bytes = shift;
622              
623             # If the buffer is too small, attempt to read in some more data...
624 4         18 while ( length( $self->{_BUFFER} ) < $bytes ) {
625              
626             # If there's no filehandle, or the one we have is eof, complain
627 2 50 66     12 if ( !$self->{_FILEHANDLE} || $self->{_FILEHANDLE}->eof ) {
628 2         349 croak "\\bin is asking for $bytes characters, but there are only " .
629             length( $self->{_BUFFER} ) . " left.";
630             }
631              
632             # Try and read in more data
633 0         0 $self->_get_line;
634             }
635              
636             # Return the right number of characters
637 2         7 $self->{_BINARY_DATA} = substr( $self->{_BUFFER}, 0, $bytes, '' );
638             }
639              
640             =head1 NOTES
641              
642             To avoid intrusively deep parsing, if an alternative ASCII
643             representation is available for a Unicode entity, and that
644             ASCII representation contains C<{>, or C<\>, by themselves, things
645             will go I. But I'm not convinced either of those is
646             allowed by the spec.
647              
648             =head1 AUTHOR
649              
650             Pete Sergeant -- C
651              
652             =head1 LICENSE
653              
654             Copyright B.
655              
656             This program is free software; you can redistribute it and/or modify it under
657             the same terms as Perl itself.
658              
659             =cut
660              
661             1;