File Coverage

blib/lib/RTF/Parser.pm
Criterion Covered Total %
statement 117 134 87.3
branch 38 50 76.0
condition 1 3 33.3
subroutine 27 34 79.4
pod 15 16 93.7
total 198 237 83.5


line stmt bran cond sub pod time code
1             package RTF::Parser;
2             $RTF::Parser::VERSION = '1.12';
3 14     14   135526 use strict;
  14         27  
  14         470  
4 14     14   72 use warnings;
  14         20  
  14         853  
5              
6             =head1 NAME
7              
8             RTF::Parser - A DEPRECATED event-driven RTF Parser
9              
10             =head1 VERSION
11              
12             version 1.12
13              
14             =head1 DESCRIPTION
15              
16             A DEPRECATED event-driven RTF Parser
17              
18             =head1 PUBLIC SERVICE ANNOUNCEMENT
19              
20             B
21             Need rtf2*? Google for pandoc.>
22              
23             A very short history lesson...
24              
25             C<1.07> of this module was released in 1999 by the original author,
26             Philippe Verdret. I took over the module around 2004 with high intentions. I
27             added almost all of the POD, all of the tests, and most of the comments, and
28             rejigged the whole thing to use L for tokenizing the incoming
29             RTF, which fixed a whole class of problems.
30              
31             The big problem is really that the whole module is an API which happens to have
32             C and C stuck on top of it. Any serious changes involve
33             breaking the API, and that seems the greater sin than telling people to go and
34             get themselves a better RTF convertor suite.
35              
36             I had high hopes of overhauling the whole thing, but it didn't happen. I handed
37             over maintainership some years later, but no new version was forthcoming, and
38             the module has languished since then. There are many open bugs on rt.cpan.org
39             and in the reviews.
40              
41             In a moment of weakness, I've picked up the module again with the aim of
42             adding this message, fixing one or two very minor bugs, and putting a version
43             that doesn't have B in big red letters on the CPAN.
44              
45             I doubt I'll ever tackle the bigger bugs (Unicode support), but I will accept
46             patches I can understand.
47              
48             =head1 IMPORTANT HINTS
49              
50             RTF parsing is non-trivial. The inner workings of these modules are somewhat
51             scary. You should go and read the 'Introduction' document included with this
52             distribution before going any further - it explains how this distribution fits
53             together, and is B reading.
54              
55             If you just want to convert RTF to HTML or text, from inside your own script,
56             jump straight to the docs for L or L
57             respectively.
58              
59             =head1 SUBCLASSING RTF::PARSER
60              
61             When you subclass RTF::Parser, you'll want to do two things. You'll firstly
62             want to overwrite the methods below described as the API. This describes what
63             we do when we have tokens that aren't control words (except 'symbols' - see below).
64              
65             Then you'll want to create a hash that maps control words to code references
66             that you want executed. They'll get passed a copy of the RTF::Parser object,
67             the name of the control word (say, 'b'), any arguments passed with the control
68             word, and then 'start'.
69              
70             =head2 An example...
71              
72             The following code removes bold tags from RTF documents, and then spits back
73             out RTF.
74              
75             {
76              
77             # Create our subclass
78              
79             package UnboldRTF;
80              
81             # We'll be doing lots of printing without newlines, so don't buffer output
82              
83             $|++;
84              
85             # Subclassing magic...
86              
87             use RTF::Parser;
88             @UnboldRTF::ISA = ( 'RTF::Parser' );
89              
90             # Redefine the API nicely
91              
92             sub parse_start { print STDERR "Starting...\n"; }
93             sub group_start { print '{' }
94             sub group_end { print '}' }
95             sub text { print "\n" . $_[1] }
96             sub char { print "\\\'$_[1]" }
97             sub symbol { print "\\$_[1]" }
98             sub parse_end { print STDERR "All done...\n"; }
99              
100             }
101              
102             my %do_on_control = (
103              
104             # What to do when we see any control we don't have
105             # a specific action for... In this case, we print it.
106              
107             '__DEFAULT__' => sub {
108              
109             my ( $self, $type, $arg ) = @_;
110             $arg = "\n" unless defined $arg;
111             print "\\$type$arg";
112              
113             },
114              
115             # When we come across a bold tag, we just ignore it.
116              
117             'b' => sub {},
118              
119             );
120              
121             # Grab STDIN...
122              
123             my $data = join '', (<>);
124              
125             # Create an instance of the class we created above
126              
127             my $parser = UnboldRTF->new();
128              
129             # Prime the object with our control handlers...
130              
131             $parser->control_definition( \%do_on_control );
132              
133             # Don't skip undefined destinations...
134              
135             $parser->dont_skip_destinations(1);
136              
137             # Start the parsing!
138              
139             $parser->parse_string( $data );
140              
141             =head1 METHODS
142              
143             =cut
144              
145              
146 14     14   68 use vars qw($VERSION);
  14         34  
  14         835  
147              
148 14     14   75 use Carp;
  14         22  
  14         1449  
149 14     14   28035 use RTF::Tokenizer 1.01;
  14         281456  
  14         456  
150 14     14   16585 use RTF::Config;
  14         36  
  14         1836  
151              
152             my $DEBUG = 0;
153              
154             # Debugging stuff I'm leaving in in case someone is using it..,
155 14     14   79 use constant PARSER_TRACE => 0;
  14         26  
  14         8365  
156              
157             sub backtrace {
158 0     0 0 0 Carp::confess;
159             }
160              
161             $SIG{'INT'} = \&backtrace if PARSER_TRACE;
162             $SIG{__DIE__} = \&backtrace if PARSER_TRACE;
163              
164             =head2 new
165              
166             Creates a new RTF::Parser object. Doesn't accept any arguments.
167              
168             =cut
169              
170             sub new {
171              
172             # Get the real class name
173 17     17 1 7688 my $proto = shift;
174 17   33     113 my $class = ref($proto) || $proto;
175              
176 17         42 my $self = {};
177              
178 17 100       104 $self->{_RTF_CONTROL_USED}++ if $INC{'RTF/Control.pm'};
179              
180 17         44 $self->{_DONT_SKIP_DESTINATIONS} = 0;
181              
182 17         46 bless $self, $class;
183              
184 17         52 return $self;
185              
186             }
187              
188             # For backwards compatability, we import RTF::Control's %do_on_control
189             # if we've loaded RTF::Control (which would suggest we're being subclassed
190             # by RTF::Control). This isn't nice or pretty, but it doesn't break things.
191             # I'd do this in new() but there's no guarentee it'll be set by then...
192              
193             sub _install_do_on_control {
194              
195 14     14   22 my $self = shift;
196              
197 14 100       68 return if $self->{_DO_ON_CONTROL};
198              
199 8 100       28 if ( $self->{_RTF_CONTROL_USED} ) {
200              
201 4         20 $self->{_DO_ON_CONTROL} = \%RTF::Control::do_on_control;
202              
203             } else {
204              
205 4         13 $self->{_DO_ON_CONTROL} = {};
206              
207             }
208              
209             }
210              
211             =head2 parse_stream( \*FH )
212              
213             This function used to accept a second parameter - a function specifying how
214             the filehandle should be read. This is deprecated, because I could find no
215             examples of people using it, nor could I see why people might want to use it.
216              
217             Pass this function a reference to a filehandle (or, now, a filename! yay) to
218             begin reading and processing.
219              
220             =cut
221              
222             sub parse_stream {
223              
224 1     1 1 6 my $self = shift;
225 1         2 my $stream = shift;
226 1         1 my $reader = shift;
227              
228 1         5 $self->_install_do_on_control();
229              
230 1 50       3 die("parse_stream no longer accepts a reader") if $reader;
231              
232             # Put an appropriately primed RTF::Tokenizer object into our object
233 1         10 $self->{_TOKENIZER} = RTF::Tokenizer->new( file => $stream );
234              
235 1         110 $self->_parse();
236              
237 1         2 return $self;
238              
239             }
240              
241             =head2 parse_string( $string )
242              
243             Pass this function a string to begin reading and processing.
244              
245             =cut
246              
247             sub parse_string {
248              
249 13     13 1 478 my $self = shift;
250 13         24 my $string = shift;
251              
252 13         91 $self->_install_do_on_control();
253              
254             # Put an appropriately primed RTF::Tokenizer object into our object
255 13         79 $self->{_TOKENIZER} = RTF::Tokenizer->new( string => $string );
256              
257 13         549 $self->_parse();
258              
259 13         44 return $self;
260              
261             }
262              
263             =head2 control_definition
264              
265             The code that's executed when we trigger a control event is kept
266             in a hash. We're holding this somewhere in our object. Earlier
267             versions would make the assumption we're being subclassed by
268             RTF::Control, which isn't something I want to assume. If you are
269             using RTF::Control, you don't need to worry about this, because
270             we're grabbing %RTF::Control::do_on_control, and using that.
271              
272             Otherwise, you pass this method a reference to a hash where the keys
273             are control words, and the values are coderefs that you want executed.
274             This sets all the callbacks... The arguments passed to your coderefs
275             are: $self, control word itself (like, say, 'par'), any parameter the
276             control word had, and then 'start'.
277              
278             If you don't pass it a reference, you get back the reference of the
279             current control hash we're holding.
280              
281             =cut
282              
283             sub control_definition {
284              
285 4     4 1 31 my $self = shift;
286              
287 4 100       14 if (@_) {
288              
289 2 50       9 if ( ref $_[0] eq 'HASH' ) {
290              
291 2         8 $self->{_DO_ON_CONTROL} = shift;
292              
293             } else {
294              
295 0         0 die "argument of control_definition() method must be an HASHREF";
296              
297             }
298              
299             } else {
300              
301 2         15 return $self->{_DO_ON_CONTROL};
302              
303             }
304              
305             }
306              
307             =head2 rtf_control_emulation
308              
309             If you pass it a boolean argument, it'll set whether or not it thinks RTF::Control
310             has been loaded. If you don't pass it an argument, it'll return what it thinks...
311              
312             =cut
313              
314             sub rtf_control_emulation {
315              
316 0     0 1 0 my $self = shift;
317 0         0 my $bool = shift;
318              
319 0 0       0 if ( defined $bool ) {
320              
321 0         0 $self->{_RTF_CONTROL_USED} = $bool;
322              
323             } else {
324              
325 0         0 return $self->{_RTF_CONTROL_USED};
326              
327             }
328              
329             }
330              
331             =head2 dont_skip_destinations
332              
333             The RTF spec says that we skip any destinations that we don't have an explicit
334             handler for. You could well not want this. Accepts a boolean argument, true
335             to process destinations, 0 to skip the ones we don't understand.
336              
337             =cut
338              
339             sub dont_skip_destinations {
340              
341 2     2 1 24 my $self = shift;
342 2         4 my $bool = shift;
343              
344 2         6 $self->{_DONT_SKIP_DESTINATIONS} = $bool;
345              
346             }
347              
348             # This is how he decided to call control actions. Leaving
349             # it to do the right thing at the moment... Users of the
350             # module don't need to know our dirty little secret...
351              
352             {
353              
354             package RTF::Action;
355             $RTF::Action::VERSION = '1.12';
356 14     14   83 use RTF::Config;
  14         18  
  14         1450  
357              
358 14     14   68 use vars qw($AUTOLOAD);
  14         24  
  14         1919  
359              
360             my $default;
361              
362             # The original RTF::Parser allowed $LOGFILE to be set
363             # that made RTF::Config do fun things. We're allowing it
364             # to, but wrapping it up a bit more carefully...
365             if ($LOG_FILE) {
366              
367             $default = sub { $RTF::Control::not_processed{ $_[1] }++ }
368              
369             }
370              
371             my $sub;
372              
373             sub AUTOLOAD {
374              
375 61     61   90 my $self = $_[0];
376              
377 61         238 $AUTOLOAD =~ s/^.*:://;
378              
379 14     14   83 no strict 'refs';
  14         31  
  14         12758  
380              
381 61 100       196 if ( defined( $sub = $self->{_DO_ON_CONTROL}->{$AUTOLOAD} ) ) {
382              
383             # Yuck, empty if. But we're just going to leave it for a while
384              
385             } else {
386              
387 36 50       97 if ($default) {
    100          
388              
389 0         0 $sub = $default
390              
391             } elsif ( $self->{_DO_ON_CONTROL}->{'__DEFAULT__'} ) {
392              
393 10         17 $sub = $self->{_DO_ON_CONTROL}->{'__DEFAULT__'};
394              
395             } else {
396              
397 26     27   148 $sub = sub { };
  27         64  
398              
399             }
400              
401             }
402              
403             # I don't understand why he's using goto here...
404 61         157 *$AUTOLOAD = $sub;
405 61         194 goto &$sub;
406              
407             }
408              
409             }
410              
411             =head1 API
412              
413             These are some methods that you're going to want to over-ride if you
414             subclass this modules. In general though, people seem to want to subclass
415             RTF::Control, which subclasses this module.
416              
417             =head2 parse_start
418              
419             Called before we start parsing...
420              
421             =head2 parse_end
422              
423             Called when we're finished parsing
424              
425             =head2 group_start
426              
427             Called when we encounter an opening {
428              
429             =head2 group_end
430              
431             Called when we encounter a closing }
432              
433             =head2 text
434              
435             Called when we encounter plain-text. Is given the text as its
436             first argument
437              
438             =head2 char
439              
440             Called when we encounter a hex-escaped character. The hex characters
441             are passed as the first argument.
442              
443             =head2 symbol
444              
445             Called when we come across a control character. This is interesting, because,
446             I'd have treated these as control words, so, I'm using Philippe's list as control
447             words that'll trigger this for you. These are C<-_~:|{}*'\>. This needs to be
448             tested.
449              
450             =head2 bitmap
451              
452             Called when we come across a command that's talking about a linked bitmap
453             file. You're given the file name.
454              
455             =head2 binary
456              
457             Called when we have binary data. You get passed it.
458              
459             =cut
460              
461 4     4 1 8 sub parse_start { }
462 4     4 1 5 sub parse_end { }
463 4     4 1 7 sub group_start { }
464 4     4 1 6 sub group_end { }
465 0     0 1 0 sub text { }
466 0     0 1 0 sub char { }
467 0     0 1 0 sub symbol { } # -_~:|{}*'\
468 0     0 1 0 sub bitmap { } # \{bm(?:[clr]|cwd)
469 0     0 1 0 sub binary { }
470              
471             # This is the big, bad parse routine that isn't called directly.
472             # We loop around RTF::Tokenizer, making event calls when we need to.
473              
474             sub _parse {
475              
476             # Read in our object
477 14     14   26 my $self = shift;
478              
479             # Execute any pre-parse subroutines
480 14         75 $self->parse_start();
481              
482             # Loop until we find the EOF
483 14         25 while (1) {
484              
485             # Read in our initial token
486 174         711 my ( $token_type, $token_argument, $token_parameter ) =
487             $self->{_TOKENIZER}->get_token();
488              
489             # Control words
490 174 100       3642 if ( $token_type eq 'control' ) {
    100          
    100          
491              
492             # We have a special handler for control words
493 79         317 $self->_control( $token_argument, $token_parameter );
494              
495             # Plain text
496             } elsif ( $token_type eq 'text' ) {
497              
498             # Send it to the text() routine
499 34         258 $self->text($token_argument);
500              
501             # Groups
502             } elsif ( $token_type eq 'group' ) {
503              
504             # Call the appropriate handler
505 47 100       195 $token_argument ? $self->group_start :
506             $self->group_end;
507              
508             # EOF
509             } else {
510              
511 14         28 last;
512              
513             }
514              
515             }
516              
517             # All done
518 14         63 $self->parse_end();
519 14         25 $self;
520              
521             }
522              
523             # Control word handler (yeuch)
524             # purl, be RTF barbie is Control words are *HARD*!
525             sub _control {
526              
527 79     79   107 my $self = shift;
528 79         95 my $type = shift;
529 79         90 my $arg = shift;
530              
531             # standard, control_symbols, hex
532              
533             # Funky destination
534 79 100       313 if ( $type eq '*' ) {
    50          
    100          
    100          
535              
536             # We might actually want to process it...
537 2 100       5 if ( $self->{_DONT_SKIP_DESTINATIONS} ) {
538              
539 1         3 $self->_control_execute('*');
540              
541             } else {
542              
543             # Grab the next token
544 1         3 my ( $token_type, $token_argument, $token_parameter ) =
545             $self->{_TOKENIZER}->get_token();
546              
547             # Basic sanity check
548 1 50       20 croak('Malformed RTF - \* not followed by a control...')
549             unless $token_type eq 'control';
550              
551             # Do we have a handler for it?
552 1 50       3 if ( defined $self->{_DO_ON_CONTROL}->{$token_argument} ) {
553 0         0 $self->_control_execute( $token_argument, $token_parameter );
554             } else {
555 1         4 $self->_skip_group();
556 1         3 $self->group_end();
557             }
558             }
559              
560             # Binary data
561             } elsif ( $type eq 'bin' ) {
562              
563             # Grab the next token
564 0         0 my ( $token_type, $token_argument, $token_parameter ) =
565             $self->{_TOKENIZER}->get_token();
566              
567             # Basic sanity check
568 0 0       0 croak('Malformed RTF - \bin not followed by text...')
569             unless $token_type eq 'text';
570              
571             # Send it to the handler
572 0         0 $self->binary($token_argument);
573              
574             # Implement a bitmap handler here
575              
576             # Control symbols
577             } elsif ( $type =~ m/[-_~:|{}*\\]/ ) {
578              
579             # Send it to the handler
580 1         4 $self->symbol($type);
581              
582             # Entity
583             } elsif ( $type eq "'" ) {
584              
585             # Entity handler
586 3         12 $self->char($arg);
587              
588             # Some other control type - give it to the control executer
589             } else {
590              
591             # Pass it to our default executer
592 73         206 $self->_control_execute( $type, $arg )
593              
594             }
595              
596             }
597              
598             # Control word executer (this is nasty)
599             sub _control_execute {
600              
601 74     74   80 my $self = shift;
602 74         141 my $type = shift;
603 74         83 my $arg = shift;
604              
605 14     14   86 no strict 'refs';
  14         26  
  14         2488  
606 74         77 &{"RTF::Action::$type"}( $self, $type, $arg, 'start' );
  74         612  
607              
608             }
609              
610             # Skip a group
611             sub _skip_group {
612              
613 1     1   2 my $self = shift;
614              
615 1         1 my $level_counter = 1;
616              
617 1         3 while ($level_counter) {
618              
619             # Get a token
620 2         5 my ( $token_type, $token_argument, $token_parameter ) =
621             $self->{_TOKENIZER}->get_token();
622              
623             # Make sure we can't loop forever
624 2 50       31 last if $token_type eq 'eof';
625              
626             # We're in business if it's a group
627 2 100       6 if ( $token_type eq 'group' ) {
628              
629 1 50       6 $token_argument ? $level_counter++ :
630             $level_counter--;
631              
632             }
633              
634             }
635              
636             }
637              
638             1;
639              
640             =head1 AUTHOR
641              
642             Peter Sergeant C, originally by Philippe Verdret
643              
644             =head1 COPYRIGHT
645              
646             Copyright 2004 B.
647              
648             This program is free software; you can redistribute it and/or modify it under
649             the same terms as Perl itself.
650              
651             =head1 CREDITS
652              
653             This work was carried out under a grant generously provided by The Perl Foundation -
654             give them money!