File Coverage

blib/lib/RTF/Tokenizer.pm

Criterion	Covered	Total	%
statement	152	157	96.8
branch	89	100	89.0
condition	11	14	78.5
subroutine	19	19	100.0
pod	9	9	100.0
total	280	299	93.6

line	stmt	bran	cond	sub	pod	time	code
1							#!perl
2							# RTF::Tokenizer - Peter Sergeant
3
4							=head1 NAME
5
6							RTF::Tokenizer - Tokenize RTF
7
8							=head1 VERSION
9
10							version 1.19
11
12							=head1 DESCRIPTION
13
14							Tokenizes RTF
15
16							=head1 SYNOPSIS
17
18							use RTF::Tokenizer;
19
20							# Create a tokenizer object
21							my $tokenizer = RTF::Tokenizer->new();
22
23							my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}' );
24							my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}', note_escapes => 1 );
25
26							my $tokenizer = RTF::Tokenizer->new( file => \*STDIN );
27							my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf' );
28							my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf', sloppy => 1 );
29
30							# Populate it from a file
31							$tokenizer->read_file('filename.txt');
32
33							# Or a file handle
34							$tokenizer->read_file( \*STDIN );
35
36							# Or a string
37							$tokenizer->read_string( '{\*\some rtf}' );
38
39							# Get the first token
40							my ( $token_type, $argument, $parameter ) = $tokenizer->get_token();
41
42							# Ooops, that was wrong...
43							$tokenizer->put_token( 'control', 'b', 1 );
44
45							# Let's have the lot...
46							my @tokens = $tokenizer->get_all_tokens();
47
48							=head1 INTRODUCTION
49
50							This documentation assumes some basic knowledge of RTF.
51							If you lack that, go read The_RTF_Cookbook:
52
53							L
54
55							=cut
56
57							require 5;
58
59							package RTF::Tokenizer;
60							$RTF::Tokenizer::VERSION = '1.19';
61	15			15		269318	use vars qw($VERSION);
	15					32
	15					946
62
63	15			15		84	use strict;
	15					23
	15					546
64	15			15		87	use warnings;
	15					22
	15					501
65	15			15		80	use Carp;
	15					27
	15					1549
66	15			15		9209	use IO::File;
	15					153385
	15					13474
67
68							=head1 METHODS
69
70							=head2 new()
71
72							Instantiates an RTF::Tokenizer object.
73
74							B:
75
76							C - calls the C method with the value provided after instantiation
77
78							C - calls the C method with the value provided after instantiation
79
80							C - boolean - whether to give RTF Escapes a token type of C (true) or
81							C (false, default)
82
83							C - boolean - whether or not to allow some illegal but common RTF sequences found
84							'in the wild'. As of C<1.08>, this currently only allows control words with a numeric
85							argument to have a text field right after with no delimiter, like:
86
87							\control1Plaintext
88
89							but this may change in future releases. Defaults false.
90
91							C - boolean - ... the RTF specification tells you to strip whitespace
92							which comes after control words, and newlines at the beginning and ending of text areas.
93							One result of that is that you can't actually round-trip the output of the tokenization
94							process. Turning this on is probably a bad idea, but someone cared enough to send me a
95							patch for it, so why not. Defaults false, and you should leave it that way.
96
97							=cut
98
99							sub new {
100							# Get the real class name in the highly unlikely event we've been
101							# called from an object itself.
102	54			54	1	449930	my $proto = shift;
103	54		66			330	my $class = ref($proto) \|\| $proto;
104
105							# Read in the named parameters
106	54					189	my %config = @_;
107
108	54					315	my $self = {
109							_BUFFER => '', # Stores read but unparsed RTF
110							_BINARY_DATA => '', # Temporary data store if we're reading a \bin
111							_FILEHANDLE => '', # Stores the active filehandle
112							_INITIAL_READ => 512
113							, # How many characters to read by default. 512 recommended by RTF spec
114							_UC => 1, # Default number of characters to count for \uc
115							};
116
117	54					136	bless $self, $class;
118
119							# Call the data-reading convenience methods if required
120	54	100				285	if ( $config{'file'} ) {
		100
121	5					37	$self->read_file( $config{'file'} );
122							} elsif ( $config{'string'} ) {
123	38					102	$self->read_string( $config{'string'} );
124							}
125
126							# Set up final config stuff
127	54					691	$self->{_NOTE_ESCAPES} = $config{'note_escapes'};
128	54					183	$self->{_SLOPPY} = $config{'sloppy'};
129	54					138	$self->{_WHITESPACE} = $config{'preserve_whitespace'};
130
131	54					210	return $self;
132
133							}
134
135							=head2 read_string( STRING )
136
137							Appends the string to the tokenizer-object's buffer
138							(earlier versions would over-write the buffer -
139							this version does not).
140
141							=cut
142
143							sub read_string {
144	48			48	1	1339	my $self = shift;
145	48					252	$self->{_BUFFER} .= shift;
146							}
147
148							=head2 read_file( \*FILEHANDLE )
149
150							=head2 read_file( $IO_File_object )
151
152							=head2 read_file( 'filename' )
153
154							Appends a chunk of data from the filehandle to the buffer,
155							and remembers the filehandle, so if you ask for a token,
156							and the buffer is empty, it'll try and read the next line
157							from the file (earlier versions would over-write the buffer -
158							this version does not).
159
160							This chunk is 500 characters, and then whatever is left until
161							the next occurrence of the IRS (a newline character in this case).
162							If for whatever reason, you want to change that number to something
163							else, use C.
164
165							=cut
166
167							sub read_file {
168
169	12			12	1	4736	my $self = shift;
170	12					20	my $file = shift;
171
172							# Accept a filehandle referenced via a GLOB
173	12	100				44	if ( ref $file eq 'GLOB' ) {
		100
		100
		100
174	2					18	$self->{_FILEHANDLE} = IO::File->new_from_fd( $file, '<' );
175	2	100				330	croak
176							"Couldn't create an IO::File object from the reference you specified"
177							unless $self->{_FILEHANDLE};
178
179							# Accept IO::File and subclassed objects
180							} elsif (
181							eval {
182	10					149	$file->isa('IO::File');
183							} )
184							{
185	1					21	$self->{_FILEHANDLE} = $file;
186
187							# This is undocumented, because you shouldn't use it. Don't rely on it.
188							} elsif ( ref $file eq 'IO::Scalar' ) {
189	4					17	$self->{_FILEHANDLE} = $file;
190
191							# If it's not a reference, assume it's a filename
192							} elsif ( !ref $file ) {
193	4					28	$self->{_FILEHANDLE} = IO::File->new("< $file");
194	4	100				615	croak "Couldn't open '$file' for reading" unless $self->{_FILEHANDLE};
195
196							# Complain if we get anything else
197							} else {
198	1					112	croak "You passed a reference to read_file of type " . ref($file) .
199							" which isn't an allowed type";
200							}
201
202							# Check what our line-endings seem to be, then set $self->{_IRS} accordingly.
203							# This also reads in the first few lines as a side effect.
204	9					45	$self->_line_endings;
205							}
206
207							# Reads a line from an IO:File'ish object
208							sub _get_line {
209	20			20		31	my $self = shift();
210
211							# Localize the input record separator before changing it so
212							# we don't mess up any other part of the application running
213							# us that relies on it
214	20					86	local $/ = $self->{_IRS};
215
216							# Read the line itself
217	20					231	my $line = $self->{_FILEHANDLE}->getline();
218	20	50				823	$self->{_BUFFER} .= $line if defined $line;
219							}
220
221							# Determine what kind of line-endings the file uses
222
223							sub _line_endings {
224	12			12		2245	my $self = shift();
225
226	12					18	my $temp_buffer;
227	12					64	$self->{_FILEHANDLE}->read( $temp_buffer, $self->{_INITIAL_READ} );
228
229							# This catches all allowed cases
230	12	50				354	if ( $temp_buffer =~ m/(\cM\cJ\|\cM\|\cJ)/ ) {
231	12					45	$self->{_IRS} = $1;
232
233	12	100				50	$self->{_RS} = "Macintosh" if $self->{_IRS} eq "\cM";
234	12	100				45	$self->{_RS} = "Windows" if $self->{_IRS} eq "\cM\cJ";
235	12	100				39	$self->{_RS} = "UNIX" if $self->{_IRS} eq "\cJ";
236
237							} else {
238	0					0	$self->{_RS} = "Unknown";
239							}
240
241							# Add back to main buffer
242	12					41	$self->{_BUFFER} .= $temp_buffer;
243
244							# Call C<_get_line> again so we're sure we're not only
245							# reading half a line
246	12					39	$self->_get_line;
247
248							}
249
250							=head2 get_token()
251
252							Returns the next token as a three-item list: 'type', 'argument', 'parameter'.
253							Token is one of: C, C, C, C or C.
254
255							If you turned on C, then you may get a forth item for
256							C tokens.
257
258							=over
259
260							=item C
261
262							'type' is set to 'text'. 'argument' is set to the text itself. 'parameter'
263							is left blank. NOTE: C<\{>, C<\}>, and C<\\> are all returned as control words,
264							rather than rendered as text for you, as are C<\_>, C<\-> and friends.
265
266							=item C
267
268							'type' is 'control'. 'argument' is the control word or control symbol.
269							'parameter' is the control word's parameter if it has one - this will
270							be numeric, EXCEPT when 'argument' is a literal ', in which case it
271							will be a two-letter hex string.
272
273							If you turned on C, you'll get a forth item,
274							which will be the whitespace or a defined empty string.
275
276							=item C
277
278							'type' is 'group'. If it's the beginning of an RTF group, then
279							'argument' is 1, else if it's the end, argument is 0. 'parameter'
280							is not set.
281
282							=item C
283
284							End of file reached. 'type' is 'eof'. 'argument' is 1. 'parameter' is
285							0.
286
287							=item C
288
289							If you specifically turn on this functionality, you'll get an
290							C type, which is identical to C, only, it's
291							only returned for escapes.
292
293							=back
294
295							=cut
296
297							# Define a regular expression that matches characters which are 'text' -
298							# that is, they're not a backspace, a scoping brace, or discardable
299							# whitespace.
300							my $non_text_standard_re = qr/[^\\{}\r\n]/;
301							my $non_text_whitespace_re = qr/[^\\{}]/;
302
303							sub get_token {
304	5925			5925	1	53920	my $self = shift;
305
306							# If the last token we returned was \bin, we'll now have a
307							# big chunk of binary data waiting for the user, so send that
308							# back
309	5925	100				8676	if ( $self->{_BINARY_DATA} ) {
310	2					4	my $data = $self->{_BINARY_DATA};
311	2					5	$self->{_BINARY_DATA} = '';
312	2					9	return ( 'text', $data, '' );
313							}
314
315							# We might have a cached token, and if we do, we'll want to
316							# return that first
317	5923	100				8062	if ( $self->{_PUT_TOKEN_CACHE_FLAG} ) {
318							# Take the value from the cache
319	3					4	my @return_values = @{ pop( @{ $self->{_PUT_TOKEN_CACHE} } ) };
	3					3
	3					9
320
321							# Update the flag
322	3					3	$self->{_PUT_TOKEN_CACHE_FLAG} = @{ $self->{_PUT_TOKEN_CACHE} };
	3					5
323
324							# Give the user the token back
325	3					15	return @return_values;
326							}
327
328	5920	100				7043	my $non_text_re =
329							$self->{_WHITESPACE} ? $non_text_whitespace_re : $non_text_standard_re;
330
331							# Our main parsing loop
332	5920					4313	while (1) {
333
334	5955					7277	my $start_character = substr( $self->{_BUFFER}, 0, 1, '' );
335
336							# Most likely to be text, so we check for that first
337	5955	100				19726	if ( $start_character =~ $non_text_re ) {
		100
		100
		100
		100
338	15			15		151	no warnings 'uninitialized';
	15					31
	15					23135
339
340							# We want to return text fields that have newlines in as one
341							# token, which requires a bit of work, as we read in one line
342							# at a time from out files...
343	234					255	my $temp_text = '';
344
345	241					670	READTEXT:
346
347							# Grab all the next 'text' characters
348							$self->{_BUFFER} =~ s/^([^\\{}]+)//s;
349	241	100				629	$temp_text .= $1 if defined $1;
350
351							# If the buffer is empty, try reading in some more, and
352							# then go back to READTEXT to keep going. Now, the clever
353							# thing would be to assume that if the buffer IS empty
354							# then there MUST be more to read, which is true if we
355							# have well-formed input. We're going to assume that the
356							# input could well be a little broken.
357	241	100	100			628	if ( ( !$self->{_BUFFER} ) && ( $self->{_FILEHANDLE} ) ) {
358	7					67	$self->_get_line;
359	7	50				51	goto READTEXT if $self->{_BUFFER};
360							}
361
362							# Make sure we're not including newlines in our output,
363							# as RTF spec says they're to be ignored...
364	234	100				386	unless ( $self->{_WHITESPACE} ) {
365	124					386	$temp_text =~ s/(\cM\cJ\|\cM\|\cJ)//g;
366							}
367
368							# Give the user a shiny token back
369	234					993	return ( 'text', $start_character . $temp_text, '' );
370
371							# Second most likely to be a control character
372							} elsif ( $start_character eq "\\" ) {
373	5200					6184	my @args = $self->_grab_control();
374
375							# If the control word was an escape, and the user
376							# asked to be told separately about those, this
377							# will be set, so return an 'escape'. Otherwise,
378							# return the control word as a 'control'
379	5198	100				7682	if ( $self->{_TEMP_ESCAPE_FLAG} ) {
380	2					1	$self->{_TEMP_ESCAPE_FLAG} = 0;
381	2					12	return ( 'escape', @args );
382							} else {
383	5196					14302	return ( 'control', @args );
384							}
385
386							# Probably a group then
387							} elsif ( $start_character eq '{' ) {
388	224					548	return ( 'group', 1, '' );
389							} elsif ( $start_character eq '}' ) {
390	225					618	return ( 'group', 0, '' );
391
392							# No start character? Either we're at the end of our input,
393							# or we need some new input
394							} elsif ( !$start_character ) {
395							# If we were read from a string, we're all done
396	38	100				155	return ( 'eof', 1, 0 ) unless $self->{_FILEHANDLE};
397
398							# If we were read from a file, try and get some more stuff
399							# in to the buffer, or return the 'eof' character
400	1	50				7	return ( 'eof', 1, 0 ) if $self->{_FILEHANDLE}->eof;
401	1					12	$self->_get_line;
402	1	50				3	return ( 'eof', 1, 0 ) unless $self->{_BUFFER};
403							}
404							}
405							}
406
407							=head2 get_all_tokens
408
409							As per C, but keeps calling C until it hits EOF. Returns
410							a list of arrayrefs.
411
412							=cut
413
414							sub get_all_tokens {
415	34			34	1	42	my $self = shift;
416	34					37	my @tokens;
417
418	34					28	while (1) {
419	5822					7723	my $token = [ $self->get_token() ];
420	5822					5934	push( @tokens, $token );
421	5822	100				9079	last if $token->[0] eq 'eof';
422							}
423
424	34					1212	return @tokens;
425							}
426
427							=head2 put_token( type, token, argument )
428
429							Adds an item to the token cache, so that the next time you
430							call get_token, the arguments you passed here will be returned.
431							We don't check any of the values, so use this carefully. This
432							is on a first in last out basis.
433
434							=cut
435
436							sub put_token {
437	3			3	1	371	my $self = shift;
438
439	3					3	push( @{ $self->{_PUT_TOKEN_CACHE} }, [@_] );
	3					7
440
441							# No need to set this to the real value of the token cache, as
442							# it'll get set properly when we try and read a cached token.
443	3					4	$self->{_PUT_TOKEN_CACHE_FLAG} = 1;
444							}
445
446							=head2 sloppy( [bool] )
447
448							Decides whether we allow some types of broken RTF. See C's docs
449							for a little more explanation about this. Pass it 1 to turn it on, 0 to
450							turn it off. This will always return undef.
451
452							=cut
453
454							sub sloppy {
455	2			2	1	1868	my $self = shift;
456	2					5	my $bool = shift;
457
458	2	100				9	if ($bool) {
459	1					4	$self->{_SLOPPY} = 1;
460							} else {
461	1					3	$self->{_SLOPPY} = 0;
462							}
463
464	2					5	return;
465							}
466
467							=head2 initial_read( [number] )
468
469							Don't call this unless you actually have a good reason. When
470							the Tokenizer reads from a file, it first attempts to work out
471							what the correct input record-seperator should be, by reading
472							some characters from the file handle. This value starts off
473							as 512, which is twice the amount of characters that version 1.7
474							of the RTF specification says you should go before including a
475							line feed if you're writing RTF.
476
477							Called with no argument, this returns the current value of the
478							number of characters we're going to read. Called with a numeric
479							argument, it sets the number of characters we'll read.
480
481							You really don't need to use this method.
482
483							=cut
484
485							sub initial_read {
486	4			4	1	462	my $self = shift;
487	4	100				11	if (@_) { $self->{_INITIAL_READ} = shift }
	1					2
488	4					16	return $self->{_INITIAL_READ};
489							}
490
491							=head2 debug( [number] )
492
493							Returns (non-destructively) the next 50 characters from the buffer,
494							OR, the number of characters you specify. Printing these to STDERR,
495							causing fatal errors, and the like, are left as an exercise to the
496							programmer.
497
498							Note the part about 'from the buffer'. It really means that, which means
499							if there's nothing in the buffer, but still stuff we're reading from a
500							file it won't be shown. Chances are, if you're using this function, you're
501							debugging. There's an internal method called C<_get_line>, which is called
502							without arguments (C<$self->_get_line()>) that's how we get more stuff into
503							the buffer when we're reading from filehandles. There's no guarentee that'll
504							stay, or will always work that way, but, if you're debugging, that shouldn't
505							matter.
506
507							=cut
508
509							sub debug {
510	4			4	1	683	my $self = shift;
511	4		100			16	my $number = shift \|\| 50;
512
513	4					21	return substr( $self->{_BUFFER}, 0, $number );
514							}
515
516							# Work with control characters
517
518							# It's ugly to repeat myself here, but I believe having two literal re's
519							# here is going to offer a small performance benefit over a regex with
520							# a scalar in it.
521							my $control_word_standard_re = qr/
522							^([a-z]{1,32}) # Lowercase word
523							(-?\d+)? # Optional signed number
524							(?:\s\|(?=[^a-z0-9])) # Either whitespace, which we gobble or a
525							# non alpha-numeric, which we leave
526							/ix;
527							my $control_word_whitespace_re = qr/
528							^([a-z]{1,32}) # Lowercase word
529							(-?\d+)? # Optional signed number
530							(\s*)? # Capture trailing whitespace
531							/ix;
532
533							sub _grab_control {
534	5200			5200		3962	my $self = shift;
535
536	5200	100				6127	my $whitespace_re =
537							$self->{_WHITESPACE} ? $control_word_whitespace_re :
538							$control_word_standard_re;
539
540							# Check for a star here, as it simplifies our regex below,
541							# and it occurs pretty often
542	5200	100	66			26893	if ( $self->{_BUFFER} =~ s/^\*// ) {
		100
		100
		100
		100
		50
		50
		100
		100
543	59					117	return ( '*', '' );
544
545							# A standard control word:
546							} elsif ( $self->{_BUFFER} =~ s/$whitespace_re// ) {
547							# Return the control word, unless it's a \bin
548	5118					4584	my $param = '';
549	5118	100				9209	$param = $2 if defined($2);
550
551	5118					3370	my @whitespace;
552	5118	100				7241	if ( $self->{_WHITESPACE} ) {
553	2549	50				4767	push( @whitespace, defined $3 ? $3 : '' );
554							}
555
556	5118	100				19549	return ( $1, $param, @whitespace ) unless $1 eq 'bin';
557
558							# Pre-grab the binary data, and return the control word
559	4					6	my $byte_count = $2;
560	4					8	$self->_grab_bin($byte_count);
561	2					8	return ( 'bin', $byte_count, @whitespace );
562
563							# hex-dec character (escape)
564							} elsif ( $self->{_BUFFER} =~ s/^'([0-9a-f]{2})//i ) {
565	10	100				31	$self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
566	10					38	return ( "'", $1 );
567
568							# Control symbol (escape)
569							} elsif ( $self->{_BUFFER} =~ s/^([-_~:\|{}'\\])// ) {
570	1	50				4	$self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
571	1					4	return ( $1, '' );
572
573							# Escaped whitespace (ew, but allowed)
574							} elsif ( $self->{_BUFFER} =~ s/^[\r\n]// ) {
575	3					16	return ( 'par', '' );
576
577							# Escaped tab (ew, but allowed)
578							} elsif ( $self->{_BUFFER} =~ s/^\t// ) {
579	0					0	return ( 'tab', '' );
580
581							# Escaped semi-colon - this is WRONG
582							} elsif ( $self->{_BUFFER} =~ s/^\;// ) {
583	0					0	carp(
584							"Your RTF contains an escaped semi-colon. This isn't allowed, but we'll let you have it back as a literal for now. See the RTF spec."
585							);
586	0					0	return ( ';', '' );
587
588							# Unicode characters
589							} elsif ( $self->{_BUFFER} =~ s/^u(\d+)// ) {
590	4					18	return ( 'u', $1 );
591
592							# Allow incorrect control words
593							} elsif ( ( $self->{_SLOPPY} ) &&
594							( $self->{_BUFFER} =~ s/^([a-z]{1,32})(-?\d+)//i ) )
595							{
596	2					5	my $param = '';
597	2	50				12	$param = $2 if defined($2);
598
599	2					11	return ( $1, $param );
600							}
601
602							# If we get here, something has gone wrong. First we'll create
603							# a human readable section of RTF to show the user.
604	3					13	my $die_string = substr( $self->{_BUFFER}, 0, 50 );
605	3					8	$die_string =~ s/\r/[R]/g;
606
607							# Get angry with the user
608	3					80	carp
609							"Your RTF is broken, trying to recover to nearest group from '\\$die_string'\n";
610	3					2976	carp
611							"Chances are you have some RTF like \\control1plaintext. Which is illegal. But you can allow that by passing the 'sloppy' attribute to new() or using the sloppy() method. Please also write to and abuse the developer of the software which wrote your RTF :-)\n";
612
613							# Kill everything until the next group
614	3					2767	$self->{_BUFFER} =~ s/^.+?([}{])/$1/;
615	3					14	return ( '', '' );
616							}
617
618							# A first stab at grabbing binary data
619							sub _grab_bin {
620	4			4		6	my $self = shift;
621	4					6	my $bytes = shift;
622
623							# If the buffer is too small, attempt to read in some more data...
624	4					18	while ( length( $self->{_BUFFER} ) < $bytes ) {
625
626							# If there's no filehandle, or the one we have is eof, complain
627	2	50	66			12	if ( !$self->{_FILEHANDLE} \|\| $self->{_FILEHANDLE}->eof ) {
628	2					349	croak "\\bin is asking for $bytes characters, but there are only " .
629							length( $self->{_BUFFER} ) . " left.";
630							}
631
632							# Try and read in more data
633	0					0	$self->_get_line;
634							}
635
636							# Return the right number of characters
637	2					7	$self->{_BINARY_DATA} = substr( $self->{_BUFFER}, 0, $bytes, '' );
638							}
639
640							=head1 NOTES
641
642							To avoid intrusively deep parsing, if an alternative ASCII
643							representation is available for a Unicode entity, and that
644							ASCII representation contains C<{>, or C<\>, by themselves, things
645							will go I. But I'm not convinced either of those is
646							allowed by the spec.
647
648							=head1 AUTHOR
649
650							Pete Sergeant -- C
651
652							=head1 LICENSE
653
654							Copyright B.
655
656							This program is free software; you can redistribute it and/or modify it under
657							the same terms as Perl itself.
658
659							=cut
660
661							1;