File Coverage

blib/lib/Image/ExifTool/RTF.pm
Criterion Covered Total %
statement 119 142 83.8
branch 61 90 67.7
condition 9 21 42.8
subroutine 6 6 100.0
pod 0 3 0.0
total 195 262 74.4


line stmt bran cond sub pod time code
1             #------------------------------------------------------------------------------
2             # File: RTF.pm
3             #
4             # Description: Read Rich Text Format meta information
5             #
6             # Revisions: 2010/06/17 - P. Harvey Created
7             #
8             # References: 1) http://download.microsoft.com/download/2/f/5/2f599e18-07ee-4ec5-a1e7-f4e6a9423592/Word2007RTFSpec9.doc
9             # 2) http://search.cpan.org/dist/RTF-Writer/lib/RTF/Cookbook.pod
10             #------------------------------------------------------------------------------
11              
12             package Image::ExifTool::RTF;
13              
14 1     1   4576 use strict;
  1         2  
  1         37  
15 1     1   6 use vars qw($VERSION);
  1         2  
  1         44  
16 1     1   5 use Image::ExifTool qw(:DataAccess :Utils);
  1         2  
  1         2168  
17              
18             $VERSION = '1.04';
19              
20             sub ProcessUserProps($$$);
21              
22             # supported RTF character entities
23             my %rtfEntity = (
24             par => 0x0a,
25             tab => 0x09,
26             endash => 0x2013,
27             emdash => 0x2014,
28             lquote => 0x2018,
29             rquote => 0x2019,
30             ldblquote => 0x201c,
31             rdblquote => 0x201d,
32             bullet => 0x2022,
33             );
34              
35             # RTF tags (ref 1)
36             %Image::ExifTool::RTF::Main = (
37             GROUPS => { 2 => 'Document' },
38             NOTES => q{
39             This table lists standard tags of the RTF information group, but ExifTool
40             will also extract any non-standard tags found in this group. As well,
41             ExifTool will extract any custom properties that are found. See
42             L for the
43             specification.
44             },
45             title => { },
46             subject => { },
47             author => { Groups => { 2 => 'Author' } },
48             manager => { },
49             company => { },
50             copyright=> { Groups => { 2 => 'Author' } }, # (written by Apple TextEdit)
51             operator => { Name => 'LastModifiedBy' },
52             category => { },
53             keywords => { },
54             comment => { },
55             doccomm => { Name => 'Comments' },
56             hlinkbase=> { Name => 'HyperlinkBase' },
57             creatim => {
58             Name => 'CreateDate',
59             Format => 'date',
60             Groups => { 2 => 'Time' },
61             PrintConv => '$self->ConvertDateTime($val)',
62             },
63             revtim => {
64             Name => 'ModifyDate',
65             Format => 'date',
66             Groups => { 2 => 'Time' },
67             PrintConv => '$self->ConvertDateTime($val)',
68             },
69             printim => {
70             Name => 'LastPrinted',
71             Format => 'date',
72             Groups => { 2 => 'Time' },
73             PrintConv => '$self->ConvertDateTime($val)',
74             },
75             buptim => {
76             Name => 'BackupTime',
77             Format => 'date',
78             Groups => { 2 => 'Time' },
79             PrintConv => '$self->ConvertDateTime($val)',
80             },
81             edmins => {
82             Name => 'TotalEditTime', # in minutes
83             PrintConv => 'ConvertTimeSpan($val, 60)',
84             },
85             nofpages => { Name => 'Pages' },
86             nofwords => { Name => 'Words' },
87             nofchars => { Name => 'Characters' },
88             nofcharsws=>{
89             Name => 'CharactersWithSpaces',
90             Notes => q{
91             according to the 2007 Microsoft RTF specification this is clearly the number
92             of characters NOT including spaces, but Microsoft Word writes this as the
93             number WITH spaces, so ExifTool names this tag according to the de facto
94             standard
95             },
96             },
97             id => { Name => 'InternalIDNumber' },
98             version => { Name => 'RevisionNumber' },
99             vern => { Name => 'InternalVersionNumber' },
100             );
101              
102             # lookup for user-defined properties
103             # (none are pre-defined and this table doesn't appear in the docs)
104             %Image::ExifTool::RTF::UserProps = (
105             GROUPS => { 2 => 'Document' },
106             );
107              
108             #------------------------------------------------------------------------------
109             # Read to nested closing curly bracket "}"
110             # Inputs: 0) data ref, 1) optional RAF ref to read more data if available
111             # Returns: text inside brackets, or undef on error
112             # Notes: On entry the current position in the data must be set to immediately
113             # after the command that opens the bracket. On return the current
114             # position is immediately following the closing brace if the return
115             # value is defined.
116             sub ReadToNested($;$)
117             {
118 14     14 0 27 my ($dataPt, $raf) = @_;
119 14         25 my $pos = pos $$dataPt;
120 14         17 my $level = 1;
121 14         19 for (;;) {
122             # look for the next bracket
123 46 50       146 unless ($$dataPt =~ /(\\*)([{}])/g) {
124             # must read some more data
125 0         0 my $p = length $$dataPt;
126 0         0 my $buff;
127 0 0 0     0 last unless $raf and $raf->Read($buff, 65536);
128 0         0 $$dataPt .= $buff;
129             # rewind position to include any leading backslashes
130 0   0     0 --$p while $p and substr($$dataPt, $p - 1, 1) eq '\\';
131 0         0 pos($$dataPt) = $p; # set position to continue search
132 0         0 next;
133             }
134             # bracket is escaped if preceded by an odd number of backslashes
135 46 100 100     117 next if $1 and length($1) & 0x01;
136 42 100       78 $2 eq '{' and ++$level, next;
137 28 100       56 next unless --$level <= 0;
138 14         57 return substr($$dataPt, $pos, pos($$dataPt) - $pos - 1);
139             }
140 0         0 return undef;
141             }
142              
143             #------------------------------------------------------------------------------
144             # Unescape RTF escape sequences
145             # Inputs: 0) ExifTool ref, 1) RTF text, 2) RTF character set (for hex characters)
146             # Returns: Unescaped text (in current ExifTool Charset)
147             sub UnescapeRTF($$$)
148             {
149 11     11 0 24 my ($et, $val, $charset) = @_;
150              
151             # return now unless we have a control sequence
152 11 100       29 unless ($val =~ /\\/) {
153 5         11 $val =~ tr/\n\r//d; # ignore CR's and LF's
154 5         13 return $val;
155             }
156             # CR/LF is significant if it terminates a control sequence (so change these to a space)
157             # (was $val =~ s/(^|[^\\])((?:\\\\)*)(\\[a-zA-Z]+(?:-?\d+)?)[\n\r]/$1$2$3 /g;)
158 6 100       32 $val =~ s/\\(?:([a-zA-Z]+(?:-?\d+)?)[\n\r]|(.))/'\\'.($1 ? "$1 " : $2)/sge;
  35         142  
159             # protect the newline control sequence by converting to a \par command
160             # (was $val =~ s/(^|[^\\])((?:\\\\)*)(\\[\n\r])/$1$2\\par /g;)
161 6 100       28 $val =~ s/(\\[\n\r])|(\\.)/$2 || '\\par '/sge;
  35         151  
162             # all other CR/LF's are ignored (so delete them)
163 6         16 $val =~ tr/\n\r//d;
164              
165 6         14 my $rtnVal = '';
166 6         8 my $len = length $val;
167 6         10 my $skip = 1; # default Unicode skip count
168 6         9 my $p0 = 0;
169              
170 6         7 for (;;) {
171             # find next backslash
172 40 100       95 my $p1 = ($val =~ /\\/g) ? pos($val) : $len + 1;
173             # add text up to start of this control sequence (or up to end)
174 40         53 my $n = $p1 - $p0 - 1;
175 40 100       88 $rtnVal .= substr($val, $p0, $n) if $n > 0;
176             # all done if at the end or if control sequence is empty
177 40 100       93 last if $p1 >= $len;
178             # look for an ASCII-letter control word or Unicode control
179 34 100       108 if ($val =~ /\G([a-zA-Z]+)(-?\d+)? ?/g) {
180             # interpret command if recognized
181 25 100       67 if ($1 eq 'uc') { # \ucN
    100          
    50          
182 2         18 $skip = $2;
183             } elsif ($1 eq 'u') { # \uN
184 20 50       43 if ($2 < 0) {
185 0         0 $et->WarnOnce('Invalid Unicode character(s) in text');
186 0         0 $rtnVal .= '?';
187             } else {
188 20         62 require Image::ExifTool::Charset;
189 20         59 $rtnVal .= Image::ExifTool::Charset::Recompose($et, [$2]);
190 20 100       59 if ($skip) {
191             # must skip the specified number of characters
192             # (not simple because RTF control words count as a single character)
193 1 50       62 last unless $val =~ /\G([^\\]|\\([a-zA-Z]+)(-?\d+)? ?|\\'.{2}|\\.){$skip}/g;
194             }
195             }
196             } elsif ($rtfEntity{$1}) {
197 3         17 require Image::ExifTool::Charset;
198 3         12 $rtnVal .= Image::ExifTool::Charset::Recompose($et, [$rtfEntity{$1}]);
199             } # (else ignore the command)
200             } else {
201 9         17 my $ch = substr($val, $p1, 1);
202 9 100       23 if ($ch eq "'") {
203             # hex character code
204 5 50       14 last if $p1 + 3 > $len;
205 5         15 my $hex = substr($val, $p1 + 1, 2);
206 5 50       18 if ($hex =~ /^[0-9a-fA-F]{2}$/) {
207 5         614 require Image::ExifTool::Charset;
208 5         28 $rtnVal .= $et->Decode(chr(hex($hex)), $charset);
209             }
210 5         17 pos($val) = $p1 + 3; # skip to after the hex code
211             } else {
212             # assume a standard control symbol (\, {, }, etc)
213             # (note, this may not be valid for some uncommon
214             # control symbols like \~ for non-breaking space)
215 4         18 $rtnVal .= $ch;
216 4         9 pos($val) = $p1 + 1; # skip to after this character
217             }
218             }
219 34         55 $p0 = pos($val);
220             }
221 6         16 return $rtnVal;
222             }
223              
224             #------------------------------------------------------------------------------
225             # Read information in a RTF document
226             # Inputs: 0) ExifTool ref, 1) dirInfo ref
227             # Returns: 1 on success, 0 if this wasn't a valid RTF file
228             sub ProcessRTF($$)
229             {
230 1     1 0 4 my ($et, $dirInfo) = @_;
231 1         10 my $raf = $$dirInfo{RAF};
232 1         6 my ($buff, $buf2, $cs);
233              
234 1 50 33     5 return 0 unless $raf->Read($buff, 64) and $raf->Seek(0,0);
235 1 50       15 return 0 unless $buff =~ /^[\n\r]*\{[\n\r]*\\rtf[^a-zA-Z]/;
236 1         9 $et->SetFileType();
237             #
238             # determine the RTF character set
239             #
240 1 50       10 if ($buff=~ /\\ansicpg(\d*)/) {
    0          
241 1         5 $cs = "cp$1";
242             } elsif ($buff=~ /\\(ansi|mac|pc|pca)[^a-zA-Z]/) {
243 0         0 my %trans = (
244             ansi => 'Latin',
245             mac => 'MacRoman',
246             pc => 'cp437',
247             pca => 'cp850',
248             );
249 0         0 $cs = $trans{$1};
250             } else {
251 0         0 $et->Warn('Unspecified RTF encoding. Will assume Latin');
252 0         0 $cs = 'Latin';
253             }
254 1         5 my $charset = $Image::ExifTool::charsetName{lc $cs};
255 1 50       5 unless ($charset) {
256 0         0 $et->Warn("Unsupported RTF encoding $cs. Will assume Latin.");
257 0         0 $charset = 'Latin';
258             }
259 1         3 my $tagTablePtr = GetTagTable('Image::ExifTool::RTF::Main');
260 1         11 undef $buff;
261             #
262             # scan for \info group
263             #
264 1         4 for (;;) {
265 2 100       7 $raf->Read($buf2, 65536) or last;
266 1 50       11 if (defined $buff) {
267             # read more but leave some overlap for the match
268 0         0 $buff = substr($buff, -16) . $buf2;
269             } else {
270 1         10 $buff = $buf2;
271             }
272 1 50       22 next unless $buff =~ /[^\\]\{[\n\r]*\\info([^a-zA-Z])/g;
273             # anything but a space is included in the contents
274 1 50       10 pos($buff) = pos($buff) - 1 if $1 ne ' ';
275 1         8 my $info = ReadToNested(\$buff, $raf);
276 1 50       5 unless (defined $info) {
277 0         0 $et->Warn('Unterminated information group');
278 0         0 last;
279             }
280             # process info commands (eg. "\author", "\*\copyright");
281 1         12 while ($info =~ /\{[\n\r]*(\\\*[\n\r]*)?\\([a-zA-Z]+)([^a-zA-Z])/g) {
282 8 100       41 pos($info) = pos($info) - 1 if $3 ne ' ';
283 8         23 my $tag = $2;
284 8         19 my $val = ReadToNested(\$info);
285 8 50       25 last unless defined $val;
286 8         18 my $tagInfo = $$tagTablePtr{$tag};
287 8 100 66     38 if ($tagInfo and $$tagInfo{Format} and $$tagInfo{Format} eq 'date') {
      66        
288             # parse RTF date commands
289 1         7 my %idx = (yr=>0,mo=>1,dy=>2,hr=>3,min=>4,sec=>5);
290 1         4 my @t = (0) x 6;
291 1         7 while ($val =~ /\\([a-z]+)(\d+)/g) {
292 4 50       11 next unless defined $idx{$1};
293 4         15 $t[$idx{$1}] = $2;
294             }
295 1         10 $val = sprintf("%.4d:%.2d:%.2d %.2d:%.2d:%.2d", @t);
296             } else {
297             # unescape RTF string value
298 7         16 $val = UnescapeRTF($et, $val, $charset);
299             }
300             # create tagInfo for unknown tags
301 8 50       18 if (not $tagInfo) {
302 0         0 AddTagToTable($tagTablePtr, $tag, { Name => ucfirst($tag) });
303             }
304 8         24 $et->HandleTag($tagTablePtr, $tag, $val);
305             }
306             }
307 1 50       6 return 1 unless defined $buff;
308             #
309             # scan for \userprops (but don't read more from file to find the start of this command)
310             #
311 1         3 pos($buff) = 0;
312 1         12 while ($buff =~ /[^\\]\{[\n\r]*\\\*[\n\r]*\\userprops([^a-zA-Z])/g) {
313             # Note: The RTF spec places brackets around each propinfo structure,
314             # but Microsoft Word doesn't write it this way, so tolerate either.
315 1 50       6 pos($buff) = pos($buff) - 1 if $1 ne ' ';
316 1         5 my $props = ReadToNested(\$buff, $raf);
317 1         4 $tagTablePtr = Image::ExifTool::GetTagTable('Image::ExifTool::RTF::UserProps');
318 1 50       4 unless (defined $props) {
319 0         0 $et->Warn('Unterminated user properties');
320 0         0 last;
321             }
322             # process user properties
323 1         2 my $tag;
324 1         11 while ($props =~ /\{[\n\r]*(\\\*[\n\r]*)?\\([a-zA-Z]+)([^a-zA-Z])/g) {
325 4 50       11 pos($props) = pos($props) - 1 if $3 ne ' ';
326 4         9 my $t = $2;
327 4         8 my $val = ReadToNested(\$props);
328 4 50       9 last unless defined $val;
329 4         8 $val = UnescapeRTF($et, $val, $charset);
330 4 100 33     17 if ($t eq 'propname') {
    50          
331 2         5 $tag = $val;
332 2         10 next;
333             } elsif ($t ne 'staticval' or not defined $tag) {
334 0         0 next; # ignore \linkval and \proptype for now
335             }
336 2         5 $tag =~ s/\s(.)/\U$1/g; # capitalize all words in tag name
337 2         4 $tag =~ tr/-_a-zA-Z0-9//dc; # remove illegal characters
338 2 50       5 next unless $tag;
339             # create tagInfo for unknown tags
340 2 50       6 unless ($$tagTablePtr{$tag}) {
341 2         19 AddTagToTable($tagTablePtr, $tag, { Name => $tag });
342             }
343 2         5 $et->HandleTag($tagTablePtr, $tag, $val);
344             }
345 1         3 last; # (didn't really want to loop)
346             }
347 1         4 return 1;
348             }
349              
350             1; # end
351              
352             __END__