File Coverage

blib/lib/Text/Quoted.pm
Criterion Covered Total %
statement 76 77 98.7
branch 24 26 92.3
condition 31 34 91.1
subroutine 11 11 100.0
pod 3 3 100.0
total 145 151 96.0


line stmt bran cond sub pod time code
1             package Text::Quoted;
2             our $VERSION = "2.08";
3 7     7   151505 use 5.006;
  7         22  
  7         253  
4 7     7   35 use strict;
  7         13  
  7         234  
5 7     7   31 use warnings;
  7         16  
  7         642  
6              
7             require Exporter;
8              
9             our @ISA = qw(Exporter);
10             our @EXPORT = qw(extract);
11             our @EXPORT_OK = qw(set_quote_characters combine_hunks);
12              
13 7     7   7849 use Text::Autoformat(); # Provides the Hang package, heh, heh.
  7         453247  
  7         9351  
14              
15             =head1 NAME
16              
17             Text::Quoted - Extract the structure of a quoted mail message
18              
19             =head1 SYNOPSIS
20              
21             use Text::Quoted;
22             Text::Quoted::set_quote_characters( qr/[:]/ ); # customize recognized quote characters
23             my $structure = extract($text);
24              
25             =head1 DESCRIPTION
26              
27             C examines the structure of some text which may contain
28             multiple different levels of quoting, and turns the text into a nested
29             data structure.
30              
31             The structure is an array reference containing hash references for each
32             paragraph belonging to the same author. Each level of quoting recursively
33             adds another list reference. So for instance, this:
34              
35             > foo
36             > # Bar
37             > baz
38              
39             quux
40              
41             turns into:
42              
43             [
44             [
45             { text => 'foo', quoter => '>', raw => '> foo' },
46             [
47             { text => 'Bar', quoter => '> #', raw => '> # Bar' }
48             ],
49             { text => 'baz', quoter => '>', raw => '> baz' }
50             ],
51              
52             { empty => 1 },
53             { text => 'quux', quoter => '', raw => 'quux' }
54             ];
55              
56             This also tells you about what's in the hash references: C is the
57             paragraph of text as it appeared in the original input; C is what
58             it looked like when we stripped off the quotation characters, and C
59             is the quotation string.
60              
61             =head1 FUNCTIONS
62              
63             =head2 extract
64              
65             Takes a single string argument which is the text to extract quote structure
66             from. Returns a nested datastructure as described above.
67              
68             Exported by default.
69              
70             =cut
71              
72             sub extract {
73 20     20 1 1026 return _organize( "", _classify( @_ ) );
74             }
75              
76             sub _organize {
77 60     60   81 my $top_level = shift;
78 60         102 my @todo = @_;
79 60 50       163 $top_level = '' unless defined $top_level;
80              
81 60         62 my @ret;
82              
83             # Recursively form a data structure which reflects the quoting
84             # structure of the list.
85 60         142 while (my $line = shift @todo) {
86 147 100       293 my $q = defined $line->{quoter}? $line->{quoter}: '';
87 147 100       533 if ( $q eq $top_level ) {
    100          
88              
89             # Just append lines at "my" level.
90 101 100 66     467 push @ret, $line
91             if exists $line->{quoter}
92             or exists $line->{empty};
93             }
94             elsif ( $q =~ /^\Q$top_level\E./ ) {
95              
96             # Find all the lines at a quoting level "below" me.
97 40         88 my $newquoter = _find_below( $top_level, $line, @todo );
98 40         111 my @next = $line;
99 40   100     407 push @next, shift @todo while defined $todo[0]->{quoter}
100             and $todo[0]->{quoter} =~ /^\Q$newquoter/;
101              
102             # Find the
103             # And pass them on to _organize()!
104             #print "Trying to organise the following lines over $newquoter:\n";
105             #print $_->{raw}."\n" for @next;
106             #print "!-!-!-\n";
107 40         121 push @ret, _organize( $newquoter, @next );
108             } # else { die "bugger! I had $top_level, but now I have $line->{raw}\n"; }
109             }
110 60         339 return \@ret;
111             }
112              
113             # Given, say:
114             # X
115             # > > hello
116             # > foo bar
117             # Stuff
118             #
119             # After "X", we're moving to another level of quoting - but which one?
120             # Naively, you'd pick out the prefix of the next line, "> >", but this
121             # is incorrect - "> >" is actually a "sub-quote" of ">". This routine
122             # works out which is the next level below us.
123              
124             sub _find_below {
125 40     40   85 my ( $top_level, @stuff ) = @_;
126              
127             # Find the prefices, shortest first.
128             # And return the first one which is "below" where we are right
129             # now but is a proper subset of the next line.
130             return (
131 40   100     1506 sort { length $a <=> length $b }
  22         72  
132             grep $_ && /^\Q$top_level\E./ && $stuff[0]->{quoter} =~ /^\Q$_\E/,
133             map $_->{quoter},
134             @stuff
135             )[0];
136             }
137              
138             # BITS OF A TEXT LINE
139              
140             =head2 set_quote_characters
141              
142             Takes a regex (C) matching characters that should indicate a quoted line.
143             By default, a very liberal set is used:
144              
145             set_quote_characters(qr/[!#%=|:]/);
146              
147             The character C<< E >> is always recognized as a quoting character.
148              
149             If C is provided instead of a regex, only C<< E >> will remain as a
150             quote character.
151              
152             Not exported by default, but exportable.
153              
154             =cut
155              
156             my $separator = qr/[-_]{2,} | [=#*]{3,} | [+~]{4,}/x;
157             my ($quotechar, $quotechunk, $quoter);
158              
159             set_quote_characters(qr/[!#%=|:]/);
160              
161             sub set_quote_characters {
162 9     9 1 17 $quotechar = shift;
163 9 100       571 $quotechunk = $quotechar
164             ? qr/(?!$separator *\z)(?:$quotechar(?!\w)|\w*>+)/
165             : qr/(?!$separator *\z)\w*>+/;
166 9         687 $quoter = qr/$quotechunk(?:[ \t]*$quotechunk)*/;
167             }
168              
169             =head2 combine_hunks
170              
171             my $text = combine_hunks( $arrayref_of_hunks );
172              
173             Takes the output of C and turns it back into text.
174              
175             Not exported by default, but exportable.
176              
177             =cut
178              
179             sub combine_hunks {
180 13     13 1 16 my ($hunks) = @_;
181              
182             join "",
183 13 100       18 map {; ref $_ eq 'HASH' ? "$_->{raw}\n" : combine_hunks($_) } @$hunks;
  35         152  
184             }
185              
186             sub _classify {
187 20     20   37 my $text = shift;
188 20 100 100     155 return { raw => undef, text => undef, quoter => undef }
189             unless defined $text && length $text;
190             # If the user passes in a null string, we really want to end up with _something_
191              
192             # DETABIFY
193 18         116 my @lines = _expand_tabs( split /\n/, $text );
194              
195             # PARSE EACH LINE
196 18         62 foreach (splice @lines) {
197 126         280 my %line = ( raw => $_ );
198 126         2579 @line{'quoter', 'text'} = (/\A *($quoter?) *(.*?)\s*\Z/);
199 126         478 $line{hang} = Hang->new( $line{'text'} );
200 126 100 66     998 $line{empty} = 1 if $line{hang}->empty() && $line{'text'} !~ /\S/;
201 126 100       1257 $line{separator} = 1 if $line{text} =~ /\A *$separator *\Z/o;
202 126         294 push @lines, \%line;
203             }
204              
205             # SUBDIVIDE DOCUMENT INTO COHERENT SUBSECTIONS
206              
207 18         40 my @chunks;
208 18         42 push @chunks, [ shift @lines ];
209 18         38 foreach my $line (@lines) {
210 108 100 100     715 if ( $line->{separator}
      100        
      100        
211             || $line->{quoter} ne $chunks[-1][-1]->{quoter}
212             || $line->{empty}
213             || $chunks[-1][-1]->{empty} )
214             {
215 69         158 push @chunks, [$line];
216             }
217             else {
218 39         35 push @{ $chunks[-1] }, $line;
  39         110  
219             }
220             }
221              
222             # REDIVIDE INTO PARAGRAPHS
223              
224 18         31 my @paras;
225 18         32 foreach my $chunk (@chunks) {
226 87         98 my $first = 1;
227 87         81 my $firstfrom;
228 87         81 foreach my $line ( @{$chunk} ) {
  87         117  
229 126 100 66     472 if ( $first
      100        
230             || $line->{quoter} ne $paras[-1]->{quoter}
231             || $paras[-1]->{separator} )
232             {
233 92         104 push @paras, $line;
234 92         92 $first = 0;
235             # We get warnings from undefined raw and text values if we don't supply alternates
236 92   100     625 $firstfrom = length( $line->{raw} ||'' ) - length( $line->{text} || '');
      100        
237             }
238             else {
239 34         106 my $extraspace =
240             length( $line->{raw} ) - length( $line->{text} ) - $firstfrom;
241 34         90 $paras[-1]->{text} .= "\n" . q{ } x $extraspace . $line->{text};
242 34         124 $paras[-1]->{raw} .= "\n" . $line->{raw};
243             }
244             }
245             }
246              
247             # Reapply hangs
248 18         142 for (grep $_->{'hang'}, @paras) {
249 92 50       568 next unless my $str = (delete $_->{hang})->stringify;
250 0         0 $_->{text} = $str . " " . $_->{text};
251             }
252 18         200 return @paras;
253             }
254              
255             # we don't use Text::Tabs anymore as it may segfault on perl 5.8.x with
256             # UTF-8 strings and tabs mixed. http://rt.perl.org/rt3/Public/Bug/Display.html?id=40989
257             # This bug unlikely to be fixed in 5.8.x, however we use workaround.
258             # As soon as Text::Tabs will be fixed we can return back to it
259             my $tabstop = 8;
260             sub _expand_tabs {
261 18     18   27 my $pad;
262 18         41 for ( @_ ) {
263 126         117 my $offs = 0;
264 126         177 s{(.*?)\t}{
265 1         4 $pad = $tabstop - (length($1) + $offs) % $tabstop;
266 1         2 $offs += length($1)+$pad;
267 1         8 $1 . (" " x $pad);
268             }eg;
269             }
270 18         79 return @_;
271             }
272              
273             =head1 CREDITS
274              
275             Most of the heavy lifting is done by a modified version of Damian Conway's
276             C.
277              
278             =head1 COPYRIGHT
279              
280             Copyright (C) 2002-2003 Kasei Limited
281             Copyright (C) 2003-2004 Simon Cozens
282             Copyright (C) 2004-2013 Best Practical Solutions, LLC
283              
284             This software is distributed WITHOUT ANY WARRANTY; without even the implied
285             warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
286              
287             This library is free software; you can redistribute it and/or modify
288             it under the same terms as Perl itself.
289              
290             =cut
291              
292             1;