File Coverage

blib/lib/Text/Quoted.pm
Criterion Covered Total %
statement 72 73 98.6
branch 25 28 89.2
condition 36 39 92.3
subroutine 11 11 100.0
pod 3 3 100.0
total 147 154 95.4


line stmt bran cond sub pod time code
1             package Text::Quoted;
2             our $VERSION = "2.10";
3 7     7   301501 use 5.006;
  7         53  
4 7     7   25 use strict;
  7         9  
  7         125  
5 7     7   29 use warnings;
  7         22  
  7         420  
6              
7             require Exporter;
8              
9             our @ISA = qw(Exporter);
10             our @EXPORT = qw(extract);
11             our @EXPORT_OK = qw(set_quote_characters combine_hunks);
12              
13             # Provides either the 'Hang' package -- or, on since version 1.69 of
14             # Text::Autoformat, the 'Text::Autoformat::Hang' package.
15 7     7   3310 use Text::Autoformat();
  7         215138  
  7         286  
16             my $hang_package = Hang->can('new') ? "Hang" : "Text::Autoformat::Hang";
17              
18 7     7   45 use Text::Tabs();
  7         8  
  7         4614  
19              
20             =head1 NAME
21              
22             Text::Quoted - Extract the structure of a quoted mail message
23              
24             =head1 SYNOPSIS
25              
26             use Text::Quoted;
27             my $structure = extract($text);
28              
29             # Optionally, customize recognized quote characters:
30             Text::Quoted::set_quote_characters( qr/[:]/ );
31              
32             =head1 DESCRIPTION
33              
34             C examines the structure of some text which may contain
35             multiple different levels of quoting, and turns the text into a nested
36             data structure.
37              
38             The structure is an array reference containing hash references for each
39             paragraph belonging to the same author. Each level of quoting
40             recursively adds another list reference. So for instance, this:
41              
42             > foo
43             > # Bar
44             > baz
45              
46             quux
47              
48             turns into:
49              
50             [
51             [
52             { text => 'foo', quoter => '>', raw => '> foo' },
53             [
54             { text => 'Bar', quoter => '> #', raw => '> # Bar' }
55             ],
56             { text => 'baz', quoter => '>', raw => '> baz' }
57             ],
58              
59             { empty => 1 },
60             { text => 'quux', quoter => '', raw => 'quux' }
61             ];
62              
63             This also tells you about what's in the hash references: C is the
64             paragraph of text as it appeared in the original input; C is what
65             it looked like when we stripped off the quotation characters, and
66             C is the quotation string.
67              
68             =head1 FUNCTIONS
69              
70             =head2 extract
71              
72             my $struct = extract($text, \%arg);
73              
74             Takes a single string argument which is the text to extract quote
75             structure from. Returns a nested datastructure as described above.
76              
77             Second argument is optional: a hashref of options. The only valid
78             argument at present is:
79              
80             no_separators - never mark paragraphs as "separators"
81              
82             Exported by default.
83              
84             =cut
85              
86             sub extract {
87 22     22 1 919 return _organize( "", _classify( @_ ) );
88             }
89              
90             sub _organize {
91 63     63   72 my $top_level = shift;
92 63         94 my @todo = @_;
93 63 50       90 $top_level = '' unless defined $top_level;
94              
95 63         68 my @ret;
96              
97             # Recursively form a data structure which reflects the quoting
98             # structure of the list.
99 63         102 while (my $line = shift @todo) {
100 152 100       229 my $q = defined $line->{quoter}? $line->{quoter}: '';
101 152 100       369 if ( $q eq $top_level ) {
    100          
102              
103             # Just append lines at "my" level.
104             push @ret, $line
105             if exists $line->{quoter}
106 105 100 66     250 or exists $line->{empty};
107             }
108             elsif ( $q =~ /^\Q$top_level\E./ ) {
109              
110             # Find all the lines at a quoting level "below" me.
111 41         69 my $newquoter = _find_below( $top_level, $line, @todo );
112 41         72 my @next = $line;
113             push @next, shift @todo while defined $todo[0]->{quoter}
114 41   100     252 and $todo[0]->{quoter} =~ /^\Q$newquoter/;
115              
116             # And pass them on to _organize()!
117 41         85 push @ret, _organize( $newquoter, @next );
118             }
119             }
120 63         210 return \@ret;
121             }
122              
123             # Given, say:
124             # X
125             # > > hello
126             # > foo bar
127             # Stuff
128             #
129             # After "X", we're moving to another level of quoting - but which one?
130             # Naively, you'd pick out the prefix of the next line, "> >", but this
131             # is incorrect - "> >" is actually a "sub-quote" of ">". This routine
132             # works out which is the next level below us.
133              
134             sub _find_below {
135 41     41   66 my ( $top_level, @stuff ) = @_;
136              
137             # Find the prefixes, shortest first; return the first one which is
138             # "below" where we are right now but is a proper subset of the next
139             # line.
140             return (
141 22         59 sort { length $a <=> length $b }
142             grep $_ && /^\Q$top_level\E./ && $stuff[0]->{quoter} =~ /^\Q$_\E/,
143             map $_->{quoter},
144             @stuff
145 41   100     809 )[0];
146             }
147              
148             # BITS OF A TEXT LINE
149              
150             =head2 set_quote_characters
151              
152             Takes a regex (C) matching characters that should indicate a
153             quoted line. By default, a very liberal set is used:
154              
155             set_quote_characters(qr/[!#%=|:]/);
156              
157             The character C<< E >> is always recognized as a quoting character.
158              
159             If C is provided instead of a regex, only C<< E >> will
160             remain as a quote character.
161              
162             Not exported by default, but exportable.
163              
164             =cut
165              
166             my $separator = qr/[-_]{2,} | [=#*]{3,} | [+~]{4,}/x;
167             my ($quotechar, $quotechunk, $quoter);
168              
169             set_quote_characters(qr/[!#%=|:]/);
170              
171             sub set_quote_characters {
172 9     9 1 16 $quotechar = shift;
173 9 100       442 $quotechunk = $quotechar
174             ? qr/(?!$separator *\z)(?:$quotechar(?!\w)|\w*>+)/
175             : qr/(?!$separator *\z)\w*>+/;
176 9         430 $quoter = qr/$quotechunk(?:[ \t]*$quotechunk)*/;
177             }
178              
179             =head2 combine_hunks
180              
181             my $text = combine_hunks( $arrayref_of_hunks );
182              
183             Takes the output of C and turns it back into text.
184              
185             Not exported by default, but exportable.
186              
187             =cut
188              
189             sub combine_hunks {
190 13     13 1 15 my ($hunks) = @_;
191              
192             join "",
193 13 100       16 map {; ref $_ eq 'HASH' ? "$_->{raw}\n" : combine_hunks($_) } @$hunks;
  35         93  
194             }
195              
196             sub _classify {
197 22     22   40 my ($text, $arg) = @_;
198 22   100     104 $arg ||= {};
199              
200             # If the user passes in a null string, we really want to end up with
201             # _something_
202 22 100 100     75 return { raw => undef, text => undef, quoter => undef }
203             unless defined $text && length $text;
204              
205             # DETABIFY
206 20         104 my @lines = Text::Tabs::expand( split /\n/, $text );
207              
208             # PARSE EACH LINE
209 20         855 foreach (splice @lines) {
210 136         247 my %line = ( raw => $_ );
211 136         1666 @line{'quoter', 'text'} = (/\A *($quoter?) *(.*?)\s*\Z/);
212 136         369 $line{hang} = $hang_package->new( $line{'text'} );
213 136 100 66     925 $line{empty} = 1 if $line{hang}->empty() && $line{'text'} !~ /\S/;
214             $line{separator} = 1 if $line{text} =~ /\A *$separator *\Z/o
215 136 100 100     954 and ! $arg->{no_separators};
216 136         246 push @lines, \%line;
217             }
218              
219             # SUBDIVIDE DOCUMENT INTO COHERENT SUBSECTIONS
220              
221 20         32 my @chunks;
222 20         36 push @chunks, [ shift @lines ];
223 20         29 foreach my $line (@lines) {
224 116 100 100     378 if ( $line->{separator}
      100        
      100        
225             || $line->{quoter} ne $chunks[-1][-1]->{quoter}
226             || $line->{empty}
227             || $chunks[-1][-1]->{empty} )
228             {
229 70         98 push @chunks, [$line];
230             }
231             else {
232 46         46 push @{ $chunks[-1] }, $line;
  46         63  
233             }
234             }
235              
236             # REDIVIDE INTO PARAGRAPHS
237              
238 20         33 my @paras;
239 20         34 foreach my $chunk (@chunks) {
240 90         94 my $first = 1;
241 90         80 my $firstfrom;
242 90         72 foreach my $line ( @{$chunk} ) {
  90         97  
243 136 100 66     277 if ( $first
      100        
244             || $line->{quoter} ne $paras[-1]->{quoter}
245             || $paras[-1]->{separator} )
246             {
247 95         105 push @paras, $line;
248 95         93 $first = 0;
249             # We get warnings from undefined raw and text values if we don't supply alternates
250 95   100     211 $firstfrom = length( $line->{raw} ||'' ) - length( $line->{text} || '');
      100        
251             }
252             else {
253             my $extraspace =
254 41         54 length( $line->{raw} ) - length( $line->{text} ) - $firstfrom;
255 41 50       58 $extraspace = 0 if $extraspace < 0;
256 41         73 $paras[-1]->{text} .= "\n" . q{ } x $extraspace . $line->{text};
257 41         92 $paras[-1]->{raw} .= "\n" . $line->{raw};
258             }
259             }
260             }
261              
262             # Reapply hangs
263 20         84 for (grep $_->{'hang'}, @paras) {
264 95 50       320 next unless my $str = (delete $_->{hang})->stringify;
265 0         0 $_->{text} = $str . " " . $_->{text};
266             }
267 20         129 return @paras;
268             }
269              
270             =head1 CREDITS
271              
272             Most of the heavy lifting is done by a modified version of Damian
273             Conway's C.
274              
275             =head1 AUTHOR
276              
277             Best Practical Solutions, LLC Emodules@bestpractical.comE
278              
279             =head1 LICENSE AND COPYRIGHT
280              
281             This software is Copyright (c) 2004-2015 by Best Practical Solutions, LLC
282              
283             This library is free software; you can redistribute it and/or modify it
284             under the same terms as Perl itself.
285              
286             =cut
287              
288             1;