File Coverage

blib/lib/Text/Quoted.pm
Criterion Covered Total %
statement 71 72 98.6
branch 24 26 92.3
condition 31 34 91.1
subroutine 11 11 100.0
pod 3 3 100.0
total 140 146 95.8


line stmt bran cond sub pod time code
1             package Text::Quoted;
2             our $VERSION = "2.09";
3 7     7   125738 use 5.006;
  7         23  
  7         285  
4 7     7   39 use strict;
  7         11  
  7         269  
5 7     7   31 use warnings;
  7         13  
  7         525  
6              
7             require Exporter;
8              
9             our @ISA = qw(Exporter);
10             our @EXPORT = qw(extract);
11             our @EXPORT_OK = qw(set_quote_characters combine_hunks);
12              
13             # Provides either the 'Hang' package -- or, on since version 1.69 of
14             # Text::Autoformat, the 'Text::Autoformat::Hang' package.
15 7     7   4940 use Text::Autoformat();
  7         332418  
  7         489  
16             my $hang_package = Hang->can('new') ? "Hang" : "Text::Autoformat::Hang";
17              
18 7     7   73 use Text::Tabs();
  7         11  
  7         6812  
19              
20             =head1 NAME
21              
22             Text::Quoted - Extract the structure of a quoted mail message
23              
24             =head1 SYNOPSIS
25              
26             use Text::Quoted;
27             my $structure = extract($text);
28              
29             # Optionally, customize recognized quote characters:
30             Text::Quoted::set_quote_characters( qr/[:]/ );
31              
32             =head1 DESCRIPTION
33              
34             C examines the structure of some text which may contain
35             multiple different levels of quoting, and turns the text into a nested
36             data structure.
37              
38             The structure is an array reference containing hash references for each
39             paragraph belonging to the same author. Each level of quoting
40             recursively adds another list reference. So for instance, this:
41              
42             > foo
43             > # Bar
44             > baz
45              
46             quux
47              
48             turns into:
49              
50             [
51             [
52             { text => 'foo', quoter => '>', raw => '> foo' },
53             [
54             { text => 'Bar', quoter => '> #', raw => '> # Bar' }
55             ],
56             { text => 'baz', quoter => '>', raw => '> baz' }
57             ],
58              
59             { empty => 1 },
60             { text => 'quux', quoter => '', raw => 'quux' }
61             ];
62              
63             This also tells you about what's in the hash references: C is the
64             paragraph of text as it appeared in the original input; C is what
65             it looked like when we stripped off the quotation characters, and
66             C is the quotation string.
67              
68             =head1 FUNCTIONS
69              
70             =head2 extract
71              
72             Takes a single string argument which is the text to extract quote
73             structure from. Returns a nested datastructure as described above.
74              
75             Exported by default.
76              
77             =cut
78              
79             sub extract {
80 20     20 1 614 return _organize( "", _classify( @_ ) );
81             }
82              
83             sub _organize {
84 60     60   70 my $top_level = shift;
85 60         91 my @todo = @_;
86 60 50       106 $top_level = '' unless defined $top_level;
87              
88 60         64 my @ret;
89              
90             # Recursively form a data structure which reflects the quoting
91             # structure of the list.
92 60         116 while (my $line = shift @todo) {
93 147 100       240 my $q = defined $line->{quoter}? $line->{quoter}: '';
94 147 100       477 if ( $q eq $top_level ) {
    100          
95              
96             # Just append lines at "my" level.
97 101 100 66     364 push @ret, $line
98             if exists $line->{quoter}
99             or exists $line->{empty};
100             }
101             elsif ( $q =~ /^\Q$top_level\E./ ) {
102              
103             # Find all the lines at a quoting level "below" me.
104 40         77 my $newquoter = _find_below( $top_level, $line, @todo );
105 40         66 my @next = $line;
106 40   100     348 push @next, shift @todo while defined $todo[0]->{quoter}
107             and $todo[0]->{quoter} =~ /^\Q$newquoter/;
108              
109             # And pass them on to _organize()!
110 40         117 push @ret, _organize( $newquoter, @next );
111             }
112             }
113 60         281 return \@ret;
114             }
115              
116             # Given, say:
117             # X
118             # > > hello
119             # > foo bar
120             # Stuff
121             #
122             # After "X", we're moving to another level of quoting - but which one?
123             # Naively, you'd pick out the prefix of the next line, "> >", but this
124             # is incorrect - "> >" is actually a "sub-quote" of ">". This routine
125             # works out which is the next level below us.
126              
127             sub _find_below {
128 40     40   74 my ( $top_level, @stuff ) = @_;
129              
130             # Find the prefixes, shortest first; return the first one which is
131             # "below" where we are right now but is a proper subset of the next
132             # line.
133             return (
134 40   100     1180 sort { length $a <=> length $b }
  22         67  
135             grep $_ && /^\Q$top_level\E./ && $stuff[0]->{quoter} =~ /^\Q$_\E/,
136             map $_->{quoter},
137             @stuff
138             )[0];
139             }
140              
141             # BITS OF A TEXT LINE
142              
143             =head2 set_quote_characters
144              
145             Takes a regex (C) matching characters that should indicate a
146             quoted line. By default, a very liberal set is used:
147              
148             set_quote_characters(qr/[!#%=|:]/);
149              
150             The character C<< E >> is always recognized as a quoting character.
151              
152             If C is provided instead of a regex, only C<< E >> will
153             remain as a quote character.
154              
155             Not exported by default, but exportable.
156              
157             =cut
158              
159             my $separator = qr/[-_]{2,} | [=#*]{3,} | [+~]{4,}/x;
160             my ($quotechar, $quotechunk, $quoter);
161              
162             set_quote_characters(qr/[!#%=|:]/);
163              
164             sub set_quote_characters {
165 9     9 1 18 $quotechar = shift;
166 9 100       478 $quotechunk = $quotechar
167             ? qr/(?!$separator *\z)(?:$quotechar(?!\w)|\w*>+)/
168             : qr/(?!$separator *\z)\w*>+/;
169 9         503 $quoter = qr/$quotechunk(?:[ \t]*$quotechunk)*/;
170             }
171              
172             =head2 combine_hunks
173              
174             my $text = combine_hunks( $arrayref_of_hunks );
175              
176             Takes the output of C and turns it back into text.
177              
178             Not exported by default, but exportable.
179              
180             =cut
181              
182             sub combine_hunks {
183 13     13 1 12 my ($hunks) = @_;
184              
185             join "",
186 13 100       19 map {; ref $_ eq 'HASH' ? "$_->{raw}\n" : combine_hunks($_) } @$hunks;
  35         114  
187             }
188              
189             sub _classify {
190 20     20   37 my $text = shift;
191 20 100 100     141 return { raw => undef, text => undef, quoter => undef }
192             unless defined $text && length $text;
193             # If the user passes in a null string, we really want to end up with _something_
194              
195             # DETABIFY
196 18         121 my @lines = Text::Tabs::expand( split /\n/, $text );
197              
198             # PARSE EACH LINE
199 18         1800 foreach (splice @lines) {
200 126         266 my %line = ( raw => $_ );
201 126         1965 @line{'quoter', 'text'} = (/\A *($quoter?) *(.*?)\s*\Z/);
202 126         422 $line{hang} = $hang_package->new( $line{'text'} );
203 126 100 66     915 $line{empty} = 1 if $line{hang}->empty() && $line{'text'} !~ /\S/;
204 126 100       1086 $line{separator} = 1 if $line{text} =~ /\A *$separator *\Z/o;
205 126         253 push @lines, \%line;
206             }
207              
208             # SUBDIVIDE DOCUMENT INTO COHERENT SUBSECTIONS
209              
210 18         33 my @chunks;
211 18         41 push @chunks, [ shift @lines ];
212 18         30 foreach my $line (@lines) {
213 108 100 100     578 if ( $line->{separator}
      100        
      100        
214             || $line->{quoter} ne $chunks[-1][-1]->{quoter}
215             || $line->{empty}
216             || $chunks[-1][-1]->{empty} )
217             {
218 69         101 push @chunks, [$line];
219             }
220             else {
221 39         31 push @{ $chunks[-1] }, $line;
  39         64  
222             }
223             }
224              
225             # REDIVIDE INTO PARAGRAPHS
226              
227 18         21 my @paras;
228 18         29 foreach my $chunk (@chunks) {
229 87         76 my $first = 1;
230 87         56 my $firstfrom;
231 87         86 foreach my $line ( @{$chunk} ) {
  87         88  
232 126 100 66     374 if ( $first
      100        
233             || $line->{quoter} ne $paras[-1]->{quoter}
234             || $paras[-1]->{separator} )
235             {
236 92         77 push @paras, $line;
237 92         74 $first = 0;
238             # We get warnings from undefined raw and text values if we don't supply alternates
239 92   100     345 $firstfrom = length( $line->{raw} ||'' ) - length( $line->{text} || '');
      100        
240             }
241             else {
242 34         49 my $extraspace =
243             length( $line->{raw} ) - length( $line->{text} ) - $firstfrom;
244 34         77 $paras[-1]->{text} .= "\n" . q{ } x $extraspace . $line->{text};
245 34         128 $paras[-1]->{raw} .= "\n" . $line->{raw};
246             }
247             }
248             }
249              
250             # Reapply hangs
251 18         106 for (grep $_->{'hang'}, @paras) {
252 92 50       419 next unless my $str = (delete $_->{hang})->stringify;
253 0         0 $_->{text} = $str . " " . $_->{text};
254             }
255 18         179 return @paras;
256             }
257              
258             =head1 CREDITS
259              
260             Most of the heavy lifting is done by a modified version of Damian
261             Conway's C.
262              
263             =head1 AUTHOR
264              
265             Best Practical Solutions, LLC Emodules@bestpractical.comE
266              
267             =head1 LICENSE AND COPYRIGHT
268              
269             This software is Copyright (c) 2004-2015 by Best Practical Solutions, LLC
270              
271             This library is free software; you can redistribute it and/or modify it
272             under the same terms as Perl itself.
273              
274             =cut
275              
276             1;