File Coverage

blib/lib/Text/Quoted.pm

Criterion	Covered	Total	%
statement	72	73	98.6
branch	25	28	89.2
condition	36	39	92.3
subroutine	11	11	100.0
pod	3	3	100.0
total	147	154	95.4

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Quoted;
2							our $VERSION = "2.10";
3	7			7		301501	use 5.006;
	7					53
4	7			7		25	use strict;
	7					9
	7					125
5	7			7		29	use warnings;
	7					22
	7					420
6
7							require Exporter;
8
9							our @ISA = qw(Exporter);
10							our @EXPORT = qw(extract);
11							our @EXPORT_OK = qw(set_quote_characters combine_hunks);
12
13							# Provides either the 'Hang' package -- or, on since version 1.69 of
14							# Text::Autoformat, the 'Text::Autoformat::Hang' package.
15	7			7		3310	use Text::Autoformat();
	7					215138
	7					286
16							my $hang_package = Hang->can('new') ? "Hang" : "Text::Autoformat::Hang";
17
18	7			7		45	use Text::Tabs();
	7					8
	7					4614
19
20							=head1 NAME
21
22							Text::Quoted - Extract the structure of a quoted mail message
23
24							=head1 SYNOPSIS
25
26							use Text::Quoted;
27							my $structure = extract($text);
28
29							# Optionally, customize recognized quote characters:
30							Text::Quoted::set_quote_characters( qr/[:]/ );
31
32							=head1 DESCRIPTION
33
34							C examines the structure of some text which may contain
35							multiple different levels of quoting, and turns the text into a nested
36							data structure.
37
38							The structure is an array reference containing hash references for each
39							paragraph belonging to the same author. Each level of quoting
40							recursively adds another list reference. So for instance, this:
41
42							> foo
43							> # Bar
44							> baz
45
46							quux
47
48							turns into:
49
50							[
51							[
52							{ text => 'foo', quoter => '>', raw => '> foo' },
53							[
54							{ text => 'Bar', quoter => '> #', raw => '> # Bar' }
55							],
56							{ text => 'baz', quoter => '>', raw => '> baz' }
57							],
58
59							{ empty => 1 },
60							{ text => 'quux', quoter => '', raw => 'quux' }
61							];
62
63							This also tells you about what's in the hash references: C is the
64							paragraph of text as it appeared in the original input; C is what
65							it looked like when we stripped off the quotation characters, and
66							C is the quotation string.
67
68							=head1 FUNCTIONS
69
70							=head2 extract
71
72							my $struct = extract($text, \%arg);
73
74							Takes a single string argument which is the text to extract quote
75							structure from. Returns a nested datastructure as described above.
76
77							Second argument is optional: a hashref of options. The only valid
78							argument at present is:
79
80							no_separators - never mark paragraphs as "separators"
81
82							Exported by default.
83
84							=cut
85
86							sub extract {
87	22			22	1	919	return _organize( "", _classify( @_ ) );
88							}
89
90							sub _organize {
91	63			63		72	my $top_level = shift;
92	63					94	my @todo = @_;
93	63	50				90	$top_level = '' unless defined $top_level;
94
95	63					68	my @ret;
96
97							# Recursively form a data structure which reflects the quoting
98							# structure of the list.
99	63					102	while (my $line = shift @todo) {
100	152	100				229	my $q = defined $line->{quoter}? $line->{quoter}: '';
101	152	100				369	if ( $q eq $top_level ) {
		100
102
103							# Just append lines at "my" level.
104							push @ret, $line
105							if exists $line->{quoter}
106	105	100	66			250	or exists $line->{empty};
107							}
108							elsif ( $q =~ /^\Q$top_level\E./ ) {
109
110							# Find all the lines at a quoting level "below" me.
111	41					69	my $newquoter = _find_below( $top_level, $line, @todo );
112	41					72	my @next = $line;
113							push @next, shift @todo while defined $todo[0]->{quoter}
114	41		100			252	and $todo[0]->{quoter} =~ /^\Q$newquoter/;
115
116							# And pass them on to _organize()!
117	41					85	push @ret, _organize( $newquoter, @next );
118							}
119							}
120	63					210	return \@ret;
121							}
122
123							# Given, say:
124							# X
125							# > > hello
126							# > foo bar
127							# Stuff
128							#
129							# After "X", we're moving to another level of quoting - but which one?
130							# Naively, you'd pick out the prefix of the next line, "> >", but this
131							# is incorrect - "> >" is actually a "sub-quote" of ">". This routine
132							# works out which is the next level below us.
133
134							sub _find_below {
135	41			41		66	my ( $top_level, @stuff ) = @_;
136
137							# Find the prefixes, shortest first; return the first one which is
138							# "below" where we are right now but is a proper subset of the next
139							# line.
140							return (
141	22					59	sort { length $a <=> length $b }
142							grep $_ && /^\Q$top_level\E./ && $stuff[0]->{quoter} =~ /^\Q$_\E/,
143							map $_->{quoter},
144							@stuff
145	41		100			809	)[0];
146							}
147
148							# BITS OF A TEXT LINE
149
150							=head2 set_quote_characters
151
152							Takes a regex (C) matching characters that should indicate a
153							quoted line. By default, a very liberal set is used:
154
155							set_quote_characters(qr/[!#%=\|:]/);
156
157							The character C<< E >> is always recognized as a quoting character.
158
159							If C is provided instead of a regex, only C<< E >> will
160							remain as a quote character.
161
162							Not exported by default, but exportable.
163
164							=cut
165
166							my $separator = qr/[-_]{2,} \| [=#*]{3,} \| [+~]{4,}/x;
167							my ($quotechar, $quotechunk, $quoter);
168
169							set_quote_characters(qr/[!#%=\|:]/);
170
171							sub set_quote_characters {
172	9			9	1	16	$quotechar = shift;
173	9	100				442	$quotechunk = $quotechar
174							? qr/(?!$separator \z)(?:$quotechar(?!\w)\|\w>+)/
175							: qr/(?!$separator \z)\w>+/;
176	9					430	$quoter = qr/$quotechunk(?:[ \t]$quotechunk)/;
177							}
178
179							=head2 combine_hunks
180
181							my $text = combine_hunks( $arrayref_of_hunks );
182
183							Takes the output of C and turns it back into text.
184
185							Not exported by default, but exportable.
186
187							=cut
188
189							sub combine_hunks {
190	13			13	1	15	my ($hunks) = @_;
191
192							join "",
193	13	100				16	map {; ref $_ eq 'HASH' ? "$_->{raw}\n" : combine_hunks($_) } @$hunks;
	35					93
194							}
195
196							sub _classify {
197	22			22		40	my ($text, $arg) = @_;
198	22		100			104	$arg \|\|= {};
199
200							# If the user passes in a null string, we really want to end up with
201							# _something_
202	22	100	100			75	return { raw => undef, text => undef, quoter => undef }
203							unless defined $text && length $text;
204
205							# DETABIFY
206	20					104	my @lines = Text::Tabs::expand( split /\n/, $text );
207
208							# PARSE EACH LINE
209	20					855	foreach (splice @lines) {
210	136					247	my %line = ( raw => $_ );
211	136					1666	@line{'quoter', 'text'} = (/\A ($quoter?) (.?)\s\Z/);
212	136					369	$line{hang} = $hang_package->new( $line{'text'} );
213	136	100	66			925	$line{empty} = 1 if $line{hang}->empty() && $line{'text'} !~ /\S/;
214							$line{separator} = 1 if $line{text} =~ /\A $separator \Z/o
215	136	100	100			954	and ! $arg->{no_separators};
216	136					246	push @lines, \%line;
217							}
218
219							# SUBDIVIDE DOCUMENT INTO COHERENT SUBSECTIONS
220
221	20					32	my @chunks;
222	20					36	push @chunks, [ shift @lines ];
223	20					29	foreach my $line (@lines) {
224	116	100	100			378	if ( $line->{separator}
			100
			100
225							\|\| $line->{quoter} ne $chunks[-1][-1]->{quoter}
226							\|\| $line->{empty}
227							\|\| $chunks[-1][-1]->{empty} )
228							{
229	70					98	push @chunks, [$line];
230							}
231							else {
232	46					46	push @{ $chunks[-1] }, $line;
	46					63
233							}
234							}
235
236							# REDIVIDE INTO PARAGRAPHS
237
238	20					33	my @paras;
239	20					34	foreach my $chunk (@chunks) {
240	90					94	my $first = 1;
241	90					80	my $firstfrom;
242	90					72	foreach my $line ( @{$chunk} ) {
	90					97
243	136	100	66			277	if ( $first
			100
244							\|\| $line->{quoter} ne $paras[-1]->{quoter}
245							\|\| $paras[-1]->{separator} )
246							{
247	95					105	push @paras, $line;
248	95					93	$first = 0;
249							# We get warnings from undefined raw and text values if we don't supply alternates
250	95		100			211	$firstfrom = length( $line->{raw} \|\|'' ) - length( $line->{text} \|\| '');
			100
251							}
252							else {
253							my $extraspace =
254	41					54	length( $line->{raw} ) - length( $line->{text} ) - $firstfrom;
255	41	50				58	$extraspace = 0 if $extraspace < 0;
256	41					73	$paras[-1]->{text} .= "\n" . q{ } x $extraspace . $line->{text};
257	41					92	$paras[-1]->{raw} .= "\n" . $line->{raw};
258							}
259							}
260							}
261
262							# Reapply hangs
263	20					84	for (grep $_->{'hang'}, @paras) {
264	95	50				320	next unless my $str = (delete $_->{hang})->stringify;
265	0					0	$_->{text} = $str . " " . $_->{text};
266							}
267	20					129	return @paras;
268							}
269
270							=head1 CREDITS
271
272							Most of the heavy lifting is done by a modified version of Damian
273							Conway's C.
274
275							=head1 AUTHOR
276
277							Best Practical Solutions, LLC Emodules@bestpractical.comE
278
279							=head1 LICENSE AND COPYRIGHT
280
281							This software is Copyright (c) 2004-2015 by Best Practical Solutions, LLC
282
283							This library is free software; you can redistribute it and/or modify it
284							under the same terms as Perl itself.
285
286							=cut
287
288							1;