File Coverage

blib/lib/Text/Quoted.pm

Criterion	Covered	Total	%
statement	76	77	98.7
branch	24	26	92.3
condition	31	34	91.1
subroutine	11	11	100.0
pod	3	3	100.0
total	145	151	96.0

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Quoted;
2							our $VERSION = "2.08";
3	7			7		151505	use 5.006;
	7					22
	7					253
4	7			7		35	use strict;
	7					13
	7					234
5	7			7		31	use warnings;
	7					16
	7					642
6
7							require Exporter;
8
9							our @ISA = qw(Exporter);
10							our @EXPORT = qw(extract);
11							our @EXPORT_OK = qw(set_quote_characters combine_hunks);
12
13	7			7		7849	use Text::Autoformat(); # Provides the Hang package, heh, heh.
	7					453247
	7					9351
14
15							=head1 NAME
16
17							Text::Quoted - Extract the structure of a quoted mail message
18
19							=head1 SYNOPSIS
20
21							use Text::Quoted;
22							Text::Quoted::set_quote_characters( qr/[:]/ ); # customize recognized quote characters
23							my $structure = extract($text);
24
25							=head1 DESCRIPTION
26
27							C examines the structure of some text which may contain
28							multiple different levels of quoting, and turns the text into a nested
29							data structure.
30
31							The structure is an array reference containing hash references for each
32							paragraph belonging to the same author. Each level of quoting recursively
33							adds another list reference. So for instance, this:
34
35							> foo
36							> # Bar
37							> baz
38
39							quux
40
41							turns into:
42
43							[
44							[
45							{ text => 'foo', quoter => '>', raw => '> foo' },
46							[
47							{ text => 'Bar', quoter => '> #', raw => '> # Bar' }
48							],
49							{ text => 'baz', quoter => '>', raw => '> baz' }
50							],
51
52							{ empty => 1 },
53							{ text => 'quux', quoter => '', raw => 'quux' }
54							];
55
56							This also tells you about what's in the hash references: C is the
57							paragraph of text as it appeared in the original input; C is what
58							it looked like when we stripped off the quotation characters, and C
59							is the quotation string.
60
61							=head1 FUNCTIONS
62
63							=head2 extract
64
65							Takes a single string argument which is the text to extract quote structure
66							from. Returns a nested datastructure as described above.
67
68							Exported by default.
69
70							=cut
71
72							sub extract {
73	20			20	1	1026	return _organize( "", _classify( @_ ) );
74							}
75
76							sub _organize {
77	60			60		81	my $top_level = shift;
78	60					102	my @todo = @_;
79	60	50				163	$top_level = '' unless defined $top_level;
80
81	60					62	my @ret;
82
83							# Recursively form a data structure which reflects the quoting
84							# structure of the list.
85	60					142	while (my $line = shift @todo) {
86	147	100				293	my $q = defined $line->{quoter}? $line->{quoter}: '';
87	147	100				533	if ( $q eq $top_level ) {
		100
88
89							# Just append lines at "my" level.
90	101	100	66			467	push @ret, $line
91							if exists $line->{quoter}
92							or exists $line->{empty};
93							}
94							elsif ( $q =~ /^\Q$top_level\E./ ) {
95
96							# Find all the lines at a quoting level "below" me.
97	40					88	my $newquoter = _find_below( $top_level, $line, @todo );
98	40					111	my @next = $line;
99	40		100			407	push @next, shift @todo while defined $todo[0]->{quoter}
100							and $todo[0]->{quoter} =~ /^\Q$newquoter/;
101
102							# Find the
103							# And pass them on to _organize()!
104							#print "Trying to organise the following lines over $newquoter:\n";
105							#print $_->{raw}."\n" for @next;
106							#print "!-!-!-\n";
107	40					121	push @ret, _organize( $newquoter, @next );
108							} # else { die "bugger! I had $top_level, but now I have $line->{raw}\n"; }
109							}
110	60					339	return \@ret;
111							}
112
113							# Given, say:
114							# X
115							# > > hello
116							# > foo bar
117							# Stuff
118							#
119							# After "X", we're moving to another level of quoting - but which one?
120							# Naively, you'd pick out the prefix of the next line, "> >", but this
121							# is incorrect - "> >" is actually a "sub-quote" of ">". This routine
122							# works out which is the next level below us.
123
124							sub _find_below {
125	40			40		85	my ( $top_level, @stuff ) = @_;
126
127							# Find the prefices, shortest first.
128							# And return the first one which is "below" where we are right
129							# now but is a proper subset of the next line.
130							return (
131	40		100			1506	sort { length $a <=> length $b }
	22					72
132							grep $_ && /^\Q$top_level\E./ && $stuff[0]->{quoter} =~ /^\Q$_\E/,
133							map $_->{quoter},
134							@stuff
135							)[0];
136							}
137
138							# BITS OF A TEXT LINE
139
140							=head2 set_quote_characters
141
142							Takes a regex (C) matching characters that should indicate a quoted line.
143							By default, a very liberal set is used:
144
145							set_quote_characters(qr/[!#%=\|:]/);
146
147							The character C<< E >> is always recognized as a quoting character.
148
149							If C is provided instead of a regex, only C<< E >> will remain as a
150							quote character.
151
152							Not exported by default, but exportable.
153
154							=cut
155
156							my $separator = qr/[-_]{2,} \| [=#*]{3,} \| [+~]{4,}/x;
157							my ($quotechar, $quotechunk, $quoter);
158
159							set_quote_characters(qr/[!#%=\|:]/);
160
161							sub set_quote_characters {
162	9			9	1	17	$quotechar = shift;
163	9	100				571	$quotechunk = $quotechar
164							? qr/(?!$separator \z)(?:$quotechar(?!\w)\|\w>+)/
165							: qr/(?!$separator \z)\w>+/;
166	9					687	$quoter = qr/$quotechunk(?:[ \t]$quotechunk)/;
167							}
168
169							=head2 combine_hunks
170
171							my $text = combine_hunks( $arrayref_of_hunks );
172
173							Takes the output of C and turns it back into text.
174
175							Not exported by default, but exportable.
176
177							=cut
178
179							sub combine_hunks {
180	13			13	1	16	my ($hunks) = @_;
181
182							join "",
183	13	100				18	map {; ref $_ eq 'HASH' ? "$_->{raw}\n" : combine_hunks($_) } @$hunks;
	35					152
184							}
185
186							sub _classify {
187	20			20		37	my $text = shift;
188	20	100	100			155	return { raw => undef, text => undef, quoter => undef }
189							unless defined $text && length $text;
190							# If the user passes in a null string, we really want to end up with _something_
191
192							# DETABIFY
193	18					116	my @lines = _expand_tabs( split /\n/, $text );
194
195							# PARSE EACH LINE
196	18					62	foreach (splice @lines) {
197	126					280	my %line = ( raw => $_ );
198	126					2579	@line{'quoter', 'text'} = (/\A ($quoter?) (.?)\s\Z/);
199	126					478	$line{hang} = Hang->new( $line{'text'} );
200	126	100	66			998	$line{empty} = 1 if $line{hang}->empty() && $line{'text'} !~ /\S/;
201	126	100				1257	$line{separator} = 1 if $line{text} =~ /\A $separator \Z/o;
202	126					294	push @lines, \%line;
203							}
204
205							# SUBDIVIDE DOCUMENT INTO COHERENT SUBSECTIONS
206
207	18					40	my @chunks;
208	18					42	push @chunks, [ shift @lines ];
209	18					38	foreach my $line (@lines) {
210	108	100	100			715	if ( $line->{separator}
			100
			100
211							\|\| $line->{quoter} ne $chunks[-1][-1]->{quoter}
212							\|\| $line->{empty}
213							\|\| $chunks[-1][-1]->{empty} )
214							{
215	69					158	push @chunks, [$line];
216							}
217							else {
218	39					35	push @{ $chunks[-1] }, $line;
	39					110
219							}
220							}
221
222							# REDIVIDE INTO PARAGRAPHS
223
224	18					31	my @paras;
225	18					32	foreach my $chunk (@chunks) {
226	87					98	my $first = 1;
227	87					81	my $firstfrom;
228	87					81	foreach my $line ( @{$chunk} ) {
	87					117
229	126	100	66			472	if ( $first
			100
230							\|\| $line->{quoter} ne $paras[-1]->{quoter}
231							\|\| $paras[-1]->{separator} )
232							{
233	92					104	push @paras, $line;
234	92					92	$first = 0;
235							# We get warnings from undefined raw and text values if we don't supply alternates
236	92		100			625	$firstfrom = length( $line->{raw} \|\|'' ) - length( $line->{text} \|\| '');
			100
237							}
238							else {
239	34					106	my $extraspace =
240							length( $line->{raw} ) - length( $line->{text} ) - $firstfrom;
241	34					90	$paras[-1]->{text} .= "\n" . q{ } x $extraspace . $line->{text};
242	34					124	$paras[-1]->{raw} .= "\n" . $line->{raw};
243							}
244							}
245							}
246
247							# Reapply hangs
248	18					142	for (grep $_->{'hang'}, @paras) {
249	92	50				568	next unless my $str = (delete $_->{hang})->stringify;
250	0					0	$_->{text} = $str . " " . $_->{text};
251							}
252	18					200	return @paras;
253							}
254
255							# we don't use Text::Tabs anymore as it may segfault on perl 5.8.x with
256							# UTF-8 strings and tabs mixed. http://rt.perl.org/rt3/Public/Bug/Display.html?id=40989
257							# This bug unlikely to be fixed in 5.8.x, however we use workaround.
258							# As soon as Text::Tabs will be fixed we can return back to it
259							my $tabstop = 8;
260							sub _expand_tabs {
261	18			18		27	my $pad;
262	18					41	for ( @_ ) {
263	126					117	my $offs = 0;
264	126					177	s{(.*?)\t}{
265	1					4	$pad = $tabstop - (length($1) + $offs) % $tabstop;
266	1					2	$offs += length($1)+$pad;
267	1					8	$1 . (" " x $pad);
268							}eg;
269							}
270	18					79	return @_;
271							}
272
273							=head1 CREDITS
274
275							Most of the heavy lifting is done by a modified version of Damian Conway's
276							C.
277
278							=head1 COPYRIGHT
279
280							Copyright (C) 2002-2003 Kasei Limited
281							Copyright (C) 2003-2004 Simon Cozens
282							Copyright (C) 2004-2013 Best Practical Solutions, LLC
283
284							This software is distributed WITHOUT ANY WARRANTY; without even the implied
285							warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
286
287							This library is free software; you can redistribute it and/or modify
288							it under the same terms as Perl itself.
289
290							=cut
291
292							1;