File Coverage

blib/lib/HTML/FormatText/WithLinks/AndTables.pm
Criterion Covered Total %
statement 107 108 99.0
branch 30 36 83.3
condition 8 10 80.0
subroutine 8 8 100.0
pod 2 3 66.6
total 155 165 93.9


line stmt bran cond sub pod time code
1             package HTML::FormatText::WithLinks::AndTables;
2              
3 7     7   16182 use strict;
  7         9  
  7         162  
4 7     7   21 use warnings;
  7         7  
  7         250  
5              
6             our $VERSION = '0.07'; # VERSION
7              
8 7     7   21 use base 'HTML::FormatText::WithLinks';
  7         8  
  7         3186  
9 7     7   252582 use HTML::TreeBuilder;
  7         10  
  7         27  
10              
11             ################################################################################
12             # configuration defaults
13             ################################################################################
14             my $cellpadding = 1; # number of horizontal spaces to pad interior of cells
15             my $no_rowspacing = 0; # boolean, suppress space between table rows and rows with empty <td>s
16             ################################################################################
17              
18             =head1 NAME
19              
20             HTML::FormatText::WithLinks::AndTables - Converts HTML to Text with tables intact
21              
22             =head1 VERSION
23              
24             version 0.07
25              
26             =cut
27              
28             =head1 SYNOPSIS
29              
30             use HTML::FormatText::WithLinks::AndTables;
31              
32             my $text = HTML::FormatText::WithLinks::AndTables->convert($html);
33              
34             Or optionally...
35              
36             my $conf = { # same as HTML::FormatText excepting below
37             cellpadding => 2, # defaults to 1
38             no_rowspacing => 1, # bool, suppress vertical space between table rows
39             };
40              
41             my $text = HTML::FormatText::WithLinks::AndTables->convert($html, $conf);
42              
43             =head1 DESCRIPTION
44              
45             This module was inspired by HTML::FormatText::WithLinks which has proven to be a
46             useful `lynx -dump` work-alike. However one frustration was that no other HTML
47             converters I came across had the ability to deal effectively with HTML <TABLE>s.
48             This module can in a rudimentary sense do so. The aim was to provide facility to take
49             a simple HTML based email template, and to also convert it to text with the
50             structure intact for inclusion as "multipart/alternative" content. Further, it will
51             preserve both the formatting specified by the <TD> tag's "align" attribute, and will
52             also preserve multiline text inside of a <TD> element provided it is broken using
53             <BR/> tags.
54              
55             =head2 EXPORT
56              
57             None by default.
58              
59              
60             =head1 METHODS
61              
62             =head2 convert
63              
64             =cut
65              
66             my $parser_indent = 3; # HTML::FormatText::WithLinks adds this indent to data in each <td>
67             my $conf_defaults = {};
68              
69             # the one and only public interface
70             sub convert {
71 6 50   6 1 123 shift if $_[0] eq __PACKAGE__; # to make it function friendly
72 6         12 my ($html, $conf) = @_;
73              
74             # over-ride our defaults
75 6 50 33     53 if ($conf and ref $conf eq 'HASH') {
76 6 50       24 $no_rowspacing = $$conf{no_rowspacing} if $$conf{no_rowspacing};
77 6         12 delete $$conf{no_rowspacing};
78 6 100       25 $cellpadding = $$conf{cellpadding} if $$conf{cellpadding};
79 6         8 delete $$conf{cellpadding};
80 6         34 %$conf_defaults = (%$conf_defaults, %$conf);
81             }
82              
83 6         65 return __PACKAGE__->new->parse($html);
84             }
85              
86             # sub-class configure
87             sub configure {
88             # SUPER::configure actually modifies the hash, so we need to pass a copy
89 18     18 0 422 my %configure = %$conf_defaults;
90              
91 18         66 shift()->SUPER::configure(\%configure);
92             }
93              
94             # sub-class parse
95             sub parse {
96              
97 6     6 1 382 my $self = shift;
98 6         10 my $html = shift;
99              
100 6 50       19 return unless defined $html;
101 6 50       19 return '' if $html eq '';
102              
103 6         47 my $tree = HTML::TreeBuilder->new->parse( $html );
104 6         10968 return $self->_format_tables( $tree ); # we work our magic...
105              
106             }
107              
108             # a private method
109             sub _format_tables {
110 6     6   17 my $self = shift;
111 6         8 my $tree = shift;
112              
113 6         15 my $formatted_tables = []; # a nested stack for our formatted table text
114              
115             # the result of an all night programming session...
116             #
117             # essentially we take two passes over each table
118             # and modify the structure of text and html by replacing content with tokens
119             # then replacing the tokens after _parse() has converted it to text
120             #
121             # for each <tr>...</tr>
122             # we grab all its inner text (and/or parsed html), rearrange it into a
123             # single string of formatted text, and put a token into its first <td>
124             # once we have processed the html with _parse(), we replace the tokens with the
125             # corresponding formatted text
126              
127 6         74 my @tables = $tree->look_down(_tag=>'table');
128 6         640 my $table_count = 0;
129 6         12 for my $table (@tables) {
130 6         14 $formatted_tables->[$table_count] = [];
131 6         21 my @trs = $table->look_down(_tag=>'tr');
132 6         358 my @max_col_width; # max column widths by index
133             my @max_col_heights; # max column heights (for multi-line text) by index
134 0         0 my @col_lines; # a stack for our redesigned rows of column (<td>) text
135             FIRST_PASS: {
136 6         10 my $row_count = 0; # obviously a counter...
  6         13  
137 6         13 for my $tr (@trs) { # *** 1st pass over rows
138 9         25 $max_col_heights[$row_count] = 0;
139 9         12 $col_lines[$row_count] = [];
140 9         51 my @cols = $tr->look_down(_tag=>qr/^(td|th)$/); # no support for . sorry.
141 9         499 for (my $i = 0; $i < scalar @cols; $i++) {
142 12         44 my $td = $cols[$i]->clone;
143 12         453 my $new_tree = HTML::TreeBuilder->new;
144 12         1578 $new_tree->{_content} = [ $td ];
145             # parse the contents of the td into text
146             # this doesn't work well with nested tables...
147 12         50 my $text = __PACKAGE__->new->_parse($new_tree);
148             # we don't want leading or trailing whitespace
149 12         46398 $text =~ s/\xA0+/ /s; # &nbsp; -> space
150 12         47 $text =~ s/^\s+//s;
151 12         35 $text =~ s/\s+\z//s;
152             # now we figure out the maximum widths and heights needed for each column
153 12         84 my $max_line_width = 0;
154 12         38 my @lines = split "\n", $text; # take the parsed text and break it into virtual rows
155 12 100       38 $max_col_heights[$row_count] = scalar @lines if scalar @lines > $max_col_heights[$row_count];
156 12         20 for my $line (@lines) {
157 25         20 my $line_width = length $line;
158 25 100       38 $max_line_width = $line_width if $line_width > $max_line_width;
159             }
160 12         29 $cols[$i]->{_content} = [ $text ];
161 12   100     58 $max_col_width[$i] ||= 0;
162 12 100       27 $max_col_width[$i] = $max_line_width if $max_line_width > $max_col_width[$i];
163             # now put the accumulated lines onto our stack
164 12         167 $col_lines[$row_count]->[$i] = \@lines;
165             }
166 9         14 $tr->{_content} = \@cols;
167 9         19 $row_count++;
168             }
169             }
170              
171             SECOND_PASS: {
172 6         8 my $row_count = 0; # obviously, another counter...
  6         9  
173 6         11 for my $tr (@trs) { # *** 2nd pass over rows
174 9         49 my @cols = $tr->look_down(_tag=>qr/^(td|th)$/); # no support for . sorry.
175              
176 9         423 my $row_text; # the final string representing each row of reformatted text
177              
178             my @col_rows; # a stack for each virtual $new_line spliced together from a group of <td>'s
179              
180             # iterate over each column of the maximum rows of parsed multiline text per
181             # for each virtual row of each virtual column, concat the text with alignment spacings
182             # the final concatenated string value will be placed in column 0
183 9         35 for (my $j = 0; $j < $max_col_heights[$row_count]; $j++) {
184 14         12 my $new_line;
185 14         26 for (my $i = 0; $i < scalar @cols; $i++) { # here are the actual <td> elements we're iterating over...
186 26         26 my $width = $max_col_width[$i] + $cellpadding; # how wide is this column of text
187 26         23 my $line = $col_lines[$row_count]->[$i]->[$j]; # get the text to fit into it
188 26 100       33 $line = defined $line ? $line : '';
189              
190             # strip the whitespace from beginning and end of each line
191 26         44 $line =~ s/^\s+//gs;
192 26         31 $line =~ s/\s+\z//gs;
193 26         22 my $n_space = $width - length $line; # the difference between the column and text widths
194              
195             # we are creating virtual rows of text within a single <td>
196             # so we need to add an indent to all but the first row to
197             # match the indent added by _parse() for presenting table contents
198 26 100 100     68 $line = ((' ')x$parser_indent). $line if $j != 0 and $i == 0;
199              
200             # here we adjust the text alignment by wrapping the text in occulted whitespace
201 26 100 100     49 my $justify = $cols[$i]->tag eq 'td' ? ( $cols[$i]->attr('align') || 'left' ) : 'center';
202 26 100       284 if ($justify eq 'center') {
    100          
203 1         4 my $pre = int( ($n_space + $cellpadding) / 2 ); # divide remaining space in half
204 1         1 my $post = $n_space - $pre; # assign any uneven remainder to the end
205 1         6 $new_line .= ((' ')x$pre). $line .((' ')x$post); # wrap the text in spaces
206             } elsif ($justify eq 'left') {
207 15         49 $new_line .= ((' ')x$cellpadding). $line .((' ')x$n_space);
208             } else {
209 10         24 $new_line .= ((' ')x$n_space). $line .((' ')x$cellpadding);
210             }
211             }
212 14 100       32 $new_line .= "\n" if $j != $max_col_heights[$row_count] - 1; # add a newline to all but the last text row
213 14         26 $col_rows[$j] = $new_line; # put the line into the stack for this row
214             }
215 9         26 $row_text .= $_ for @col_rows;
216 9         33 for (my $i = 1; $i < scalar @cols; $i++) {
217 4         7 $cols[$i]->delete; # get rid of unneeded <td>'s
218             }
219             # put the fully formatted text into our accumulator
220 9         98 $formatted_tables->[$table_count]->[$row_count] = $row_text;
221 9 100       20 if (scalar @cols) {
222 8         45 $cols[0]->content->[0] = "__TOKEN__${table_count}__${row_count}__"; # place a token into the row at col 0
223             }
224 9         38 $row_count++;
225             }
226             }
227 6         19 $table_count++;
228             }
229              
230             # now replace our tokens
231 6         24 my $text = $self->_parse( $tree );
232 6         15071 for (my $i = 0; $i < scalar @$formatted_tables; $i++) {
233 6         13 for (my $j = 0; $j < scalar @{ $$formatted_tables[$i] }; $j++) {
  15         51  
234 9         24 my $token = "__TOKEN__${i}__${j}__";
235 9 50       21 $token .= "\n?" if $no_rowspacing;
236 9         14 my $new_text = $$formatted_tables[$i][$j];
237 9 100       20 if (defined $new_text) {
238 6         80 $text =~ s/$token/$new_text/;
239             }
240             else {
241 3         37 $text =~ s/$token//;
242             }
243             }
244             }
245              
246 6         85 return $text;
247             }
248              
249             1;
250             __END__