File Coverage

blib/lib/HTML/FormatText/WithLinks/AndTables.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package HTML::FormatText::WithLinks::AndTables;
2              
3 7     7   22920 use strict;
  7         14  
  7         233  
4 7     7   27 use warnings;
  7         9  
  7         361  
5              
6             our $VERSION = '0.06'; # VERSION
7              
8 7     7   33 use base 'HTML::FormatText::WithLinks';
  7         17  
  7         2711  
9 7     7   6377 use HTML::TreeBuilder;
  0            
  0            
10              
11             ################################################################################
12             # configuration defaults
13             ################################################################################
14             my $cellpadding = 1; # number of horizontal spaces to pad interior of cells
15             my $no_rowspacing = 0; # boolean, suppress space between table rows and rows with empty s
16             ################################################################################
17              
18             =head1 NAME
19              
20             HTML::FormatText::WithLinks::AndTables - Converts HTML to Text with tables intact
21              
22             =head1 VERSION
23              
24             version 0.06
25              
26             =cut
27              
28             =head1 SYNOPSIS
29              
30             use HTML::FormatText::WithLinks::AndTables;
31              
32             my $text = HTML::FormatText::WithLinks::AndTables->convert($html);
33              
34             Or optionally...
35              
36             my $conf = { # same as HTML::FormatText excepting below
37             cellpadding => 2, # defaults to 1
38             no_rowspacing => 1, # bool, suppress vertical space between table rows
39             };
40              
41             my $text = HTML::FormatText::WithLinks::AndTables->convert($html, $conf);
42              
43             =head1 DESCRIPTION
44              
45             This module was inspired by HTML::FormatText::WithLinks which has proven to be a
46             useful `lynx -dump` work-alike. However one frustration was that no other HTML
47             converters I came across had the ability to deal affectively with HTML s.
48             This module can in a rudimentary sense do so. The aim was to provide facility to take
49             a simple HTML based email template, and to also convert it to text with the
50             structure intact for inclusion as "multipart/alternative" content. Further, it will
51             preserve both the formatting specified by the tag's "align" attribute, and will
52             also preserve multiline text inside of a element provided it is broken using
53             tags.
54              
55             =head2 EXPORT
56              
57             None by default.
58              
59              
60             =head1 METHODS
61              
62             =head2 convert
63              
64             =cut
65              
66             my $parser_indent = 3; # HTML::FormatText::WithLinks adds this indent to data in each
67             my $conf_defaults = {};
68              
69             # the one and only public interface
70             sub convert {
71             shift if $_[0] eq __PACKAGE__; # to make it function friendly
72             my ($html, $conf) = @_;
73              
74             # over-ride our defaults
75             if ($conf and ref $conf eq 'HASH') {
76             $no_rowspacing = $$conf{no_rowspacing} if $$conf{no_rowspacing};
77             delete $$conf{no_rowspacing};
78             $cellpadding = $$conf{cellpadding} if $$conf{cellpadding};
79             delete $$conf{cellpadding};
80             %$conf_defaults = (%$conf_defaults, %$conf);
81             }
82              
83             return __PACKAGE__->new->parse($html);
84             }
85              
86             # sub-class configure
87             sub configure {
88             # SUPER::configure actually modifies the hash, so we need to pass a copy
89             my %configure = %$conf_defaults;
90              
91             shift()->SUPER::configure(\%configure);
92             }
93              
94             # sub-class parse
95             sub parse {
96              
97             my $self = shift;
98             my $html = shift;
99              
100             return unless defined $html;
101             return '' if $html eq '';
102              
103             my $tree = HTML::TreeBuilder->new->parse( $html );
104             return $self->_format_tables( $tree ); # we work our magic...
105              
106             }
107              
108             # a private method
109             sub _format_tables {
110             my $self = shift;
111             my $tree = shift;
112              
113             my $formatted_tables = []; # a nested stack for our formatted table text
114              
115             # the result of an all night programming session...
116             #
117             # essentially we take two passes over each table
118             # and modify the structure of text and html by replacing content with tokens
119             # then replacing the tokens after _parse() has converted it to text
120             #
121             # for each
...
122             # we grab all it's inner text (and/or parsed html), rearrange it into a
123             # single string of formatted text, and put a token into it's first
124             # once we have processed the html with _parse(), we replace the tokens with the
125             # corresponding formatted text
126              
127             my @tables = $tree->look_down(_tag=>'table');
128             my $table_count = 0;
129             for my $table (@tables) {
130             $formatted_tables->[$table_count] = [];
131             my @trs = $table->look_down(_tag=>'tr');
132             my @max_col_width; # max column widths by index
133             my @max_col_heights; # max column heights (for multi-line text) by index
134             my @col_lines; # a stack for our redesigned rows of column () text
135             FIRST_PASS: {
136             my $row_count = 0; # obviously a counter...
137             for my $tr (@trs) { # *** 1st pass over rows
138             $max_col_heights[$row_count] = 0;
139             $col_lines[$row_count] = [];
140             my @cols = $tr->look_down(_tag=>qr/^(td|th)$/); # no support for . sorry.
141             for (my $i = 0; $i < scalar @cols; $i++) {
142             my $td = $cols[$i]->clone;
143             my $new_tree = HTML::TreeBuilder->new;
144             $new_tree->{_content} = [ $td ];
145             # parse the contents of the td into text
146             # this doesn't work well with nested tables...
147             my $text = __PACKAGE__->new->_parse($new_tree);
148             # we don't want leading or tailing whitespace
149             $text =~ s/^\s+//s;
150             $text =~ s/\s+\z//s;
151             # now we figure out the maximum widths and heights needed for each column
152             my $max_line_width = 0;
153             my @lines = split "\n", $text; # take the parsed text and break it into virtual rows
154             $max_col_heights[$row_count] = scalar @lines if scalar @lines > $max_col_heights[$row_count];
155             for my $line (@lines) {
156             my $line_width = length $line;
157             $max_line_width = $line_width if $line_width > $max_line_width;
158             }
159             $cols[$i]->{_content} = [ $text ];
160             $max_col_width[$i] ||= 0;
161             $max_col_width[$i] = $max_line_width if $max_line_width > $max_col_width[$i];
162             # now put the accumulated lines onto our stack
163             $col_lines[$row_count]->[$i] = \@lines;
164             }
165             $tr->{_content} = \@cols;
166             $row_count++;
167             }
168             }
169              
170             SECOND_PASS: {
171             my $row_count = 0; # obviously, another counter...
172             for my $tr (@trs) { # *** 2nd pass over rows
173             my @cols = $tr->look_down(_tag=>qr/^(td|th)$/); # no support for . sorry.
174              
175             my $row_text; # the final string representing each row of reformatted text
176              
177             my @col_rows; # a stack for each virtual $new_line spliced together from a group of 's
178              
179             # iterate over each column of the maximum rows of parsed multiline text per
180             # for each virtual row of each virtual column, concat the text with alignment spacings
181             # the final concatinated string value will be placed in column 0
182             for (my $j = 0; $j < $max_col_heights[$row_count]; $j++) {
183             my $new_line;
184             for (my $i = 0; $i < scalar @cols; $i++) { # here are the actual elements we're iterating over...
185             my $width = $max_col_width[$i] + $cellpadding; # how wide is this column of text
186             my $line = $col_lines[$row_count]->[$i]->[$j]; # get the text to fit into it
187             $line = defined $line ? $line : '';
188              
189             # strip the whitespace from beginning and end of each line
190             $line =~ s/^\s+//gs;
191             $line =~ s/\s+\z//gs;
192             my $n_space = $width - length $line; # the difference between the column and text widths
193              
194             # we are creating virtual rows of text within a single
195             # so we need to add an indent to all but the first row to
196             # match the indent added by _parse() for presenting table contents
197             $line = ((' ')x$parser_indent). $line if $j != 0 and $i == 0;
198              
199             # here we adjust the text alignment by wrapping the text in occulted whitespace
200             my $justify = $cols[$i]->tag eq 'td' ? ( $cols[$i]->attr('align') || 'left' ) : 'center';
201             if ($justify eq 'center') {
202             my $pre = int( ($n_space + $cellpadding) / 2 ); # divide remaining space in half
203             my $post = $n_space - $pre; # assign any uneven remainder to the end
204             $new_line .= ((' ')x$pre). $line .((' ')x$post); # wrap the text in spaces
205             } elsif ($justify eq 'left') {
206             $new_line .= ((' ')x$cellpadding). $line .((' ')x$n_space);
207             } else {
208             $new_line .= ((' ')x$n_space). $line .((' ')x$cellpadding);
209             }
210             }
211             $new_line .= "\n" if $j != $max_col_heights[$row_count] - 1; # add a newline to all but the last text row
212             $col_rows[$j] = $new_line; # put the line into the stack for this row
213             }
214             $row_text .= $_ for @col_rows;
215             for (my $i = 1; $i < scalar @cols; $i++) {
216             $cols[$i]->delete; # get rid of unneeded 's
217             }
218             # put the fully formatted text into our accumulator
219             $formatted_tables->[$table_count]->[$row_count] = $row_text;
220             if (scalar @cols) {
221             $cols[0]->content->[0] = "__TOKEN__${table_count}__${row_count}__"; # place a token into the row at col 0
222             }
223             $row_count++;
224             }
225             }
226             $table_count++;
227             }
228              
229             # now replace our tokens
230             my $text = $self->_parse( $tree );
231             for (my $i = 0; $i < scalar @$formatted_tables; $i++) {
232             for (my $j = 0; $j < scalar @{ $$formatted_tables[$i] }; $j++) {
233             my $token = "__TOKEN__${i}__${j}__";
234             $token .= "\n?" if $no_rowspacing;
235             my $new_text = $$formatted_tables[$i][$j];
236             if (defined $new_text) {
237             $text =~ s/$token/$new_text/;
238             }
239             else {
240             $text =~ s/$token//;
241             }
242             }
243             }
244              
245             return $text;
246             }
247              
248             1;
249             __END__