File Coverage

blib/lib/HTML/FormatText/WithLinks/AndTables.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package HTML::FormatText::WithLinks::AndTables;
2              
3 2     2   39135 use strict;
  2         5  
  2         66  
4 2     2   11 use warnings;
  2         5  
  2         59  
5              
6 2     2   10 use base 'HTML::FormatText::WithLinks';
  2         7  
  2         1340  
7 2     2   1901 use HTML::TreeBuilder;
  0            
  0            
8              
9             ################################################################################
10             # configuration defaults
11             ################################################################################
12             my $cellpadding = 1; # number of horizontal spaces to pad interior of cells; file-scoped, overwritten by convert()
13             my $no_rowspacing = 0; # boolean, suppress vertical space between table rows (including rows with empty <td>s)
14             ################################################################################
15              
16             =head1 NAME
17              
18             HTML::FormatText::WithLinks::AndTables - Converts HTML to Text with tables intact
19              
20             =head1 VERSION
21              
22             Version 0.02
23              
24             =cut
25              
26             our $VERSION = '0.02'; # module version; the POD "VERSION" section should state the same number
27              
28             =head1 SYNOPSIS
29              
30             use HTML::FormatText::WithLinks::AndTables;
31              
32             my $text = HTML::FormatText::WithLinks::AndTables->convert($html);
33              
34             Or optionally...
35              
36             my $conf = { # same as HTML::FormatText excepting below
37             cellpadding => 2, # defaults to 1
38             no_rowspacing => 1, # bool, suppress vertical space between table rows
39             };
40              
41             my $text = HTML::FormatText::WithLinks::AndTables->convert($html, $conf);
42              
43             =head1 DESCRIPTION
44              
45             This module was inspired by HTML::FormatText::WithLinks which has proven to be a
46             useful `lynx -dump` work-alike. However one frustration was that no other HTML
47             converters I came across had the ability to deal effectively with HTML <table>s.
48             This module can in a rudimentary sense do so. The aim was to provide a facility to take
49             a simple HTML based email template, and to also convert it to text with the
50             structure intact for inclusion as "multipart/alternative" content. Further, it will
51             preserve both the formatting specified by the <td> tag's "align" attribute, and will
52             also preserve multiline text inside of a <td> element provided it is broken using
53             <br/> tags.
54              
55             =head2 EXPORT
56              
57             None by default.
58              
59              
60             =head1 METHODS
61              
62             =head2 convert
63              
64             =cut
65              
66             my $parser_indent = 3; # indent that HTML::FormatText::WithLinks adds to the data in each <td>; re-applied to continuation lines in _format_tables()
67             my $conf_defaults = {}; # options handed to SUPER::configure(); filled from the caller's hash by convert()
68              
69             # the one and only public interface: convert($html, \%conf), callable as a class method or a plain function; returns whatever parse() returns
70             sub convert {
71             shift if $_[0] eq __PACKAGE__; # drop the class name when called as a class method, so plain function calls work too
72             my ($html, $conf) = @_;
73              
74             # over-ride our defaults (these are file-scoped variables, so an override stays in effect for later calls)
75             if ($conf and ref $conf eq 'HASH') {
76             $no_rowspacing = $$conf{no_rowspacing} if $$conf{no_rowspacing}; # only a true value overrides
77             delete $$conf{no_rowspacing}; # NOTE(review): this removes the key from the caller's own hash
78             $cellpadding = $$conf{cellpadding} if $$conf{cellpadding}; # only a true value overrides, so cellpadding => 0 is ignored
79             delete $$conf{cellpadding}; # NOTE(review): this removes the key from the caller's own hash
80             %$conf_defaults = (%$conf_defaults, %$conf); # remaining keys are merged into the options passed on by configure()
81             }
82              
83             return __PACKAGE__->new->parse($html); # a fresh formatter object is built for every call
84             }
85              
86             # sub-class configure: ignores its own arguments and passes the file-scoped $conf_defaults hash to the parent's configure()
87             sub configure {
88             shift()->SUPER::configure($conf_defaults);
89             }
90              
91             # sub-class parse: build an HTML::TreeBuilder tree from $html and return the text produced by _format_tables()
92             sub parse {
93              
94             my $self = shift;
95             my $html = shift;
96              
97             return undef unless defined $html; # undef in, undef out
98             return '' if $html eq ''; # empty input short-circuits to an empty string
99              
100             my $tree = HTML::TreeBuilder->new->parse( $html ); # NOTE(review): eof() is never called and the tree is never delete()d here - confirm that is acceptable
101             return $self->_format_tables( $tree ); # we work our magic...
102              
103             }
104              
105             # a private method: lays out every <table> found in $tree as aligned text columns and returns the parsed text of the whole tree
106             sub _format_tables {
107             my $self = shift;
108             my $tree = shift;
109              
110             my $formatted_tables = []; # formatted row text, stored as [table index][row index]
111              
112             # the result of an all night programming session...
113             #
114             # essentially we take two passes over each table
115             # and modify the structure of text and html by replacing content with tokens
116             # then replacing the tokens after _parse() has converted it to text
117             #
118             # for each <tr>
...
119             # we grab all its inner text (and/or parsed html), rearrange it into a
120             # single string of formatted text, and put a token into its first <td>
121             # once we have processed the html with _parse(), we replace the tokens with the
122             # corresponding formatted text
123              
124             my @tables = $tree->look_down(_tag=>'table');
125             my $table_count = 0;
126             for my $table (@tables) {
127             $formatted_tables->[$table_count] = [];
128             my @trs = $table->look_down(_tag=>'tr');
129             my @max_col_width; # widest text line seen so far, by column index
130             my @max_col_heights; # largest number of text lines in any cell (for multi-line text), by row index
131             my @col_lines; # the split text lines of every cell, stored as [row index][column index]
132             FIRST_PASS: { # measure every cell and collect its text lines
133             my $row_count = 0; # index of the current row within this table
134             for my $tr (@trs) { # *** 1st pass over rows
135             $max_col_heights[$row_count] = 0;
136             $col_lines[$row_count] = [];
137             my @cols = $tr->look_down(_tag=>'td'); # only <td> cells are collected, so header (<th>) cells are not handled. sorry.
138             for (my $i = 0; $i < scalar @cols; $i++) {
139             my $td = $cols[$i]->clone;
140             my $new_tree = HTML::TreeBuilder->new;
141             $new_tree->{_content} = [ $td ]; # wrap the cloned cell in a tree of its own so it can be parsed in isolation
142             # parse the contents of the td into text
143             # this doesn't work well with nested tables...
144             my $text = __PACKAGE__->new->_parse($new_tree);
145             # we don't want leading or trailing whitespace
146             $text =~ s/^\s+//s;
147             $text =~ s/\s+\z//s;
148             # now we figure out the maximum widths and heights needed for each column
149             my $max_line_width = 0;
150             my @lines = split "\n", $text; # take the parsed text and break it into virtual rows
151             $max_col_heights[$row_count] = scalar @lines if scalar @lines > $max_col_heights[$row_count];
152             for my $line (@lines) {
153             my $line_width = length $line;
154             $max_line_width = $line_width if $line_width > $max_line_width;
155             }
156             $cols[$i]->{_content} = [ $text ]; # the cell's content is replaced by its trimmed text
157             $max_col_width[$i] ||= 0;
158             $max_col_width[$i] = $max_line_width if $max_line_width > $max_col_width[$i];
159             # now put the accumulated lines onto our stack
160             $col_lines[$row_count]->[$i] = \@lines;
161             }
162             $tr->{_content} = \@cols; # the row's content becomes exactly the <td> elements found above
163             $row_count++;
164             }
165             }
166              
167             SECOND_PASS: { # build one block of text per row and leave a token in the row's first cell
168             my $row_count = 0; # obviously, another counter...
169             for my $tr (@trs) { # *** 2nd pass over rows
170             my @cols = $tr->look_down(_tag=>'td'); # only <td> cells again, matching the first pass
171              
172             my $row_text; # the final string representing each row of reformatted text
173              
174             my @col_rows; # a stack for each virtual $new_line spliced together from a group of <td>'s
175              
176             # iterate over each virtual text line of this row, up to the line count of its tallest cell
177             # for each virtual row of each virtual column, concat the text with alignment spacings
178             # the final concatenated string value will be placed in column 0
179             for (my $j = 0; $j < $max_col_heights[$row_count]; $j++) {
180             my $new_line;
181             for (my $i = 0; $i < scalar @cols; $i++) { # here are the actual <td> elements we're iterating over...
182             my $width = $max_col_width[$i] + $cellpadding; # how wide is this column of text
183             my $line = $col_lines[$row_count]->[$i]->[$j]; # get the text to fit into it
184             $line = defined $line ? $line : ''; # cells with fewer lines than the tallest cell contribute an empty string
185              
186             # strip the whitespace from beginning and end of each line
187             $line =~ s/^\s+//gs;
188             $line =~ s/\s+\z//gs;
189             my $n_space = $width - length $line; # the difference between the column and text widths
190              
191             # we are creating virtual rows of text within a single <td>
192             # so we need to add an indent to all but the first row to
193             # match the indent added by _parse() for presenting table contents
194             $line = ((' ')x$parser_indent). $line if $j != 0 and $i == 0;
195              
196             # here we adjust the text alignment by wrapping the text in occulted whitespace
197             my $justify = $cols[$i]->tag eq 'td' ? ( $cols[$i]->attr('align') || 'left' ) : 'center'; # @cols holds only <td>s, so the 'center' fallback is not reached here
198             if ($justify eq 'center') {
199             my $pre = int( ($n_space + $cellpadding) / 2 ); # divide remaining space in half
200             my $post = $n_space - $pre; # assign any uneven remainder to the end
201             $new_line .= ((' ')x$pre). $line .((' ')x$post); # wrap the text in spaces
202             } elsif ($justify eq 'left') {
203             $new_line .= ((' ')x$cellpadding). $line .((' ')x$n_space);
204             } else { # any other align value is laid out flush right
205             $new_line .= ((' ')x$n_space). $line .((' ')x$cellpadding);
206             }
207             }
208             $new_line .= "\n" if $j != $max_col_heights[$row_count] - 1; # add a newline to all but the last text row
209             $col_rows[$j] = $new_line; # put the line into the stack for this row
210             }
211             $row_text .= $_ for @col_rows; # stays undef when the row has no text lines at all
212             for (my $i = 1; $i < scalar @cols; $i++) {
213             $cols[$i]->delete; # get rid of unneeded <td>'s
214             }
215             # put the fully formatted text into our accumulator
216             $formatted_tables->[$table_count]->[$row_count] = $row_text;
217             $cols[0]->content->[0] = "__TOKEN__${table_count}__${row_count}__"; # place a token into the row at col 0; NOTE(review): $cols[0] is undef for a row without any <td>, so this call would die - confirm
218             $row_count++;
219             }
220             }
221             $table_count++;
222             }
223              
224             # now replace our tokens
225             my $text = $self->_parse( $tree );
226             for (my $i = 0; $i < scalar @$formatted_tables; $i++) {
227             for (my $j = 0; $j < scalar @{ $$formatted_tables[$i] }; $j++) {
228             my $token = "__TOKEN__${i}__${j}__"; # same spelling as the token planted in the second pass
229             $token .= "\n?" if $no_rowspacing; # also consume the newline that follows the token, if there is one
230             my $new_text = $$formatted_tables[$i][$j];
231             $text =~ s/$token/$new_text/; # no /g: only the first occurrence of the token is replaced
232             }
233             }
234              
235             return $text;
236             }
237              
238             1;
239             __END__