File Coverage

blib/lib/Formatter/HTML/HTML.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package Formatter::HTML::HTML;
2              
3 2     2   15195 use 5.006;
  2         8  
  2         81  
4 2     2   12 use strict;
  2         3  
  2         99  
5 2     2   11 use warnings;
  2         7  
  2         139  
6 2     2   2057 use HTML::Tidy;
  0            
  0            
7             use HTML::TokeParser;
8              
9             use base qw( HTML::Tidy );
10              
11              
12             our $VERSION = '0.97';
13              
14             =head1 NAME
15              
16             Formatter::HTML::HTML - Formatter to clean existing HTML
17              
18             =head1 SYNOPSIS
19              
20             use Formatter::HTML::HTML;
21             my $formatter = Formatter::HTML::HTML->format($data);
22             print $formatter->document;
23             print $formatter->title;
24             my $links = $text->links;
25             print ${$links}[0]->{url};
26              
27             =head1 DESCRIPTION
28              
29             This module will clean the document using L. It also
30             inherits from that module, so you can use methods of that class. It
31             can also parse and return links and the title (using
32             L).
33              
34             =head1 METHODS
35              
36             This module conforms with the L API specification, version 0.95:
37              
38             =over
39              
40             =item C 'path/to/tidy.cfg'} )>
41              
42             The format function that you call to initialise the formatter. It
43             takes the plain text as a string argument and returns an object of
44             this class.
45              
46             Optionally, you may give a hashref with the full file name of the tidy
47             config. This enables you to have this Formatter return valid XHTML,
48             just set it correctly in the config. Note also that you may break the
49             Formatter by e.g. returning configuring tidy to return just a
50             fragment, and it is your own resonsibility to make sure you don't.
51              
52             =cut
53              
54             sub format {
55             my ($that, $text, $config) = @_;
56             my $class = ref($that) || $that;
57             my $tidy = new HTML::Tidy($config); # In fact, we let it do the hard work
58             my $clean = $tidy->clean($text); # allready. It has to be done anyway.
59             my $self = {
60             _out => $clean,
61             };
62             bless($self, $class);
63             return $self;
64             }
65              
66              
67             =item C
68              
69             Will return a full, cleaned and valid HTML document. You may specify
70             an optional C<$charset> parameter. This will include a HTML C
71             element with the chosen character set. It will still be your
72             responsibility to ensure that the document served is encoded with this
73             character set.
74              
75              
76             =cut
77              
78             sub document {
79             my $self = shift;
80             my $charset = shift;
81             my $cleaned = $self->{_out};
82             if (($charset) && ($cleaned !~ m/charset/)) {
83             $cleaned =~ s|()|$1\n|si;
84             }
85             return $cleaned;
86             }
87              
88              
89             =item C
90              
91             This will return only the contents of the C element.
92              
93             =cut
94              
95             sub fragment {
96             my $self = shift;
97             if ($self->{_out} =~ m|(.*)|si) {
98             return $1;
99             } else {
100             return $self->{_out}
101             }
102             }
103              
104             =item C
105              
106             Will return all links found the input plain text string as an
107             arrayref. The arrayref will for each element keys url and title, the
108             former containing the URL, the latter the text of the link.
109              
110              
111             =cut
112              
113             sub links {
114             my $self = shift;
115             my @arr;
116             my $p = HTML::TokeParser->new(\$self->{_out});
117              
118             while (my $token = $p->get_tag("a")) {
119             my $url = $token->[1]{href} || "-";
120             my $text = $p->get_trimmed_text("/a");
121             push(@arr, {url => $url, title => $text});
122             }
123             return \@arr;
124             }
125              
126             # Both links and title are taken right from examples in TokeParser!
127             # Nice of them, huh? :-)
128              
129             =item C </td> </tr> <tr> <td class="h" > <a name="130">130</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="131">131</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> Will return the title of the document as seen in the HTML C<title> </td> </tr> <tr> <td class="h" > <a name="132">132</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> element or undef if none can be found. </td> </tr> <tr> <td class="h" > <a name="133">133</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="134">134</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> =cut </td> </tr> <tr> <td class="h" > <a name="135">135</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="136">136</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="137">137</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> sub title { </td> </tr> <tr> <td class="h" > <a name="138">138</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> my $self = shift; </td> </tr> <tr> <td class="h" > <a name="139">139</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> my $p = HTML::TokeParser->new(\$self->{_out}); </td> </tr> <tr> <td class="h" > <a name="140">140</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="141">141</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> if ($p->get_tag("title")) { </td> </tr> <tr> <td class="h" > <a name="142">142</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> return $p->get_trimmed_text; </td> </tr> <tr> <td class="h" > <a name="143">143</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="144">144</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> return undef; </td> </tr> <tr> <td class="h" > <a name="145">145</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> } </td> </tr> <tr> <td class="h" > <a name="146">146</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="147">147</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s">   </td> </tr> <tr> <td class="h" > <a name="148">148</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> 1; </td> </tr> <tr> <td class="h" > <a name="149">149</a> </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td >   </td> <td class="s"> __END__ </td> </tr> </table> </body> </html>