File Coverage

lib/DataFlow/Proc/HTMLFilter.pm
Criterion Covered Total %
statement 21 32 65.6
branch 0 8 0.0
condition n/a
subroutine 7 10 70.0
pod n/a
total 28 50 56.0


line stmt bran cond sub pod time code
1             package DataFlow::Proc::HTMLFilter;
2              
3 2     2   89316 use strict;
  2         5  
  2         76  
4 2     2   10 use warnings;
  2         4  
  2         101  
5              
6             # ABSTRACT: A HTML filtering processor
7              
8             our $VERSION = '1.112100'; # VERSION
9              
10 2     2   491493 use Moose;
  2         7678026  
  2         19  
11             extends 'DataFlow::Proc';
12              
13 2     2   16937 use namespace::autoclean;
  2         2867  
  2         11  
14              
15 2     2   1989 use HTML::TreeBuilder::XPath;
  2         212146  
  2         33  
16 2     2   2436 use MooseX::Aliases;
  2         2659  
  2         13  
17 2     2   133363 use Moose::Util::TypeConstraints 1.01;
  2         77  
  2         19  
18              
               # Named type listing the only values accepted by 'result_type'.
19             enum 'HTMLFilterTypes', [qw(NODE HTML VALUE)];
20              
               # XPath expression applied to each HTML item. Mandatory; it can also
               # be given/read through the alias 'xpath'.
21             has 'search_xpath' => (
22             'is' => 'ro',
23             'isa' => 'Str',
24             'required' => 1,
25             'alias' => 'xpath',
26             );
27              
               # Selects what the filter built by _build_p returns for each match:
               # 'VALUE', 'NODE' or 'HTML' (the default). Alias: 'type'.
28             has 'result_type' => (
29             'is' => 'ro',
30             'isa' => 'HTMLFilterTypes',
31             'default' => 'HTML',
32             'alias' => 'type',
33             );
34              
               # When true, _build_p wraps the list of results in a single array
               # reference instead of returning it as a plain list. False by default.
35             has 'ref_result' => (
36             'is' => 'ro',
37             'isa' => 'Bool',
38             'default' => 0,
39             );
40              
               # Boolean flag, false by default.
               # NOTE(review): no active code in this file reads 'nochomp'; its only
               # use is in a commented-out line inside _build_p.
41             has 'nochomp' => (
42             'is' => 'ro',
43             'isa' => 'Bool',
44             'default' => 0,
45             );
46              
               # Builds and returns the code reference that performs the filtering.
               #
               # The returned closure parses the HTML text found in $_ and applies
               # 'search_xpath' to it. Depending on 'result_type' it returns:
               #   VALUE - whatever findvalues() yields for the XPath expression;
               #   NODE  - the list of nodes returned by findnodes();
               #   HTML  - (any other value) each matched node rendered with as_HTML.
               # When 'ref_result' is true the closure is wrapped so that its result
               # list comes back as one array reference.
47             sub _build_p {
48 0     0     my $self = shift;
49              
               # Core filter. It takes its input from $_, not from @_
               # (presumably $_ is set by the caller in DataFlow::Proc - confirm).
50             my $proc = sub {
51 0     0     my $html = HTML::TreeBuilder::XPath->new_from_content($_);
52              
53             #warn 'xpath is built';
54             #warn 'values if VALUES';
               # VALUE mode short-circuits: no node list is built.
55 0 0         return $html->findvalues( $self->search_xpath )
56             if $self->result_type eq 'VALUE';
57              
58             #warn 'not values, find nodes';
59 0           my @result = $html->findnodes( $self->search_xpath );
60              
61             #use Data::Dumper; warn 'result = '.Dumper(\@result);
               # Nothing matched: return an empty list.
62 0 0         return () unless @result;
               # NODE mode hands back the matched node objects untouched.
63 0 0         return @result if $self->result_type eq 'NODE';
64              
65             #warn 'wants HTML';
               # Remaining case (HTML): serialize every matched node.
66 0           return map { $_->as_HTML } @result;
  0            
67 0           };
68              
               # NOTE: the two wrappers below are disabled, so 'nochomp' currently
               # has no effect in this sub; only 'ref_result' is honoured.
69             #my $proc2 = $self->nochomp ? $proc : sub { return chomp $proc->(@_) };
70             #my $proc3 = $self->ref_result ? sub { return [ $proc2->(@_) ] } : $proc2;
71              
               # With ref_result, collect the filter's list into one array ref.
72 0 0   0     return $self->ref_result ? sub { return [ $proc->(@_) ] } : $proc;
  0            
73             }
74              
75             __PACKAGE__->meta->make_immutable;
76              
77             1;
78              
79              
80              
81             =pod
82              
83             =encoding utf-8
84              
85             =head1 NAME
86              
87             DataFlow::Proc::HTMLFilter - A HTML filtering processor
88              
89             =head1 VERSION
90              
91             version 1.112100
92              
93             =head1 SYNOPSIS
94              
95             use DataFlow::Proc::HTMLFilter;
96              
97             my $filter_html = DataFlow::Proc::HTMLFilter->new(
98             search_xpath => '//td',
99             result_type => 'HTML',
100             );
101              
102             my $filter_value = DataFlow::Proc::HTMLFilter->new(
103             search_xpath => '//td',
104             result_type => 'VALUE',
105             );
106              
107             my $input = <<EOM;
108             <html><body>
109             <table>
110             <tr><td>Line 1</td><td>L1, Column 2</td>
111             <tr><td>Line 2</td><td>L2, Column 2</td>
112             </table>
113             </body></html>
114             EOM
115              
116             $filter_html->process( $input );
117             # @result == '<td>Line 1</td>', ... '<td>L2, Column 2</td>'
118              
119             $filter_value->process( $input );
120             # @result == q{Line 1}, ... q{L2, Column 2}
121              
122             =head1 DESCRIPTION
123              
124             This processor type provides a filter for HTML content.
125             Each item is treated as HTML content and is filtered
126             using L<HTML::TreeBuilder::XPath>.
127              
128             =head1 ATTRIBUTES
129              
130             =head2 search_xpath
131              
132             This attribute is an XPath string used to filter down the HTML content.
133             The C<search_xpath> attribute is mandatory.
134              
135             =head2 result_type
136              
137             This attribute is a string, but its value B<must> be one of:
138             C<HTML>, C<VALUE>, C<NODE>. The default is C<HTML>.
139              
140             =over 4
141              
142             =item *
143              
144             HTML
145              
146             The result will be the HTML content specified by C<search_xpath>.
147              
148             =item *
149              
150             VALUE
151              
152             The result will be the literal value enclosed by the tag and/or attribute
153             specified by C<search_xpath>.
154              
155             =item *
156              
157             NODE
158              
159             The result will be a list of L<HTML::Element> objects, as returned by the
160             C<findnodes> method of L<HTML::TreeBuilder::XPath> class.
161              
162             =back
163              
164             Most people will probably use C<HTML> or C<VALUE>, but this option is also
165             provided in case someone wants to manipulate the HTML elements directly.
166              
167             =head2 ref_result
168              
169             This attribute is a boolean, and it signals whether the result list should be
170             added as a list of items to the output queue, or as a reference to an array
171             of items. The default is 0 (false).
172              
173             There is a semantic subtlety here: if C<ref_result> is 1 (true),
174             then one HTML item (input) may generate at most one ArrayRef item (output),
175             i.e. it is a one-to-one mapping.
176             On the other hand, by keeping C<ref_result> as 0 (false), one HTML item
177             may produce any number of items as result,
178             i.e. it is a one-to-many mapping.
179              
180             =for :stopwords cpan testmatrix url annocpan anno bugtracker rt cpants kwalitee diff irc mailto metadata placeholders
181              
182             =head1 SUPPORT
183              
184             =head2 Perldoc
185              
186             You can find documentation for this module with the perldoc command.
187              
188             perldoc DataFlow::Proc::HTMLFilter
189              
190             =head2 Websites
191              
192             The following websites have more information about this module, and may be of help to you. As always,
193             in addition to those websites please use your favorite search engine to discover more resources.
194              
195             =over 4
196              
197             =item *
198              
199             Search CPAN
200              
201             The default CPAN search engine, useful to view POD in HTML format.
202              
203             L<http://search.cpan.org/dist/DataFlow-Proc-HTMLFilter>
204              
205             =item *
206              
207             AnnoCPAN
208              
209             The AnnoCPAN is a website that allows community annotations of Perl module documentation.
210              
211             L<http://annocpan.org/dist/DataFlow-Proc-HTMLFilter>
212              
213             =item *
214              
215             CPAN Ratings
216              
217             The CPAN Ratings is a website that allows community ratings and reviews of Perl modules.
218              
219             L<http://cpanratings.perl.org/d/DataFlow-Proc-HTMLFilter>
220              
221             =item *
222              
223             CPAN Forum
224              
225             The CPAN Forum is a web forum for discussing Perl modules.
226              
227             L<http://cpanforum.com/dist/DataFlow-Proc-HTMLFilter>
228              
229             =item *
230              
231             CPANTS
232              
233             The CPANTS is a website that analyzes the Kwalitee ( code metrics ) of a distribution.
234              
235             L<http://cpants.perl.org/dist/overview/DataFlow-Proc-HTMLFilter>
236              
237             =item *
238              
239             CPAN Testers
240              
241             The CPAN Testers is a network of smokers who run automated tests on uploaded CPAN distributions.
242              
243             L<http://www.cpantesters.org/distro/D/DataFlow-Proc-HTMLFilter>
244              
245             =item *
246              
247             CPAN Testers Matrix
248              
249             The CPAN Testers Matrix is a website that provides a visual way to determine what Perls/platforms PASSed for a distribution.
250              
251             L<http://matrix.cpantesters.org/?dist=DataFlow-Proc-HTMLFilter>
252              
253             =back
254              
255             =head2 Email
256              
257             You can email the author of this module at C<RUSSOZ at cpan.org> asking for help with any problems you have.
258              
259             =head2 Internet Relay Chat
260              
261             You can get live help by using IRC ( Internet Relay Chat ). If you don't know what IRC is,
262             please read this excellent guide: L<http://en.wikipedia.org/wiki/Internet_Relay_Chat>. Please
263             be courteous and patient when talking to us, as we might be busy or sleeping! You can join
264             those networks/channels and get help:
265              
266             =over 4
267              
268             =item *
269              
270             irc.perl.org
271              
272             You can connect to the server at 'irc.perl.org' and join this channel: #sao-paulo.pm then talk to this person for help: russoz.
273              
274             =back
275              
276             =head2 Bugs / Feature Requests
277              
278             Please report any bugs or feature requests by email to C<bug-dataflow-proc-htmlfilter at rt.cpan.org>, or through
279             the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=DataFlow-Proc-HTMLFilter>. You will be automatically notified of any
280             progress on the request by the system.
281              
282             =head2 Source Code
283              
284             The code is open to the world, and available for you to hack on. Please feel free to browse it and play
285             with it, or whatever. If you want to contribute patches, please send me a diff or prod me to pull
286             from your repository :)
287              
288             L<https://github.com/russoz/DataFlow-Proc-HTMLFilter>
289              
290             git clone https://github.com/russoz/DataFlow-Proc-HTMLFilter
291              
292             =head1 AUTHOR
293              
294             Alexei Znamensky <russoz@cpan.org>
295              
296             =head1 COPYRIGHT AND LICENSE
297              
298             This software is copyright (c) 2011 by Alexei Znamensky.
299              
300             This is free software; you can redistribute it and/or modify it under
301             the same terms as the Perl 5 programming language system itself.
302              
303             =head1 BUGS AND LIMITATIONS
304              
305             No bugs have been reported.
306              
307             Please report any bugs or feature requests through the web interface at
308             L<http://rt.cpan.org>.
309              
310             =head1 DISCLAIMER OF WARRANTY
311              
312             BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
313             FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT
314             WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER
315             PARTIES PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND,
316             EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
317             IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
318             PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
319             SOFTWARE IS WITH YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME
320             THE COST OF ALL NECESSARY SERVICING, REPAIR, OR CORRECTION.
321              
322             IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
323             WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
324             REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE LIABLE
325             TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, OR
326             CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
327             SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
328             RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
329             FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
330             SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
331             DAMAGES.
332              
333             =cut
334              
335              
336             __END__
337