File Coverage

blib/lib/WebSource/Extract.pm
Criterion Covered Total %
statement 4 6 66.6
branch n/a
condition n/a
subroutine 2 2 100.0
pod n/a
total 6 8 75.0


line stmt bran cond sub pod time code
1             package WebSource::Extract;
2 1     1   5014 use strict;
  1         3  
  1         44  
3 1     1   105 use WebSource::Parser;
  0            
  0            
4             use XML::LibXSLT;
5             use XML::LibXML::XPathContext;
6             use Carp;
7              
8             our @ISA = ('WebSource::Module');
9              
10             =head1 NAME
11              
12             WebSource::Extract - Extract parts of the input
13              
14             =head1 DESCRIPTION
15              
16             An B operator allows to extract sub parts of its input.
17             There exists different flavors of such an operator. The main one consists
18             in querying the input using an XPath expression.
19              
20             Such an operator is described by a DOM Node having the following form :
21              
22            
23             //an/xpath/expression
24            
25              
26             The operator queries any input with the expression found in the path sub-element
27             an returns the found results.
28              
29             To use a different flavor of the B operator (for example B) it is
30             necessary to add a C attribut to the C element. The parameters
31             (sub-elements of C) depend on the type of operator used.
32              
33             Each flavor of the B operator is implemented by a perl module
34             named WebSource::Extract::flavor (eg. WebSource::Extract::xslt). See the
35             corresponding man page for a full description.
36              
37             Current existing flavors include :
38              
39             =over 2
40              
41             =item xslt : apply an XSL stylesheet to the input
42              
43             =item form : extract form data
44              
45             =item regexep : extract data using a regular expression
46              
47             =back
48              
49             =head1 SYNOPSIS
50              
51             $exop = WebSource::Extract->new(wsdnode => $desc);
52              
53             =head1 METHODS
54              
55             See B<< WebSource::Module >>
56              
57             =cut
58              
59             sub _init_ {
60             my $self = shift;
61             $self->SUPER::_init_;
62             my $wsd = $self->{wsdnode};
63             if($wsd) {
64             $self->{xpath} = $wsd->findvalue('path');
65             $self->{format} = $wsd->getAttribute("format");
66             $self->{limit} = $wsd->getAttribute("limit");
67             }
68             $self->{xpath} or croak "No xpath given";
69             return $self;
70             }
71              
72             sub handle {
73             my $self = shift;
74             my $env = shift;
75            
76             $self->log(5,"Got document ",$env->{baseuri});
77             if(!($env->type eq "object/dom-node")) {
78             $self->log(1,"Oooops we haven't got an object/dom-node");
79             return ();
80             }
81             $self->log(6,"Extracting from :\n",$env->data->toString(1));
82             $self->log(5,"Extracting with ",$self->{xpath});
83            
84             my $xpc = XML::LibXML::XPathContext->new($env->data);
85             $xpc->registerNs('html','http://www.w3.org/1999/xhtml');
86            
87             my @nodes = $xpc->findnodes($self->{xpath});
88             if($self->{format} eq "string") {
89             @nodes = map { $_->textContent; } @nodes;
90             }
91             $self->log(5,"Extracted ",$#nodes + 1," nodes");
92             if(my $l = $self->{limit}) {
93             $self->log(5,"Limiting to first $l results");
94             $l--;
95             if($#nodes > $l) {
96             @nodes = @nodes[0..$l];
97             }
98             }
99             my %meta = %$env;
100             return map {
101             WebSource::Envelope->new (
102             %meta,
103             type => $self->{format} eq "string" ?
104             "text/string" : "object/dom-node",
105             data => $_,
106             )
107             } @nodes;
108             }
109              
110             =head1 SEE ALSO
111              
112             WebSource, WebSource::Extract::xslt, WebSource::Extract::form,
113             WebSource::Extract::regexp
114              
115             =cut
116              
117             1;