| line | stmt | bran | cond | sub | pod | time | code | 
package WebSource;

# CVS keyword string from which the revision number is extracted.
our $REVSTR = '$Revision: 1.13 $';

# Revision number: first capture of the match (undef when it does not match).
our ($REVISION) = $REVSTR =~ m/Revision: ([^ ]+)/;

# Version of the module.
our $VERSION = '2.4.5';
| 6 |  |  |  |  |  |  |  | 
| 7 | 2 |  |  | 2 |  | 46268 | use strict; | 
|  | 2 |  |  |  |  | 22 |  | 
|  | 2 |  |  |  |  | 72 |  | 
| 8 | 2 |  |  | 2 |  | 12 | use Carp; | 
|  | 2 |  |  |  |  | 2 |  | 
|  | 2 |  |  |  |  | 208 |  | 
| 9 |  |  |  |  |  |  |  | 
| 10 | 2 |  |  | 2 |  | 13130 | use LWP::UserAgent; | 
|  | 2 |  |  |  |  | 197491 |  | 
|  | 2 |  |  |  |  | 77 |  | 
| 11 | 2 |  |  | 2 |  | 5121 | use HTTP::Cookies; | 
|  | 2 |  |  |  |  | 18176 |  | 
|  | 2 |  |  |  |  | 68 |  | 
| 12 | 2 |  |  | 2 |  | 660 | use WebSource::Parser; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | use WebSource::Envelope; | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | use File::Spec; | 
| 16 |  |  |  |  |  |  |  | 
# XML namespace used to recognise WebSource description elements.
our $NameSpace = 'http://wwwsource.free.fr/ns/websource';

# Maps the local name of a description element to the perl class that
# implements it. Every class lives under the WebSource:: namespace.
our %ModClass = map { ( $_->[0] => 'WebSource::' . $_->[1] ) } (
    [ 'fetch'     => 'Fetcher'   ],
    [ 'extract'   => 'Extract'   ],
    [ 'filter'    => 'Filter'    ],
    [ 'query'     => 'Query'     ],
    [ 'format'    => 'Format'    ],
    [ 'xmlparser' => 'XMLParser' ],
    [ 'cache'     => 'Cache'     ],
    [ 'soap'      => 'Soap'      ],
    [ 'database'  => 'DB'        ],
    [ 'map'       => 'Map'       ],
    [ 'dummy'     => 'Module'    ],
    [ 'file'      => 'File'      ],
    [ 'xmlsender' => 'XMLSender' ],
    [ 'meta-tag'  => 'MetaTag'   ],
);
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  | =head1 NAME | 
| 37 |  |  |  |  |  |  |  | 
| 38 |  |  |  |  |  |  | WebSource - a general data wrapping tool particularly well suited for online data | 
| 39 |  |  |  |  |  |  | (but what data is not online in some way today ;) ) | 
| 40 |  |  |  |  |  |  |  | 
| 41 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | WebSource gives a general and normalized framework way to access | 
| 44 |  |  |  |  |  |  | data made available via the web. An access to subparts of the | 
| 45 |  |  |  |  |  |  | Web is made by defining a task. This task is built by composing | 
| 46 |  |  |  |  |  |  | query building, extraction, fetching and filtering subtasks. | 
| 47 |  |  |  |  |  |  |  | 
| 48 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 49 |  |  |  |  |  |  |  | 
| 50 |  |  |  |  |  |  | $source = WebSource->new(wsd => $description); | 
| 51 |  |  |  |  |  |  | @results = $source->query($query); | 
| 52 |  |  |  |  |  |  | or | 
| 53 |  |  |  |  |  |  | $result = $source->set_query($query); | 
| 54 |  |  |  |  |  |  | while($result = $source->next_result()) { | 
| 55 |  |  |  |  |  |  | ... | 
| 56 |  |  |  |  |  |  | } | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | =head1 ABSTRACT | 
| 59 |  |  |  |  |  |  |  | 
| 60 |  |  |  |  |  |  | WebSource originally was a generic wrapper around a Web Source. | 
| 61 |  |  |  |  |  |  | Given an XML description of a source, it allows querying the source | 
| 62 |  |  |  |  |  |  | and retrieving its results. The format of the query and the result | 
| 63 |  |  |  |  |  |  | remains source dependent, however. | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | It is now configurable enough to allow complex tasks on the web, such as | 
| 66 |  |  |  |  |  |  | fetching, extracting, filtering data on the Web. Each complex task is | 
| 67 |  |  |  |  |  |  | described by an XML task description file (WebSource description). This task | 
| 68 |  |  |  |  |  |  | is decomposed into simple subtasks of different flavors. | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | Existing subtask flavors are : | 
| 71 |  |  |  |  |  |  | - B | 
| 72 |  |  |  |  |  |  | I   an XML::LibXML::Document | 
| 73 |  |  |  |  |  |  | I | 
| 74 |  |  |  |  |  |  | Applies an XPath on the document and returns the set of nodes | 
| 75 |  |  |  |  |  |  | - B | 
| 76 |  |  |  |  |  |  | I   a URL (or XML::LibXML::Node containing a url) | 
| 77 |  |  |  |  |  |  | I | 
| 78 |  |  |  |  |  |  | - B | 
| 79 |  |  |  |  |  |  | I   an XML::Document | 
| 80 |  |  |  |  |  |  | I | 
| 81 |  |  |  |  |  |  | - B | 
| 82 |  |  |  |  |  |  | I   anything | 
| 83 |  |  |  |  |  |  | I | 
| 84 |  |  |  |  |  |  | - B | 
| 85 |  |  |  |  |  |  | This type of subtask uses an external perl module as a task. | 
| 86 |  |  |  |  |  |  | This allows to define highly configurable tasks. | 
| 87 |  |  |  |  |  |  | I   depends on external module | 
| 88 |  |  |  |  |  |  | I | 
| 89 |  |  |  |  |  |  | - B | 
| 90 |  |  |  |  |  |  | I   anything | 
| 91 |  |  |  |  |  |  | I | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | =head1 METHODS | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | =over 2 | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | =item B<< $source = WebSource->new(wsd => $wsd); >> | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | Create a new WebSource object working with the given WebSource description | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | The following named parameters can be given : | 
| 102 |  |  |  |  |  |  |  | 
| 103 |  |  |  |  |  |  | =over 2 | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | =item C<wsd> | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  | Use a generic engine with the given source description file | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | =item C<max_results> | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  | Do not output more than max_results | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | =back | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | =cut | 
| 116 |  |  |  |  |  |  |  | 
# Constructor. Takes named parameters; 'wsd' (the WebSource description to
# load) is mandatory. Defaults are filled in for any of 'useragent',
# 'cookies', 'maxreqinterval', 'maxtries' and 'parser' that is missing or
# false. The description is loaded (see _init) before the object is returned.
sub new {
    my ($class, %param) = @_;

    croak("No WebSource description given") unless $param{wsd};

    $param{useragent} ||= LWP::UserAgent->new(
        agent                 => "WebSource/1.0",
        keep_alive            => 1,
        timeout               => 30,
        requests_redirectable => ['GET', 'HEAD', 'POST'],
        env_proxy             => 1,
    );

    # The cookie jar is attached to the user agent even when the caller
    # supplied its own agent.
    $param{cookies} ||= HTTP::Cookies->new;
    $param{useragent}->cookie_jar($param{cookies});

    $param{maxreqinterval} ||= 3;
    $param{maxtries}       ||= 3;

    $param{parser} ||= XML::LibXML->new;
    $param{parser}->expand_xinclude(1);

    $param{result_count} = 0;

    # The parameter hash itself becomes the object.
    my $self = bless \%param, $class;
    $self->_init;
    return $self;
}
| 140 |  |  |  |  |  |  |  | 
# Load the description named by $self->{wsd}, keep the parsed document in
# $self->{wsddoc}, then resolve its imports.
sub _init {
    my ($self) = @_;
    $self->{wsddoc} = $self->load_wsd( $self->{wsd} );
    return $self->apply_imports;
}
| 148 |  |  |  |  |  |  |  | 
# Load and parse a WebSource description.
#
#   $wsd  - path or URL of the description
#   $base - optional location the path is relative to; when it names an
#           existing file, that file's directory is used as the base
#
# A location that is an existing file is parsed from disk; anything else is
# downloaded with the configured user agent. Returns the parsed document.
# Croaks when the download fails or no document is produced.
sub load_wsd {
    my ($self, $wsd, $base) = @_;
    my $parser = $self->{parser};
    my $doc;

    # A location carrying a URL scheme is never a path relative to $base;
    # running it through rel2abs would mangle it.
    my $is_url = $wsd =~ m{\A[A-Za-z][A-Za-z0-9+.\-]*://};

    if ($base && !$is_url) {
        if (-f $base) {
            # Keep the volume and directory of the base file, drop its name.
            my ($volume, $directory) = File::Spec->splitpath($base);
            $base = File::Spec->catpath($volume, $directory, '');
        }

        # An empty base (base file given without a directory part) means
        # "relative to the current working directory".
        $wsd = $base
            ? File::Spec->rel2abs($wsd, $base)
            : File::Spec->rel2abs($wsd);
    }

    $self->log(2, "Loading " . $wsd);
    if (-f $wsd) {
        $parser->base_uri("file://" . $wsd);
        $doc = $parser->parse_file($wsd);
        $parser->base_uri("");
    } else {
        my $resp = $self->{useragent}->get($wsd);
        $resp->is_success or croak "Couldn't download description $wsd";
        $parser->base_uri($wsd);
        $doc = $parser->parse_string($resp->content);
        $parser->base_uri("");
    }
    $doc or croak "Couldn't parse document $wsd";
    return $doc;
}
| 176 |  |  |  |  |  |  |  | 
# Build the modules described by the loaded description document and wire
# them together.
#
# Steps:
#   1. apply the options to the document (apply_options);
#   2. walk the element children of the root that are in the WebSource
#      namespace and instantiate one module per recognised element;
#   3. connect producers according to the "forward-to" attributes;
#   4. connect feedback listeners according to the "feedback-to" attributes;
#   5. remember the first and last created modules in $self->{first} and
#      $self->{last}.
#
# Croaks when a module class is invalid or cannot be loaded, when no module
# is defined, or when a feedback connection is not possible.
sub init {
    my $self = shift;

    $self->apply_options;

    my $doc  = $self->{wsddoc};
    my $root = $doc->documentElement;

    my $first;
    my $last;
    my %modules;
    my %forwards;
    my %feedbacks;

    #
    # Fetch all module descriptions and build the
    # corresponding module
    #
    foreach my $mnode ($root->childNodes) {
        $mnode->nodeType == 1 or next;                 # element nodes only
        $mnode->namespaceURI eq $NameSpace or next;

        my $type   = $mnode->localname;
        my %params = %$self;                           # modules inherit our settings
        my $name   = $mnode->getAttribute("name");

        $params{abortIfEmpty} = $mnode->hasAttribute("abort-if-empty")
            ? ($mnode->getAttribute("abort-if-empty") eq "yes")
            : 0;

        if ($type eq 'options' || $type eq 'include') {
            # do nothing, these are handled separately
        } elsif ($type eq 'init') {
            # Browse a page once, only to collect its cookies.
            my $uri  = $mnode->getAttribute("browse");
            my $resp = $self->{useragent}->get($uri);
            $self->{cookies}->extract_cookies($resp);
        } elsif ($ModClass{$type} || $type eq 'external') {
            $self->log(5,"Creating subtask of type ",$type);
            my $class;
            if ($type eq 'external') {
                $class = $mnode->getAttribute("module");
                $class or croak("No module declared for external");
            } else {
                my $subtype = $mnode->getAttribute("type");
                $class = $subtype
                    ? $ModClass{$type} . "::" . $subtype
                    : $ModClass{$type};
            }
            $self->log(5,"Using perl module ",$class);

            # The class name comes from the description file, which may have
            # been downloaded. Only a plain package name may reach the
            # string eval below.
            $class =~ m{\A [A-Za-z_]\w* (?: :: \w+ )* \z}x
                or croak("Couldn't load '$class' : not a valid perl module name");
            my $loaded = eval "require $class; 1";
            $loaded or croak("Couldn't load '$class' : $@");

            $modules{$name} = $class->new( %params,
                wsdnode => $mnode, name => $name);
            if ($mnode->hasAttribute("forward-to")) {
                $forwards{$name} = $mnode->getAttribute("forward-to");
            }
            if ($mnode->hasAttribute("feedback-to")) {
                $feedbacks{$name} = $mnode->getAttribute("feedback-to");
            }
            $first or $first = $name;
            $last = $name;
        } else {
            $self->log(1,"Module named '$name' is of an unknown type '$type'");
        }
    }

    if (!$first) {
        croak("No modules defined in description file");
    }

    #
    # Connect the modules to each other
    #
    foreach my $key (keys(%forwards)) {
        foreach my $other (split(/ /,$forwards{$key})) {
            # Targets that name no known module are skipped.
            if ($modules{$other}) {
                $self->log(5,"Setting $key as producer of $other");
                $modules{$key} or croak("No module named $key defined");
                $modules{$other}->producers($modules{$key});
            }
        }
    }

    #
    # Configure feedback sending
    #
    foreach my $key (keys(%feedbacks)) {
        foreach my $other (split(/ /,$feedbacks{$key})) {
            if ($modules{$other}) {
                $self->log(5,"Configuring $key to send feedback to $other");
                $modules{$key} or croak("No module named $key defined");
                $modules{$key}->isa('WebSource::Filter') or
                    croak($modules{$key}->{name} . " is not a filter");
                $modules{$other}->can("feedback") or
                    croak($modules{$other}->{name} . " doesn't have a feedback method");
                $modules{$key}->listeners($modules{$other});
            }
        }
    }

    #
    # Setup first and last
    #
    $self->{first} = $modules{$first};
    $self->{last}  = $modules{$last};
    $self->log(5,"Initial module is $first");
    $self->log(5,"Final module is $last");
}
| 292 |  |  |  |  |  |  |  | 
| 293 |  |  |  |  |  |  |  | 
# Forward a message to the configured logger, if any.
#
#   $level   - level passed through to the logger
#   @message - message parts; they are prefixed with "[WebSource] "
#
# Nothing is logged when $self->{logger} is not set.
sub log {
    my ($self, $level, @message) = @_;
    my $logger = $self->{logger};
    $logger->log($level, "[WebSource] ", @message) if $logger;
}
| 301 |  |  |  |  |  |  |  | 
| 302 |  |  |  |  |  |  | =item B<< $source->push($item); >> | 
| 303 |  |  |  |  |  |  |  | 
| 304 |  |  |  |  |  |  | Pass the initial data to the first subtask | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  | =cut | 
| 307 |  |  |  |  |  |  |  | 
# (Re)build the modules, wrap every given item in a "text/string" envelope
# and hand the envelopes to the first module.
sub push {
    my ($self, @items) = @_;
    $self->init;
    my @envelopes =
        map { WebSource::Envelope->new(type => "text/string", data => $_) } @items;
    return $self->{first}->push(@envelopes);
}
| 313 |  |  |  |  |  |  |  | 
| 314 |  |  |  |  |  |  | =item B<< $source->query($query); >> | 
| 315 |  |  |  |  |  |  |  | 
| 316 |  |  |  |  |  |  | Build a query %hash for the given parameters and push it in | 
| 317 |  |  |  |  |  |  |  | 
| 318 |  |  |  |  |  |  | =cut | 
| 319 |  |  |  |  |  |  |  | 
# (Re)build the modules, then push a single query envelope to the first one.
#
# Arguments are key/value pairs used to build the envelope. The 'type' key
# is always overwritten: "text/string" when 'data' holds a non-empty value,
# "empty" otherwise. Returns whatever the first module's push returns.
sub query {
    my ($self, %query) = @_;
    $self->init;

    # Test for presence rather than truth so that the query string "0" is
    # not mistaken for an absent query.
    my $data = $query{data};
    $query{type} = (defined $data && length $data) ? "text/string" : "empty";

    my $env = WebSource::Envelope->new(%query);
    return $self->{first}->push($env);
}
| 332 |  |  |  |  |  |  |  | 
| 333 |  |  |  |  |  |  | =item B<< $source->set_max_results($count); >> | 
| 334 |  |  |  |  |  |  |  | 
| 335 |  |  |  |  |  |  | Set the maximum number of results to output to $count | 
| 336 |  |  |  |  |  |  |  | 
| 337 |  |  |  |  |  |  | =cut | 
| 338 |  |  |  |  |  |  |  | 
# Store $count as the maximum number of results; the stored value is
# returned.
sub set_max_results {
    my $self  = shift;
    my $count = shift;
    return $self->{max_results} = $count;
}
| 343 |  |  |  |  |  |  |  | 
| 344 |  |  |  |  |  |  | =item B<< $source->next_result(); >> | 
| 345 |  |  |  |  |  |  |  | 
| 346 |  |  |  |  |  |  | Returns the following result for the task | 
| 347 |  |  |  |  |  |  |  | 
| 348 |  |  |  |  |  |  | =cut | 
| 349 |  |  |  |  |  |  |  | 
# Return the next result produced by the last module.
#
# Returns undef, without asking the last module, once max_results results
# have been handed out (when max_results is set to a true value). Every true
# result increments $self->{result_count}.
sub next_result {
    my $self = shift;

    # Compare against result_count: it is the counter this method maintains.
    my $max = $self->{max_results};
    if ($max && $max <= ($self->{result_count} || 0)) {
        return undef;
    }

    my $res = $self->{last}->produce;
    $self->{result_count} += 1 if $res;
    return $res;
}
| 359 |  |  |  |  |  |  |  | 
| 360 |  |  |  |  |  |  | =back | 
| 361 |  |  |  |  |  |  |  | 
| 362 |  |  |  |  |  |  | =item B<< $source->parameters; >> | 
| 363 |  |  |  |  |  |  |  | 
| 364 |  |  |  |  |  |  | Returns a hash of the initial task's parameters | 
| 365 |  |  |  |  |  |  |  | 
| 366 |  |  |  |  |  |  | =cut | 
| 367 |  |  |  |  |  |  |  | 
# Delegate to the first module and return its parameters.
sub parameters {
    my ($self) = @_;
    my $first = $self->{first};
    return $first->parameters;
}
| 372 |  |  |  |  |  |  |  | 
| 373 |  |  |  |  |  |  | =item B<< $source->option_spec; >> | 
| 374 |  |  |  |  |  |  |  | 
| 375 |  |  |  |  |  |  | Returns the spec of the options translated for Getopt::Mixed | 
| 376 |  |  |  |  |  |  |  | 
| 377 |  |  |  |  |  |  | =cut | 
| 378 |  |  |  |  |  |  |  | 
# Build one option spec string per child element of /ws:source/ws:options.
#
# A spec is the option name, followed by "=s", "=i" or "=f" for the types
# "string", "integer" and "float", followed by " SHORTCUT>NAME" when the
# element has a shortcut attribute. Elements without a usable name are
# reported through the logger and skipped. Returns the list of specs.
sub option_spec {
    my ($self) = @_;

    my $xpc = XML::LibXML::XPathContext->new( $self->{wsddoc} );
    $xpc->registerNs('ws',$NameSpace);

    # Spec suffix for each known option type.
    my %suffix_for = (
        string  => "=s",
        integer => "=i",
        float   => "=f",
    );

    my @spec;
    for my $onode ($xpc->findnodes('/ws:source/ws:options/*')) {
        my $name;
        if ($onode->nodeName() eq "option") {
            # Old form: the option name is carried by the name attribute.
            warn("Using option element under ws:options is deprecated. Directly use the options name as element name.");
            $name = $onode->getAttribute("name");
        }
        else {
            $name = $onode->nodeName();
        }
        my $shortcut = $onode->getAttribute("shortcut");
        my $type     = $onode->getAttribute("type");

        unless ($name) {
            $self->log(1,"unamed option detected.");
            next;
        }

        my $str = $name;
        $str .= $suffix_for{$type} if defined $type && exists $suffix_for{$type};
        $str .= " " . $shortcut . ">" . $name if $shortcut;

        CORE::push(@spec, $str);
        $self->log(3,"generated option spec '$str'\n");
    }
    return @spec;
}
| 416 |  |  |  |  |  |  |  | 
| 417 |  |  |  |  |  |  | =item B<< $source->set_option($opt,$val) >> | 
| 418 |  |  |  |  |  |  |  | 
| 419 |  |  |  |  |  |  | Sets source specific option $opt to value $val | 
| 420 |  |  |  |  |  |  |  | 
| 421 |  |  |  |  |  |  | =cut | 
| 422 |  |  |  |  |  |  |  | 
# Set option $opt to $val in the first ws:options element of the document.
#
# When a child element named $opt exists, its first child receives the new
# data (or text is appended when it has no children); otherwise a new
# element is created. Croaks when the document has no ws:options element.
sub set_option {
    my ($self, $opt, $val) = @_;
    $self->log(2,"Setting option $opt to value $val");

    my $xpc = XML::LibXML::XPathContext->new($self->{wsddoc});
    $xpc->registerNs('ws',$NameSpace);

    my @option_sets = $xpc->findnodes("//ws:options");
    @option_sets
        or croak("Setting option while ws:options node is absent");
    my $options = $option_sets[0];

    my @existing = $options->getChildrenByTagName($opt);
    if (!@existing) {
        # Option not present yet: create it.
        my $created = $self->{wsddoc}->createElement($opt);
        $created->appendText($val);
        $options->appendChild($created);
    }
    elsif ($existing[0]->hasChildNodes()) {
        $existing[0]->firstChild()->setData($val);
    }
    else {
        $existing[0]->appendText($val);
    }
}
| 446 |  |  |  |  |  |  |  | 
| 447 |  |  |  |  |  |  |  | 
| 448 |  |  |  |  |  |  | =item B<< $source->apply_imports >> | 
| 449 |  |  |  |  |  |  |  | 
| 450 |  |  |  |  |  |  | Handles nodes of type C<ws:import> by inserting nodes from the wsd file referenced by href | 
| 451 |  |  |  |  |  |  | (imported document) into the current wsd document (target document). | 
| 452 |  |  |  |  |  |  | A node is inserted from the imported document into the target document only if a node with the same | 
| 453 |  |  |  |  |  |  | name does not exist in the target document. | 
| 454 |  |  |  |  |  |  |  | 
| 455 |  |  |  |  |  |  | =cut | 
| 456 |  |  |  |  |  |  |  | 
# Resolve the ws:import elements of the description document.
#
# For each import, the description referenced by its href attribute is
# loaded (relative to $self->{wsd}) and the element children of its root are
# inserted before the import element:
#   - an "options" element only when the target has no ws:options yet;
#   - any other element only when no element of the target carries the same
#     name attribute.
# The import element is removed afterwards. Croaks when an import has no
# href attribute.
sub apply_imports {
    my ($self) = @_;
    my $doc = $self->{wsddoc};
    my $xpc = XML::LibXML::XPathContext->new($doc);
    $xpc->registerNs('ws',$NameSpace);

    foreach my $im_node ($xpc->findnodes("//ws:import")) {
        my $im_par = $im_node->parentNode;
        my $im_wsd = $im_node->getAttribute("href");
        defined $im_wsd && length $im_wsd
            or croak("ws:import element without href attribute");
        $self->log(2,"Processing import of ".$im_wsd);
        my $im_doc = $self->load_wsd($im_wsd,$self->{wsd});

        foreach my $el ($im_doc->documentElement->childNodes) {
            $el->nodeType == 1 or next;    # element nodes only

            my $already_defined;
            if ($el->localName eq 'options') {
                # Import options only when they are not locally redefined.
                $already_defined = $xpc->exists('//ws:options');
            } else {
                # Compare names in perl rather than splicing the name into
                # an XPath string, which breaks on names containing quotes.
                my $name = $el->getAttribute("name");
                $name = "" unless defined $name;
                $already_defined =
                    grep { $_->getAttribute("name") eq $name }
                    $xpc->findnodes('//*[@name]');
            }
            $already_defined or $im_par->insertBefore($el,$im_node);
        }
        $im_par->removeChild($im_node);
    }
    return;
}
| 489 |  |  |  |  |  |  |  | 
| 490 |  |  |  |  |  |  | =item B<< $source->apply_options >> | 
| 491 |  |  |  |  |  |  |  | 
| 492 |  |  |  |  |  |  | Handles nodes of type C<ws:set-attribute> by adding | 
| 493 |  |  |  |  |  |  | an attribute named aname with the value of the option named oname | 
| 494 |  |  |  |  |  |  | to the parent node. The ws:set-attribute node is then removed. | 
| 495 |  |  |  |  |  |  |  | 
| 496 |  |  |  |  |  |  | =cut | 
| 497 |  |  |  |  |  |  |  | 
# Resolve the ws:set-attribute elements of the description document.
#
# For each such element, the expression in its value-of attribute is
# evaluated against the first ws:options element. A non-empty result is
# stored on the parent element under the attribute named by the name
# attribute. The ws:set-attribute element is then removed. Elements with an
# empty value-of are reported through the logger and left in place.
# Croaks when a value has to be looked up but the document has no
# ws:options element.
sub apply_options {
    my ($self) = @_;
    my $doc = $self->{wsddoc};
    my $xpc = XML::LibXML::XPathContext->new($doc);
    $xpc->registerNs('ws',$NameSpace);

    my @optnode = $xpc->findnodes("//ws:options");
    foreach my $sa ($xpc->findnodes("//ws:set-attribute")) {
        my $p     = $sa->parentNode;
        my $aname = $sa->getAttribute("name");
        my $oexpr = $sa->getAttribute("value-of");

        if (!defined $oexpr || $oexpr eq "") {
            $self->log(1,"Warning : Empty value-of attribute on ws:set-attribute");
            next;
        }

        @optnode
            or croak("Using ws:set-attribute while ws:options node is absent");

        # Test for an empty string, not for truth: "0" is a valid value.
        my $oval = $optnode[0]->findvalue($oexpr);
        if (defined $oval && $oval ne "") {
            $p->setAttribute($aname,$oval);
        } else {
            $self->log(1,"Warning : Expr '$oexpr' has no value");
        }
        $p->removeChild($sa);
    }

    # Serialising the whole document is only worth it when someone listens.
    $self->log(6,"After applying options...\n", $doc->toString(1))
        if $self->{logger};
}
| 523 |  |  |  |  |  |  |  | 
| 524 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 525 |  |  |  |  |  |  |  | 
| 526 |  |  |  |  |  |  | ws-query, WebSource::Extract, WebSource::Fetch, WebSource::Filter, etc. | 
| 527 |  |  |  |  |  |  |  | 
| 528 |  |  |  |  |  |  | =cut | 
| 529 |  |  |  |  |  |  |  | 
| 530 |  |  |  |  |  |  | 1; |