| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WebSource::XMLParser; |
|
2
|
1
|
|
|
1
|
|
5350
|
use utf8; |
|
|
1
|
|
|
|
|
13
|
|
|
|
1
|
|
|
|
|
6
|
|
|
3
|
1
|
|
|
1
|
|
39
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
34
|
|
|
4
|
1
|
|
|
1
|
|
6
|
use LWP::UserAgent; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
22
|
|
|
5
|
|
|
|
|
|
|
# use WebSource::Parser; |
|
6
|
1
|
|
|
1
|
|
58
|
use WebSource::Module; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use Carp; |
|
8
|
|
|
|
|
|
|
use Encode; |
|
9
|
|
|
|
|
|
|
use Encode::Guess; |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our @ISA = ('WebSource::Module'); |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# Options passed by reference to the parser's parse_html_string call in handle().
# NOTE(review): in XML::LibXML's parser options, recover => 2 presumably means
# "recover from malformed input without emitting warnings" -- confirm against
# the installed XML::LibXML version.
my %html_options = ( |
|
14
|
|
|
|
|
|
|
recover => 2 |
|
15
|
|
|
|
|
|
|
); |
|
16
|
|
|
|
|
|
|
# Options passed by reference as the second argument of parse_string in handle().
# NOTE(review): whether parse_string honours an options hash reference in that
# position is not visible from this file -- verify.
my %xml_options = ( |
|
17
|
|
|
|
|
|
|
recover => 2 |
|
18
|
|
|
|
|
|
|
); |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 NAME |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
WebSource::XMLParser : Builds a document out of an http-response |
|
23
|
|
|
|
|
|
|
containing an XML or HTML file |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
An XMLParser operator is declared as follows : |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
$parser = WebSource::XMLParser->new(wsnode => $node); |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# for the rest it works as a WebSource::Module |
|
37
|
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=head1 METHODS |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=over 2 |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=item B<< $parser = WebSource::XMLParser->new(desc => $node); >> |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Create a new XMLParser. |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=cut |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Finish construction of the parser module.
#
# Calls the parent class's _init_, makes sure $self->{parser} holds a parser
# object (a fresh XML::LibXML instance is created when none is set), and, when
# a description node is present in $self->{wsdnode}, stores its
# 'force-encoding' attribute in $self->{forceEncoding}.
#
# Returns $self.
sub _init_ {
    my ($self) = @_;

    $self->SUPER::_init_;

    # Keep an already configured parser; otherwise fall back to a default one.
    $self->{parser} ||= XML::LibXML->new;

    if (my $description = $self->{wsdnode}) {
        $self->{forceEncoding} = $description->getAttribute('force-encoding');
    }

    return $self;
}
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=item B<< $parser->handle($env); >> |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
Parses the content of an http-response |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=cut |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Parse the payload of an incoming envelope into a DOM document.
#
# Arguments:
#   $env - envelope object; this method calls $env->type and $env->data and
#          reads $env->{baseuri} and $env->{encoding}.
#
# Returns:
#   A new WebSource::Envelope holding the parsed document as its data, typed
#   "object/dom-node", with the remaining fields copied from $env; or an
#   empty list when the content could not be parsed.
#
# Parse errors are caught and logged rather than propagated.
sub handle {
    my $self = shift;
    my $env  = shift;

    # Content-type filtering is intentionally disabled:
    #   $env->type eq "text/html" or
    #     ( $env->type eq "text/xml" or
    #       return () );

    my $ct   = $env->data;
    my $base = $env->{baseuri};

    my $doc = eval {
        $self->log(2,"Found doctype of '". $env->type . "' with encoding '" . $env->{encoding} ."'");
        $self->log(5,"-------- data -------------\n" . $ct);

        # Only an explicitly configured encoding is applied; decoding with
        # the envelope's own encoding is deliberately left disabled:
        #   elsif($env->{encoding}) {
        #     $ct = decode($env->{encoding},$ct);
        #   }
        if ($self->{forceEncoding}) {
            $ct = decode($self->{forceEncoding}, $ct);
        }

        if ($env->type eq "text/html") {
            $self->{parser}->parse_html_string($ct, \%html_options);
        }
        else {
            # NOTE(review): the second argument is kept as in the original
            # call; confirm that parse_string accepts an options hash
            # reference in this position.
            $self->{parser}->parse_string($ct, \%xml_options);
        }
    };
    # Save the error right away: $@ is global and may be clobbered later.
    my $error = $@;

    # Check for failure BEFORE touching $doc. Calling a method on the
    # undefined result of a failed parse would die and skip the logging
    # and the empty-list return below.
    if (!$doc) {
        $self->log(1,"Couldn't parse document $base : $error");
        $self->log(3,">> here is the content <<\n",$ct,"\n");
        return ();
    }

    $doc->setEncoding('utf-8');

    my $bytes = $doc->toString(1,'utf-8');
    $self->log(6,"-------- parsed -------------\n" . $bytes);

    # Copy the incoming envelope's fields, then override type/baseuri/data.
    my %meta = %$env;
    return WebSource::Envelope->new(
        %meta,
        type    => "object/dom-node",
        baseuri => $base,
        data    => $doc);
}
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=back |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=head1 SEE ALSO |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
WebSource::Module |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=cut |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
1; |