line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WebSource::XMLParser; |
2
|
1
|
|
|
1
|
|
5350
|
use utf8; |
|
1
|
|
|
|
|
13
|
|
|
1
|
|
|
|
|
6
|
|
3
|
1
|
|
|
1
|
|
39
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
34
|
|
4
|
1
|
|
|
1
|
|
6
|
use LWP::UserAgent; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
22
|
|
5
|
|
|
|
|
|
|
# use WebSource::Parser; |
6
|
1
|
|
|
1
|
|
58
|
use WebSource::Module; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
use Carp; |
8
|
|
|
|
|
|
|
use Encode; |
9
|
|
|
|
|
|
|
use Encode::Guess; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# XMLParser is a WebSource::Module; generic operator behaviour is inherited.
our @ISA = qw(WebSource::Module);

# Per-call options used when the input is HTML.
# NOTE(review): recover => 2 presumably selects the parser's silent
# recovery mode -- confirm against the XML::LibXML::Parser documentation.
my %html_options = ( recover => 2 );

# Per-call options used when the input is anything other than HTML.
my %xml_options = ( recover => 2 );
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 NAME |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
WebSource::XMLParser : Builds a document out of an http-response |
23
|
|
|
|
|
|
|
containing an XML or HTML file |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 DESCRIPTION |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
An XMLParser operator is declared as follows : |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 SYNOPSIS |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
$parser = WebSource::XMLParser->new(wsnode => $node); |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# for the rest it works as a WebSource::Module |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=head1 METHODS |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=over 2 |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=item B<< $parser = WebSource::XMLParser->new(desc => $node); >> |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
Create a new XMLParser. |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=cut |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Initialise the operator instance.
#
# Runs the parent class initialisation, makes sure $self->{parser} holds a
# parser object (an already present one is kept, otherwise a fresh
# XML::LibXML instance is created) and, when a description node is stored
# in $self->{wsdnode}, copies its 'force-encoding' attribute into
# $self->{forceEncoding}.
#
# Returns the instance itself.
sub _init_ {
    my ($self) = @_;

    $self->SUPER::_init_;

    # Keep a parser supplied by the caller; only build one when missing.
    $self->{parser} ||= XML::LibXML->new;

    my $description = $self->{wsdnode};
    $self->{forceEncoding} = $description->getAttribute('force-encoding')
        if $description;

    return $self;
}
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=item B<< $parser->handle($env); >> |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
Parses the content of an http-response |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=cut |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Parse the content carried by an envelope into a DOM document.
#
# Arguments:
#   $env - the incoming envelope. This method calls its data() and type()
#          methods and reads its {baseuri} and {encoding} fields.
#
# Behaviour:
#   * When $self->{forceEncoding} is set, the content is first decoded
#     with Encode::decode using that encoding.
#   * Content of type "text/html" goes through parse_html_string, anything
#     else through parse_string.
#
# Returns a new WebSource::Envelope built from all fields of $env with
# type "object/dom-node", the original base URI and the parsed document as
# data. Returns the empty list when the content cannot be parsed; the
# failure is logged rather than propagated.
sub handle {
    my $self = shift;
    my $env  = shift;

    # $env->type eq "text/html" or
    # ( $env->type eq "text/xml" or
    #   return () );

    my $ct   = $env->data;
    my $base = $env->{baseuri};

    my $doc = eval {
        # Avoid an "uninitialized" warning when no encoding was recorded;
        # the logged text is the same as before.
        my $encoding = defined $env->{encoding} ? $env->{encoding} : '';
        $self->log(2,"Found doctype of '". $env->type . "' with encoding '" . $encoding ."'");
        $self->log(5,"-------- data -------------\n" . $ct);

        if ($self->{forceEncoding}) {
            $ct = decode($self->{forceEncoding}, $ct);
        }
        # elsif($env->{encoding}) {
        #   $ct = decode($env->{encoding},$ct);
        # }

        if ($env->type eq "text/html") {
            $self->{parser}->parse_html_string($ct, \%html_options);
        }
        else {
            # NOTE(review): XML::LibXML documents the argument after the
            # string for parse_string differently from parse_html_string,
            # so this options hash is probably not applied here -- confirm
            # and consider load_xml(string => $ct, %xml_options).
            $self->{parser}->parse_string($ct, \%xml_options);
        }
    };
    # Capture the error right away: $@ is global and easily clobbered.
    my $error = $@;

    # The failure check has to come before any method call on $doc;
    # calling setEncoding on an undefined value would die and skip the
    # logging and the documented empty-list return.
    if (!$doc) {
        $self->log(1,"Couldn't parse document $base : $error");
        $self->log(3,">> here is the content <<\n",$ct,"\n");
        return ();
    }

    $doc->setEncoding('utf-8');

    my $bytes = $doc->toString(1,'utf-8');
    $self->log(6,"-------- parsed -------------\n" . $bytes);

    # Copy every field of the incoming envelope; the explicit keys below
    # come later in the list and therefore override the copied ones.
    my %meta = %$env;
    return WebSource::Envelope->new(
        %meta,
        type    => "object/dom-node",
        baseuri => $base,
        data    => $doc);
}
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=back |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
=head1 SEE ALSO |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
WebSource::Module |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=cut |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
1; |