line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Web::Microformats2::Parser; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
968
|
use Moose; |
|
2
|
|
|
|
|
863897
|
|
|
2
|
|
|
|
|
13
|
|
4
|
2
|
|
|
2
|
|
15114
|
use MooseX::Types::URI qw(Uri); |
|
2
|
|
|
|
|
364656
|
|
|
2
|
|
|
|
|
12
|
|
5
|
2
|
|
|
2
|
|
4937
|
use HTML::TreeBuilder::XPath; |
|
2
|
|
|
|
|
122732
|
|
|
2
|
|
|
|
|
19
|
|
6
|
2
|
|
|
2
|
|
89
|
use HTML::Entities; |
|
2
|
|
|
|
|
6
|
|
|
2
|
|
|
|
|
140
|
|
7
|
2
|
|
|
2
|
|
30
|
use v5.10; |
|
2
|
|
|
|
|
8
|
|
8
|
2
|
|
|
2
|
|
27
|
use Scalar::Util; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
75
|
|
9
|
2
|
|
|
2
|
|
787
|
use JSON; |
|
2
|
|
|
|
|
9194
|
|
|
2
|
|
|
|
|
17
|
|
10
|
2
|
|
|
2
|
|
1353
|
use DateTime::Format::ISO8601; |
|
2
|
|
|
|
|
1100668
|
|
|
2
|
|
|
|
|
135
|
|
11
|
2
|
|
|
2
|
|
24
|
use Carp; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
168
|
|
12
|
|
|
|
|
|
|
|
13
|
2
|
|
|
2
|
|
1189
|
use Web::Microformats2::Item; |
|
2
|
|
|
|
|
10
|
|
|
2
|
|
|
|
|
114
|
|
14
|
2
|
|
|
2
|
|
1195
|
use Web::Microformats2::Document; |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
89
|
|
15
|
|
|
|
|
|
|
|
16
|
2
|
|
|
2
|
|
1196
|
use Readonly; |
|
2
|
|
|
|
|
21080
|
|
|
2
|
|
|
|
|
6692
|
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
has 'url_context' => ( |
19
|
|
|
|
|
|
|
is => 'rw', |
20
|
|
|
|
|
|
|
isa => Uri, |
21
|
|
|
|
|
|
|
coerce => 1, |
22
|
|
|
|
|
|
|
lazy => 1, |
23
|
|
|
|
|
|
|
clearer => '_clear_url_context', |
24
|
|
|
|
|
|
|
default => sub { URI->new( 'http://example.com/' ) }, |
25
|
|
|
|
|
|
|
); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
sub parse { |
28
|
74
|
|
|
74
|
1
|
275532
|
my $self = shift; |
29
|
74
|
|
|
|
|
216
|
my ( $html, %args ) = @_; |
30
|
|
|
|
|
|
|
|
31
|
74
|
|
|
|
|
251
|
$self->_clear; |
32
|
74
|
50
|
|
|
|
215
|
if ( $args{ url_context } ) { |
33
|
0
|
|
|
|
|
0
|
$self->url_context( $args{url_context} ); |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
74
|
|
|
|
|
568
|
my $tree = HTML::TreeBuilder::XPath->new; |
37
|
74
|
|
|
|
|
16755
|
$tree->ignore_unknown( 0 ); |
38
|
74
|
|
|
|
|
863
|
$tree->no_space_compacting( 1 ); |
39
|
74
|
|
|
|
|
715
|
$tree->ignore_ignorable_whitespace( 0 ); |
40
|
74
|
|
|
|
|
609
|
$tree->no_expand_entities( 1 ); |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# Adding HTML5 elements because it's 2018. |
43
|
74
|
|
|
|
|
624
|
foreach (qw(article aside details figcaption figure footer header main mark nav section summary time)) { |
44
|
962
|
|
|
|
|
1426
|
$HTML::TreeBuilder::isBodyElement{$_}=1; |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
|
47
|
74
|
|
|
|
|
866
|
$tree->parse( $html ); |
48
|
|
|
|
|
|
|
|
49
|
74
|
100
|
|
|
|
159748
|
if ( my $base_url = $tree->findvalue( './/base/@href' ) ) { |
50
|
6
|
|
|
|
|
6364
|
$self->url_context( $base_url ); |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
|
53
|
74
|
|
|
|
|
78081
|
my $document = Web::Microformats2::Document->new; |
54
|
74
|
|
|
|
|
34348
|
$self->analyze_element( $document, $tree ); |
55
|
74
|
|
|
|
|
1990
|
return $document; |
56
|
|
|
|
|
|
|
} |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
# analyze_element: Recursive method that scans an element for new microformat |
59
|
|
|
|
|
|
|
# definitions (h-*) or properties (u|dt|e|p-*) and then does the right thing. |
60
|
|
|
|
|
|
|
# It also builds up the MF2 document's rels and rel-urls as it goes. |
61
|
|
|
|
|
|
|
sub analyze_element { |
62
|
1913
|
|
|
1913
|
0
|
2264
|
my $self = shift; |
63
|
1913
|
|
|
|
|
2838
|
my ( $document, $element, $current_item ) = @_; |
64
|
|
|
|
|
|
|
|
65
|
1913
|
100
|
66
|
|
|
7378
|
return unless blessed( $element) && $element->isa( 'HTML::Element' ); |
66
|
|
|
|
|
|
|
|
67
|
898
|
|
|
|
|
2249
|
$self->_add_element_rels_to_mf2_document( $element, $document ); |
68
|
|
|
|
|
|
|
|
69
|
898
|
|
|
|
|
5882
|
my $mf2_attrs = $self->_tease_out_mf2_attrs( $element ); |
70
|
|
|
|
|
|
|
|
71
|
898
|
|
|
|
|
1419
|
my $h_attrs = delete $mf2_attrs->{h}; |
72
|
898
|
|
|
|
|
1088
|
my $new_item; |
73
|
898
|
100
|
|
|
|
1538
|
if ( $h_attrs->[0] ) { |
74
|
136
|
|
|
|
|
796
|
$new_item = Web::Microformats2::Item->new( { |
75
|
|
|
|
|
|
|
types => $h_attrs, |
76
|
|
|
|
|
|
|
parent => $current_item, |
77
|
|
|
|
|
|
|
} ); |
78
|
136
|
|
|
|
|
97261
|
$document->add_item( $new_item ); |
79
|
136
|
100
|
|
|
|
281
|
unless ( $current_item ) { |
80
|
94
|
|
|
|
|
3246
|
$document->add_top_level_item( $new_item ); |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
898
|
|
|
|
|
2691
|
while (my ($mf2_type, $properties_ref ) = each( %$mf2_attrs ) ) { |
85
|
3592
|
100
|
|
|
|
6956
|
next unless $current_item; |
86
|
1924
|
100
|
|
|
|
1980
|
next unless @{ $properties_ref }; |
|
1924
|
|
|
|
|
4679
|
|
87
|
303
|
100
|
|
|
|
807
|
if ( $mf2_type eq 'p' ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# p-property: |
89
|
|
|
|
|
|
|
# A catch-all generic property to store on the current |
90
|
|
|
|
|
|
|
# MF2 item being defined. |
91
|
|
|
|
|
|
|
# (If this same element begins an h-* microformat, we don't parse |
92
|
|
|
|
|
|
|
# this p-* any further; instead we'll store the new item under |
93
|
|
|
|
|
|
|
# this property name.) |
94
|
190
|
100
|
|
|
|
340
|
unless ( $new_item ) { |
95
|
158
|
|
|
|
|
252
|
for my $property ( @$properties_ref ) { |
96
|
163
|
|
|
|
|
346
|
my $value = $self->_parse_property_value( $element ); |
97
|
163
|
100
|
|
|
|
301
|
if ( defined $value ) { |
98
|
161
|
|
|
|
|
592
|
$current_item->add_property( |
99
|
|
|
|
|
|
|
"p-$property", |
100
|
|
|
|
|
|
|
$value, |
101
|
|
|
|
|
|
|
); |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
elsif ( $mf2_type eq 'u' ) { |
107
|
|
|
|
|
|
|
# u-property: |
108
|
|
|
|
|
|
|
# Look for a URL in child attributes, and store it as a property. |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# (But not if a new h-format has been defined, in which case we'll |
111
|
|
|
|
|
|
|
# just use the u-property's name to store it. Why would you do that |
112
|
|
|
|
|
|
|
# instead of using a p-property? I don't know, but the tests demand |
113
|
|
|
|
|
|
|
# it.) |
114
|
44
|
100
|
|
|
|
114
|
unless ( $new_item ) { |
115
|
42
|
|
|
|
|
83
|
for my $property ( @$properties_ref ) { |
116
|
43
|
|
|
|
|
102
|
my $vcp_fragments_ref = |
117
|
|
|
|
|
|
|
$self->_seek_value_class_pattern( $element ); |
118
|
43
|
100
|
|
|
|
100
|
if ( my $url = $self->_tease_out_url( $element ) ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
119
|
38
|
|
|
|
|
162
|
$current_item->add_property( "u-$property", $url ); |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
elsif ( @$vcp_fragments_ref ) { |
122
|
2
|
|
|
|
|
10
|
$current_item->add_property( |
123
|
|
|
|
|
|
|
"u-$property", |
124
|
|
|
|
|
|
|
join q{}, @$vcp_fragments_ref, |
125
|
|
|
|
|
|
|
) |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
elsif ( $url = $self->_tease_out_unlikely_url($element)) { |
128
|
2
|
|
|
|
|
8
|
$current_item->add_property( "u-$property", $url ); |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
else { |
131
|
1
|
|
|
|
|
6
|
$current_item->add_property( |
132
|
|
|
|
|
|
|
"u-$property", |
133
|
|
|
|
|
|
|
_trim( $element->as_text ), |
134
|
|
|
|
|
|
|
); |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
elsif ( $mf2_type eq 'e' ) { |
140
|
|
|
|
|
|
|
# e-property: |
141
|
|
|
|
|
|
|
# Create a struct with keys "html" and "value", and then |
142
|
|
|
|
|
|
|
# store this in a new property. |
143
|
18
|
|
|
|
|
34
|
for my $property ( @$properties_ref ) { |
144
|
18
|
|
|
|
|
26
|
my %e_data; |
145
|
18
|
|
|
|
|
51
|
for my $content_piece ( $element->content_list ) { |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
# Make sure all URLs found in certain HTML attrs are |
148
|
|
|
|
|
|
|
# absolute. |
149
|
42
|
100
|
|
|
|
4331
|
if ( ref $content_piece ) { |
150
|
|
|
|
|
|
|
# XXX This is probably a bit too loose about what tags |
151
|
|
|
|
|
|
|
# these attrs can appear on. |
152
|
14
|
|
|
|
|
52
|
for my $href_element ( $content_piece, $content_piece->findnodes('.//*[@href|@src]') ) { |
153
|
22
|
|
|
|
|
19024
|
foreach ( qw( href src ) ) { |
154
|
44
|
|
|
|
|
180
|
my $url = $href_element->attr($_); |
155
|
44
|
100
|
|
|
|
419
|
if ( $url ) { |
156
|
9
|
|
|
|
|
289
|
my $abs_url = URI->new_abs( $url, $self->url_context)->as_string; |
157
|
9
|
|
|
|
|
1392
|
$href_element->attr( $_=> $abs_url ); |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
} |
161
|
14
|
|
|
|
|
71
|
$e_data{html} .= $content_piece->as_HTML( '<>&', undef, {} ); |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
else { |
165
|
|
|
|
|
|
|
|
166
|
28
|
|
|
|
|
72
|
$e_data{html} .= $content_piece; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
} |
169
|
18
|
|
|
|
|
379
|
$e_data{ value } = _trim (decode_entities( $element->as_text) ); |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# The official tests specifically trim space-glyphs per se; |
172
|
|
|
|
|
|
|
# all other trailing whitespace stays. Shrug. |
173
|
18
|
|
|
|
|
144
|
$e_data{ html } =~ s/ +$//; |
174
|
|
|
|
|
|
|
|
175
|
18
|
|
|
|
|
83
|
$current_item->add_property( "e-$property", \%e_data ); |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
elsif ( $mf2_type eq 'dt' ) { |
179
|
|
|
|
|
|
|
# dt-property: |
180
|
|
|
|
|
|
|
# Read a child attribute as an ISO-8601 date-time string. |
181
|
|
|
|
|
|
|
# Store it as a property in the MF2 date-time representation format. |
182
|
51
|
|
|
|
|
88
|
for my $property ( @$properties_ref ) { |
183
|
51
|
|
|
|
|
56
|
my $dt_string; |
184
|
51
|
|
|
|
|
159
|
my $vcp_fragments_ref = |
185
|
|
|
|
|
|
|
$self->_seek_value_class_pattern( $element ); |
186
|
51
|
100
|
|
|
|
126
|
if ( @$vcp_fragments_ref ) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
187
|
22
|
|
|
|
|
81
|
$dt_string = $self->_format_datetime(join (q{T}, @$vcp_fragments_ref), $current_item); |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
elsif ( my $alt = $element->findvalue( './@datetime|@title|@value' ) ) { |
190
|
26
|
|
|
|
|
19183
|
$dt_string = $alt; |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
elsif ( my $text = $element->as_trimmed_text ) { |
193
|
3
|
|
|
|
|
2012
|
$dt_string = $text; |
194
|
|
|
|
|
|
|
} |
195
|
51
|
50
|
|
|
|
1899
|
if ( defined $dt_string ) { |
196
|
51
|
|
|
|
|
206
|
$current_item->add_property( |
197
|
|
|
|
|
|
|
"dt-$property", |
198
|
|
|
|
|
|
|
$dt_string, |
199
|
|
|
|
|
|
|
); |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
|
205
|
898
|
100
|
|
|
|
1503
|
if ( $new_item ) { |
206
|
136
|
|
|
|
|
397
|
for my $child_element ( $element->content_list ) { |
207
|
587
|
|
|
|
|
1872
|
$self->analyze_element( $document, $child_element, $new_item ); |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# Now that the new item's been recursively scanned, perform |
211
|
|
|
|
|
|
|
# some post-processing. |
212
|
|
|
|
|
|
|
# First, add any implied properties. |
213
|
136
|
|
|
|
|
398
|
for my $impliable_property (qw(name photo url)) { |
214
|
408
|
100
|
|
|
|
12311
|
unless ( $new_item->has_property( $impliable_property ) ) { |
215
|
324
|
|
|
|
|
655
|
my $method = "_set_implied_$impliable_property"; |
216
|
324
|
|
|
|
|
854
|
$self->$method( $new_item, $element ); |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Put this onto the parent item's property-list, or its children-list, |
221
|
|
|
|
|
|
|
# depending on context. |
222
|
136
|
|
|
|
|
190
|
my @item_properties; |
223
|
136
|
|
|
|
|
300
|
for my $prefix (qw( u p ) ) { |
224
|
272
|
|
|
|
|
314
|
push @item_properties, map { "$prefix-$_" } @{ $mf2_attrs->{$prefix} }; |
|
37
|
|
|
|
|
127
|
|
|
272
|
|
|
|
|
561
|
|
225
|
|
|
|
|
|
|
} |
226
|
136
|
100
|
100
|
|
|
773
|
if ( $current_item && @item_properties ) { |
|
|
100
|
|
|
|
|
|
227
|
33
|
|
|
|
|
63
|
for my $item_property ( @item_properties ) { |
228
|
|
|
|
|
|
|
# We place a clone of the new item into the current item's |
229
|
|
|
|
|
|
|
# property list, rather than the item itself. This allows for |
230
|
|
|
|
|
|
|
# edge cases where the same item needs to go under multiple |
231
|
|
|
|
|
|
|
# properties, but carry different 'value' attributes. |
232
|
37
|
|
|
|
|
283
|
my $cloned_new_item = |
233
|
|
|
|
|
|
|
bless { %$new_item }, ref $new_item; |
234
|
|
|
|
|
|
|
|
235
|
37
|
|
|
|
|
159
|
$current_item |
236
|
|
|
|
|
|
|
->add_property( "$item_property", $cloned_new_item ); |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
# Now add a "value" attribute to this new item, if appropriate, |
239
|
|
|
|
|
|
|
# according to the MF2 spec. |
240
|
37
|
|
|
|
|
60
|
my $value_attribute; |
241
|
37
|
100
|
|
|
|
131
|
if ( $item_property =~ /^p-/ ) { |
|
|
50
|
|
|
|
|
|
242
|
35
|
100
|
|
|
|
118
|
if ( my $name = $new_item->get_properties('name')->[0] ) { |
243
|
28
|
|
|
|
|
48
|
$value_attribute = $name; |
244
|
|
|
|
|
|
|
} |
245
|
|
|
|
|
|
|
else { |
246
|
7
|
|
|
|
|
18
|
$value_attribute = |
247
|
|
|
|
|
|
|
$self->_parse_property_value( $element ); |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
elsif ( $item_property =~ /^u-/ ) { |
251
|
2
|
|
|
|
|
8
|
$value_attribute = $new_item->get_properties('url')->[0]; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
|
254
|
37
|
50
|
|
|
|
914
|
$cloned_new_item->value( $value_attribute ) if defined ($value_attribute); |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
elsif ($current_item) { |
258
|
9
|
|
|
|
|
282
|
$current_item->add_child ( $new_item ); |
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
else { |
263
|
762
|
|
|
|
|
1517
|
for my $child_element ( $element->content_list ) { |
264
|
1252
|
|
|
|
|
4841
|
$self->analyze_element( $document, $child_element, $current_item ); |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
} |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
sub _tease_out_mf2_attrs { |
270
|
1053
|
|
|
1053
|
|
1233
|
my $self = shift; |
271
|
1053
|
|
|
|
|
1430
|
my ( $element ) = @_; |
272
|
|
|
|
|
|
|
|
273
|
1053
|
|
|
|
|
1254
|
my %mf2_attrs; |
274
|
1053
|
|
|
|
|
1499
|
foreach ( qw( h e u dt p ) ) { |
275
|
5265
|
|
|
|
|
7844
|
$mf2_attrs{ $_ } = []; |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
|
278
|
1053
|
|
|
|
|
1804
|
my $class_attr = $element->attr('class'); |
279
|
1053
|
100
|
|
|
|
9787
|
if ( $class_attr ) { |
280
|
538
|
|
|
|
|
2889
|
while ($class_attr =~ /\b(h|e|u|dt|p)-([a-z]+(\-[a-z]+)*)($|\s)/g ) { |
281
|
549
|
|
|
|
|
1083
|
my $mf2_type = $1; |
282
|
549
|
|
|
|
|
855
|
my $mf2_attr = $2; |
283
|
|
|
|
|
|
|
|
284
|
549
|
|
|
|
|
666
|
push @{ $mf2_attrs{ $mf2_type } }, $mf2_attr; |
|
549
|
|
|
|
|
2107
|
|
285
|
|
|
|
|
|
|
} |
286
|
|
|
|
|
|
|
} |
287
|
|
|
|
|
|
|
|
288
|
1053
|
|
|
|
|
2071
|
return \%mf2_attrs; |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
sub _tease_out_url { |
292
|
43
|
|
|
43
|
|
64
|
my $self = shift; |
293
|
43
|
|
|
|
|
82
|
my ( $element ) = @_; |
294
|
|
|
|
|
|
|
|
295
|
43
|
|
|
|
|
70
|
my $xpath; |
296
|
|
|
|
|
|
|
my $url; |
297
|
43
|
100
|
|
|
|
117
|
if ( $element->tag =~ /^(a|area|link)$/ ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
298
|
26
|
|
|
|
|
219
|
$xpath = './@href'; |
299
|
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
elsif ( $element->tag =~ /^(img|audio)$/ ) { |
301
|
10
|
|
|
|
|
162
|
$xpath = './@src'; |
302
|
|
|
|
|
|
|
} |
303
|
|
|
|
|
|
|
elsif ( $element->tag eq 'video' ) { |
304
|
1
|
|
|
|
|
16
|
$xpath = './@src|@poster'; |
305
|
|
|
|
|
|
|
} |
306
|
|
|
|
|
|
|
elsif ( $element->tag eq 'object' ) { |
307
|
1
|
|
|
|
|
20
|
$xpath = './@data'; |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
43
|
100
|
|
|
|
187
|
if ( $xpath ) { |
311
|
38
|
|
|
|
|
103
|
$url = $element->findvalue( $xpath ); |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
|
314
|
43
|
100
|
|
|
|
13546
|
if ( defined $url ) { |
315
|
38
|
|
|
|
|
1197
|
$url = URI->new_abs( $url, $self->url_context )->as_string; |
316
|
|
|
|
|
|
|
} |
317
|
|
|
|
|
|
|
|
318
|
43
|
|
|
|
|
6896
|
return $url; |
319
|
|
|
|
|
|
|
} |
320
|
|
|
|
|
|
|
|
321
|
|
|
|
|
|
|
sub _tease_out_unlikely_url { |
322
|
3
|
|
|
3
|
|
5
|
my $self = shift; |
323
|
3
|
|
|
|
|
6
|
my ( $element ) = @_; |
324
|
|
|
|
|
|
|
|
325
|
3
|
|
|
|
|
6
|
my $xpath; |
326
|
|
|
|
|
|
|
my $url; |
327
|
3
|
100
|
|
|
|
7
|
if ( $element->tag eq 'abbr' ) { |
|
|
100
|
|
|
|
|
|
328
|
1
|
|
|
|
|
7
|
$xpath = './@title'; |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
elsif ( $element->tag =~ /^(data|input)$/ ) { |
331
|
1
|
|
|
|
|
15
|
$xpath = './@value'; |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
|
334
|
3
|
100
|
|
|
|
24
|
if ( $xpath ) { |
335
|
2
|
|
|
|
|
7
|
$url = $element->findvalue( $xpath ); |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
|
338
|
3
|
|
|
|
|
1270
|
return $url; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
sub _set_implied_name { |
342
|
83
|
|
|
83
|
|
111
|
my $self = shift; |
343
|
83
|
|
|
|
|
164
|
my ( $item, $element ) = @_; |
344
|
|
|
|
|
|
|
|
345
|
83
|
|
|
|
|
2060
|
my $types = $item->types; |
346
|
|
|
|
|
|
|
|
347
|
83
|
100
|
100
|
|
|
2460
|
return if $item->has_properties || $item->has_children; |
348
|
|
|
|
|
|
|
|
349
|
59
|
|
|
|
|
145
|
my $xpath; |
350
|
|
|
|
|
|
|
my $name; |
351
|
59
|
|
|
|
|
0
|
my $kid; |
352
|
59
|
|
|
|
|
87
|
my $accept_if_empty = 1; # If true, then null-string names are okay. |
353
|
59
|
100
|
100
|
|
|
154
|
if ( $element->tag =~ /^(img|area)$/ ) { |
|
|
100
|
100
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
354
|
8
|
|
|
|
|
78
|
$xpath = './@alt'; |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
elsif ( $element->tag eq 'abbr' ) { |
357
|
1
|
|
|
|
|
13
|
$xpath = './@title'; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
elsif ( |
360
|
|
|
|
|
|
|
( $kid = $self->_non_h_unique_child( $element, 'img' ) ) |
361
|
|
|
|
|
|
|
|| ( $kid = $self->_non_h_unique_child( $element, 'area' ) ) |
362
|
|
|
|
|
|
|
) { |
363
|
7
|
|
|
|
|
12
|
$xpath = './@alt'; |
364
|
7
|
|
|
|
|
14
|
$accept_if_empty = 0; |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
elsif ( $kid = $self->_non_h_unique_child( $element, 'abbr' ) ) { |
367
|
1
|
|
|
|
|
2
|
$xpath = './@title'; |
368
|
1
|
|
|
|
|
2
|
$accept_if_empty = 0; |
369
|
|
|
|
|
|
|
} |
370
|
|
|
|
|
|
|
elsif ( |
371
|
|
|
|
|
|
|
( $kid = $self->_non_h_unique_grandchild( $element, 'img' ) ) |
372
|
|
|
|
|
|
|
|| ( $kid = $self->_non_h_unique_grandchild( $element, 'area' ) ) |
373
|
|
|
|
|
|
|
) { |
374
|
3
|
|
|
|
|
5
|
$xpath = './@alt'; |
375
|
3
|
|
|
|
|
5
|
$accept_if_empty = 0; |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
elsif ( $kid = $self->_non_h_unique_grandchild( $element, 'abbr' ) ) { |
378
|
1
|
|
|
|
|
2
|
$xpath = './@title'; |
379
|
1
|
|
|
|
|
2
|
$accept_if_empty = 0; |
380
|
|
|
|
|
|
|
} |
381
|
|
|
|
|
|
|
|
382
|
59
|
|
66
|
|
|
162
|
my $foo = $kid || $element; |
383
|
|
|
|
|
|
|
|
384
|
59
|
100
|
|
|
|
100
|
if ( $xpath ) { |
385
|
21
|
|
66
|
|
|
59
|
my $element_to_check = $kid || $element; |
386
|
21
|
|
|
|
|
60
|
my $value = $element_to_check->findvalue( $xpath ); |
387
|
21
|
100
|
66
|
|
|
6980
|
if ( ( $value ne q{} ) || $accept_if_empty ) { |
388
|
19
|
|
|
|
|
32
|
$name = $value; |
389
|
|
|
|
|
|
|
} |
390
|
|
|
|
|
|
|
} |
391
|
|
|
|
|
|
|
|
392
|
59
|
100
|
|
|
|
112
|
unless ( defined $name ) { |
393
|
40
|
|
|
|
|
115
|
$name = _trim( $element->as_text ); |
394
|
|
|
|
|
|
|
} |
395
|
|
|
|
|
|
|
|
396
|
59
|
50
|
|
|
|
169
|
if ( length $name > 0 ) { |
397
|
59
|
|
|
|
|
170
|
$item->add_property( 'p-name', $name ); |
398
|
|
|
|
|
|
|
} |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
} |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
sub _set_implied_photo { |
403
|
126
|
|
|
126
|
|
191
|
my $self = shift; |
404
|
126
|
|
|
|
|
248
|
my ( $item, $element ) = @_; |
405
|
|
|
|
|
|
|
|
406
|
126
|
|
|
|
|
250
|
my $xpath; |
407
|
|
|
|
|
|
|
my $url; |
408
|
126
|
|
|
|
|
0
|
my $kid; |
409
|
|
|
|
|
|
|
|
410
|
126
|
100
|
|
|
|
311
|
if ( $element->tag eq 'img' ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
411
|
5
|
|
|
|
|
36
|
$xpath = './@src'; |
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
elsif ( $element->tag eq 'object' ) { |
414
|
1
|
|
|
|
|
12
|
$xpath = './@data'; |
415
|
|
|
|
|
|
|
} |
416
|
|
|
|
|
|
|
elsif ( $kid = $self->_non_h_unique_child( $element, 'img' ) ) { |
417
|
4
|
|
|
|
|
7
|
$xpath = './@src'; |
418
|
4
|
|
|
|
|
9
|
$element = $kid; |
419
|
|
|
|
|
|
|
} |
420
|
|
|
|
|
|
|
elsif ( $kid = $self->_non_h_unique_child( $element, 'object' ) ) { |
421
|
2
|
|
|
|
|
4
|
$xpath = './@data'; |
422
|
2
|
|
|
|
|
4
|
$element = $kid; |
423
|
|
|
|
|
|
|
} |
424
|
|
|
|
|
|
|
elsif ( $kid = $self->_non_h_unique_grandchild( $element, 'img' ) ) { |
425
|
3
|
|
|
|
|
6
|
$xpath = './@src'; |
426
|
3
|
|
|
|
|
7
|
$element = $kid; |
427
|
|
|
|
|
|
|
} |
428
|
|
|
|
|
|
|
elsif ( $kid = $self->_non_h_unique_grandchild( $element, 'object' ) ) { |
429
|
1
|
|
|
|
|
4
|
$xpath = './@data'; |
430
|
1
|
|
|
|
|
2
|
$element = $kid; |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
|
433
|
126
|
100
|
|
|
|
253
|
if ( $xpath ) { |
434
|
16
|
|
|
|
|
44
|
$url = $element->findvalue( $xpath ); |
435
|
|
|
|
|
|
|
} |
436
|
|
|
|
|
|
|
|
437
|
126
|
100
|
|
|
|
5086
|
if ( defined $url ) { |
438
|
16
|
|
|
|
|
476
|
$url = URI->new_abs( $url, $self->url_context )->as_string; |
439
|
16
|
|
|
|
|
3737
|
$item->add_property( 'u-photo', $url ); |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
} |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
sub _set_implied_url { |
445
|
115
|
|
|
115
|
|
181
|
my $self = shift; |
446
|
115
|
|
|
|
|
192
|
my ( $item, $element ) = @_; |
447
|
|
|
|
|
|
|
|
448
|
115
|
|
|
|
|
224
|
my $xpath; |
449
|
|
|
|
|
|
|
my $url; |
450
|
|
|
|
|
|
|
|
451
|
115
|
|
|
|
|
0
|
my $kid; |
452
|
115
|
100
|
100
|
|
|
235
|
if ( $element->tag =~ /^(a|area)$/ ) { |
|
|
100
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
453
|
23
|
|
|
|
|
178
|
$xpath = './@href'; |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
elsif ( |
456
|
|
|
|
|
|
|
( $kid = $self->_non_h_unique_child( $element, 'a' ) ) |
457
|
|
|
|
|
|
|
|| ( $kid = $self->_non_h_unique_child( $element, 'area' ) ) |
458
|
|
|
|
|
|
|
|| ( $kid = $self->_non_h_unique_grandchild( $element, 'a' ) ) |
459
|
|
|
|
|
|
|
|| ( $kid = $self->_non_h_unique_grandchild( $element, 'area' ) ) |
460
|
|
|
|
|
|
|
) { |
461
|
10
|
|
|
|
|
21
|
$xpath = './@href'; |
462
|
10
|
|
|
|
|
13
|
$element = $kid; |
463
|
|
|
|
|
|
|
} |
464
|
|
|
|
|
|
|
|
465
|
115
|
100
|
|
|
|
232
|
if ( $xpath ) { |
466
|
33
|
|
|
|
|
97
|
$url = $element->findvalue( $xpath ); |
467
|
|
|
|
|
|
|
} |
468
|
|
|
|
|
|
|
|
469
|
115
|
100
|
|
|
|
10320
|
if ( defined $url ) { |
470
|
33
|
|
|
|
|
989
|
$url = URI->new_abs( $url, $self->url_context )->as_string; |
471
|
33
|
|
|
|
|
6210
|
$item->add_property( 'u-url', $url ); |
472
|
|
|
|
|
|
|
} |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
} |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
sub _non_h_unique_child { |
477
|
642
|
|
|
642
|
|
2973
|
my $self = shift; |
478
|
642
|
|
|
|
|
913
|
my ( $element, $tag ) = @_; |
479
|
|
|
|
|
|
|
|
480
|
642
|
100
|
|
|
|
1015
|
my @children = grep { (ref $_) && $_->tag eq $tag } $element->content_list; |
|
1864
|
|
|
|
|
8356
|
|
481
|
|
|
|
|
|
|
|
482
|
642
|
100
|
|
|
|
2109
|
if ( @children == 1 ) { |
483
|
38
|
|
|
|
|
79
|
my $mf2_attrs = $self->_tease_out_mf2_attrs( $children[0] ); |
484
|
38
|
100
|
|
|
|
94
|
if (not ( $mf2_attrs->{h}->[0] ) ) { |
485
|
32
|
|
|
|
|
145
|
return $children[0]; |
486
|
|
|
|
|
|
|
} |
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
|
489
|
610
|
|
|
|
|
1863
|
return; |
490
|
|
|
|
|
|
|
} |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
sub _non_h_unique_grandchild { |
493
|
512
|
|
|
512
|
|
630
|
my $self = shift; |
494
|
512
|
|
|
|
|
740
|
my ( $element, $tag ) = @_; |
495
|
|
|
|
|
|
|
|
496
|
512
|
|
|
|
|
851
|
my @children = grep { ref $_ } $element->content_list; |
|
1675
|
|
|
|
|
3821
|
|
497
|
|
|
|
|
|
|
|
498
|
512
|
100
|
|
|
|
974
|
if ( @children == 1 ) { |
499
|
117
|
|
|
|
|
204
|
my $mf2_attrs = $self->_tease_out_mf2_attrs( $children[0] ); |
500
|
117
|
100
|
|
|
|
265
|
if (not ( $mf2_attrs->{h}->[0] ) ) { |
501
|
89
|
|
|
|
|
157
|
return $self->_non_h_unique_child( $children[0], $tag ); |
502
|
|
|
|
|
|
|
} |
503
|
|
|
|
|
|
|
} |
504
|
|
|
|
|
|
|
|
505
|
423
|
|
|
|
|
1156
|
return; |
506
|
|
|
|
|
|
|
} |
507
|
|
|
|
|
|
|
|
508
|
|
|
|
|
|
|
sub _clear { |
509
|
74
|
|
|
74
|
|
116
|
my $self = shift; |
510
|
|
|
|
|
|
|
|
511
|
74
|
|
|
|
|
2815
|
$self->_clear_url_context; |
512
|
|
|
|
|
|
|
} |
513
|
|
|
|
|
|
|
|
514
|
|
|
|
|
|
|
sub _seek_value_class_pattern { |
515
|
348
|
|
|
348
|
|
427
|
my $self = shift; |
516
|
|
|
|
|
|
|
|
517
|
348
|
|
|
|
|
548
|
my ( $element, $vcp_fragments_ref ) = @_; |
518
|
|
|
|
|
|
|
|
519
|
348
|
|
100
|
|
|
1031
|
$vcp_fragments_ref ||= []; |
520
|
|
|
|
|
|
|
|
521
|
348
|
|
|
|
|
675
|
my $class = $element->attr( 'class' ); |
522
|
348
|
100
|
100
|
|
|
3721
|
if ( $class && $class =~ /\bvalue(-title)?\b/ ) { |
523
|
57
|
100
|
100
|
|
|
148
|
if ( $1 ) { |
|
|
100
|
|
|
|
|
|
524
|
6
|
|
|
|
|
14
|
push @$vcp_fragments_ref, $element->attr( 'title' ); |
525
|
|
|
|
|
|
|
} |
526
|
|
|
|
|
|
|
elsif ( ( $element->tag =~ /^(del|ins|time)$/ ) && defined( $element->attr('datetime'))) { |
527
|
20
|
|
|
|
|
327
|
push @$vcp_fragments_ref, $element->attr('datetime'); |
528
|
|
|
|
|
|
|
} |
529
|
|
|
|
|
|
|
else { |
530
|
31
|
|
|
|
|
291
|
my $html; |
531
|
31
|
|
|
|
|
52
|
for my $content_piece ( $element->content_list ) { |
532
|
31
|
50
|
|
|
|
145
|
if ( ref $content_piece ) { |
533
|
0
|
|
|
|
|
0
|
$html .= $content_piece->as_HTML; |
534
|
|
|
|
|
|
|
} |
535
|
|
|
|
|
|
|
else { |
536
|
31
|
|
|
|
|
70
|
$html .= $content_piece; |
537
|
|
|
|
|
|
|
} |
538
|
|
|
|
|
|
|
} |
539
|
31
|
|
|
|
|
106
|
push @$vcp_fragments_ref, $html; |
540
|
|
|
|
|
|
|
} |
541
|
|
|
|
|
|
|
} |
542
|
|
|
|
|
|
|
else { |
543
|
291
|
|
|
|
|
580
|
for my $child_element ( grep { ref $_ } $element->content_list ) { |
|
417
|
|
|
|
|
1763
|
|
544
|
84
|
|
|
|
|
160
|
$self->_seek_value_class_pattern( |
545
|
|
|
|
|
|
|
$child_element, $vcp_fragments_ref |
546
|
|
|
|
|
|
|
); |
547
|
|
|
|
|
|
|
} |
548
|
|
|
|
|
|
|
} |
549
|
|
|
|
|
|
|
|
550
|
348
|
|
|
|
|
858
|
return $vcp_fragments_ref; |
551
|
|
|
|
|
|
|
} |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
sub _trim { |
554
|
231
|
|
|
231
|
|
102801
|
my ($string) = @_; |
555
|
231
|
|
|
|
|
693
|
$string =~ s/^\s+//; |
556
|
231
|
|
|
|
|
776
|
$string =~ s/\s+$//; |
557
|
231
|
|
|
|
|
536
|
return $string; |
558
|
|
|
|
|
|
|
} |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
sub _format_datetime { |
561
|
22
|
|
|
22
|
|
41
|
my ($self, $dt_string, $current_item) = @_; |
562
|
|
|
|
|
|
|
|
563
|
22
|
|
|
|
|
28
|
my $dt; |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
# Knock off leading/trailing whitespace. |
566
|
22
|
|
|
|
|
36
|
$dt_string = _trim($dt_string); |
567
|
|
|
|
|
|
|
|
568
|
22
|
|
|
|
|
41
|
$dt_string =~ s/t/T/; |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
# Note presence of AM/PM, but toss it out of the string. |
571
|
22
|
|
|
|
|
97
|
$dt_string =~ s/((?:a|p)\.?m\.?)//i; |
572
|
22
|
|
50
|
|
|
62
|
my $am_or_pm = $1 || ''; |
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
# Store the provided TZ offset. |
575
|
22
|
|
|
|
|
72
|
my ($provided_offset) = $dt_string =~ /([\-\+Z](?:\d\d:?\d\d)?)$/; |
576
|
22
|
|
100
|
|
|
66
|
$provided_offset ||= ''; |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
# Reformat HHMM offset as HH:MM. |
579
|
22
|
|
|
|
|
84
|
$dt_string =~ s/(-|\+)(\d\d)(\d\d)/$1$2:$3/; |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
# Store the provided seconds. |
582
|
22
|
|
|
|
|
58
|
my ($seconds) = $dt_string =~ /\d\d:\d\d:(\d\d)/; |
583
|
22
|
100
|
|
|
|
48
|
$seconds = '' unless defined $seconds; |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
# Insert :00 seconds on time when paired with a TZ offset. |
586
|
22
|
|
|
|
|
62
|
$dt_string =~ s/T(\d\d:\d\d)([\-\+Z])/T$1:00$2/; |
587
|
22
|
|
|
|
|
41
|
$dt_string =~ s/^(\d\d:\d\d)([\-\+Z])/$1:00$2/; |
588
|
|
|
|
|
|
|
|
589
|
|
|
|
|
|
|
# Zero-pad hours when only a single-digit hour appears. |
590
|
22
|
|
|
|
|
49
|
$dt_string =~ s/T(\d)$/T0$1/; |
591
|
22
|
|
|
|
|
54
|
$dt_string =~ s/T(\d):/T0$1:/; |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
# Insert :00 minutes on time when only an hour is listed. |
594
|
22
|
|
|
|
|
44
|
$dt_string =~ s/T(\d\d)$/T$1:00/; |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
# Treat a space separator between date & time as a 'T'. |
597
|
22
|
|
|
|
|
39
|
$dt_string =~ s/ /T/; |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
# If this is a time with no date, try to apply a previously-seen |
600
|
|
|
|
|
|
|
# date to it. |
601
|
22
|
|
|
|
|
30
|
my $date_is_defined = 1; |
602
|
22
|
100
|
|
|
|
46
|
if ( $dt_string =~ /^\d\d:/ ) { |
603
|
1
|
50
|
|
|
|
32
|
if ( my $previous_dt = $current_item->last_seen_date ) { |
604
|
1
|
|
|
|
|
10
|
$dt_string = $previous_dt->ymd . "T$dt_string"; |
605
|
|
|
|
|
|
|
} |
606
|
|
|
|
|
|
|
else { |
607
|
0
|
|
|
|
|
0
|
$date_is_defined = 0; |
608
|
0
|
|
|
|
|
0
|
carp "Encountered a value-class datetime with only a time, " |
609
|
|
|
|
|
|
|
. "no date, and no date defined earlier. Results may " |
610
|
|
|
|
|
|
|
. "not be what you expect. (Data: $dt_string)"; |
611
|
|
|
|
|
|
|
} |
612
|
|
|
|
|
|
|
} |
613
|
|
|
|
|
|
|
|
614
|
22
|
|
|
|
|
42
|
eval { |
615
|
22
|
|
|
|
|
93
|
$dt = DateTime::Format::ISO8601->new |
616
|
|
|
|
|
|
|
->parse_datetime( $dt_string ); |
617
|
|
|
|
|
|
|
}; |
618
|
|
|
|
|
|
|
|
619
|
22
|
50
|
|
|
|
11650
|
return if $@; |
620
|
|
|
|
|
|
|
|
621
|
22
|
50
|
|
|
|
48
|
if ($date_is_defined) { |
622
|
22
|
|
|
|
|
649
|
$current_item->last_seen_date( $dt ); |
623
|
|
|
|
|
|
|
} |
624
|
|
|
|
|
|
|
|
625
|
22
|
100
|
|
|
|
59
|
if ($am_or_pm =~ /^[pP]/) { |
626
|
|
|
|
|
|
|
# There was a 'pm' specified, so add 12 hours. |
627
|
7
|
|
|
|
|
19
|
$dt->add( hours => 12 ); |
628
|
|
|
|
|
|
|
} |
629
|
|
|
|
|
|
|
|
630
|
22
|
|
|
|
|
5648
|
my $format; |
631
|
22
|
100
|
66
|
|
|
129
|
if ( ($dt_string =~ /-/) && ($dt_string =~ /[ T]/) ) { |
|
|
50
|
|
|
|
|
|
632
|
21
|
|
|
|
|
29
|
my $offset; |
633
|
21
|
100
|
|
|
|
49
|
if ($provided_offset eq 'Z') { |
|
|
100
|
|
|
|
|
|
634
|
2
|
|
|
|
|
3
|
$offset = 'Z'; |
635
|
|
|
|
|
|
|
} |
636
|
|
|
|
|
|
|
elsif ($provided_offset) { |
637
|
7
|
|
|
|
|
10
|
$offset = '%z'; |
638
|
|
|
|
|
|
|
} |
639
|
|
|
|
|
|
|
else { |
640
|
12
|
|
|
|
|
16
|
$offset = ''; |
641
|
|
|
|
|
|
|
} |
642
|
21
|
100
|
|
|
|
56
|
$seconds = ":$seconds" if length $seconds; |
643
|
21
|
|
|
|
|
46
|
$format = "%Y-%m-%d %H:%M$seconds$offset"; |
644
|
|
|
|
|
|
|
} |
645
|
|
|
|
|
|
|
elsif ( $dt_string =~ /-/ ) { |
646
|
1
|
|
|
|
|
2
|
$format = '%Y-%m-%d'; |
647
|
|
|
|
|
|
|
} |
648
|
|
|
|
|
|
|
|
649
|
22
|
|
|
|
|
62
|
return $dt->strftime( $format ); |
650
|
|
|
|
|
|
|
} |
651
|
|
|
|
|
|
|
|
652
|
|
|
|
|
|
|
sub _parse_property_value { |
653
|
170
|
|
|
170
|
|
271
|
my ( $self, $element ) = @_; |
654
|
|
|
|
|
|
|
|
655
|
170
|
|
|
|
|
199
|
my $value; |
656
|
|
|
|
|
|
|
|
657
|
170
|
|
|
|
|
336
|
my $vcp_fragments_ref = |
658
|
|
|
|
|
|
|
$self->_seek_value_class_pattern( $element ); |
659
|
170
|
100
|
|
|
|
524
|
if ( @$vcp_fragments_ref ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
660
|
11
|
|
|
|
|
24
|
$value = join q{}, @$vcp_fragments_ref; |
661
|
|
|
|
|
|
|
} |
662
|
|
|
|
|
|
|
elsif ( my $alt = $element->findvalue( './@title|@value|@alt' ) ) { |
663
|
9
|
|
|
|
|
6962
|
$value = $alt; |
664
|
|
|
|
|
|
|
} |
665
|
|
|
|
|
|
|
elsif ( my $text = _trim( decode_entities($element->as_text) ) ) { |
666
|
148
|
|
|
|
|
205
|
$value = $text; |
667
|
|
|
|
|
|
|
} |
668
|
|
|
|
|
|
|
|
669
|
170
|
|
|
|
|
386
|
return $value; |
670
|
|
|
|
|
|
|
} |
671
|
|
|
|
|
|
|
|
672
|
|
|
|
|
|
|
sub _add_element_rels_to_mf2_document { |
673
|
898
|
|
|
898
|
|
1273
|
my ( $self, $element, $document ) = @_; |
674
|
|
|
|
|
|
|
|
675
|
898
|
100
|
|
|
|
1718
|
return unless $element->tag =~ /^(a|link)$/; |
676
|
|
|
|
|
|
|
|
677
|
99
|
|
|
|
|
789
|
my $rel = $element->attr( 'rel' ); |
678
|
99
|
100
|
|
|
|
953
|
return unless defined $rel; |
679
|
|
|
|
|
|
|
|
680
|
42
|
|
|
|
|
68
|
my $href = $element->attr( 'href' ); |
681
|
42
|
|
|
|
|
1399
|
my $url = URI->new_abs( $href, $self->url_context)->as_string; |
682
|
|
|
|
|
|
|
|
683
|
42
|
|
|
|
|
3508
|
my @rels = split /\s+/, $rel; |
684
|
42
|
|
|
|
|
81
|
for my $rel ( @rels ) { |
685
|
48
|
|
|
|
|
142
|
$document->add_rel( $rel, $url ); |
686
|
|
|
|
|
|
|
} |
687
|
|
|
|
|
|
|
|
688
|
42
|
|
|
|
|
63
|
my $rel_url_value = {}; |
689
|
42
|
|
|
|
|
69
|
foreach (qw( hreflang media title type ) ) { |
690
|
168
|
50
|
|
|
|
247
|
next if defined $rel_url_value->{ $_ }; |
691
|
168
|
|
|
|
|
282
|
my $value = $element->attr( $_ ); |
692
|
168
|
100
|
|
|
|
1381
|
if ( defined $value ) { |
693
|
6
|
|
|
|
|
14
|
$rel_url_value->{ $_ } = $value; |
694
|
|
|
|
|
|
|
} |
695
|
|
|
|
|
|
|
} |
696
|
42
|
|
|
|
|
88
|
my $text = ($element->as_text); |
697
|
42
|
50
|
|
|
|
837
|
if ( defined $text ) { |
698
|
42
|
|
|
|
|
80
|
$rel_url_value->{ text } = $text; |
699
|
|
|
|
|
|
|
} |
700
|
|
|
|
|
|
|
|
701
|
42
|
|
|
|
|
68
|
$rel_url_value->{ rels } = \@rels; |
702
|
|
|
|
|
|
|
|
703
|
42
|
|
|
|
|
91
|
$document->add_rel_url( $url, $rel_url_value ); |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
} |
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
1; |
708
|
|
|
|
|
|
|
|
709
|
|
|
|
|
|
|
=pod |
710
|
|
|
|
|
|
|
|
711
|
|
|
|
|
|
|
=head1 NAME |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
Web::Microformats2::Parser - Read Microformats2 information from HTML |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
=head1 DESCRIPTION |
716
|
|
|
|
|
|
|
|
717
|
|
|
|
|
|
|
An object of this class represents a Microformats2 parser. |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
See L<Web::Microformats2> for further context and purpose. |
720
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
=head1 METHODS |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
=head2 Class Methods |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
=head3 new |
726
|
|
|
|
|
|
|
|
727
|
|
|
|
|
|
|
$parser = Web::Microformats2::Parser->new; |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
Returns a parser object. |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
=head2 Object Methods |
732
|
|
|
|
|
|
|
|
733
|
|
|
|
|
|
|
=head3 parse |
734
|
|
|
|
|
|
|
|
735
|
|
|
|
|
|
|
$doc = $parser->parse( $html, %args ); |
736
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
Pass in a string containing HTML which itself contains Microformats2 |
738
|
|
|
|
|
|
|
metadata, and receive a L<Web::Microformats2::Document> object in return. |
739
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
The optional args hash recognizes the following keys: |
741
|
|
|
|
|
|
|
|
742
|
|
|
|
|
|
|
=over |
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
=item url_context |
745
|
|
|
|
|
|
|
|
746
|
|
|
|
|
|
|
A L<URI> object or URI-shaped string that will be used as a context for |
747
|
|
|
|
|
|
|
transforming all relative URL properties encountered within MF2 tags |
748
|
|
|
|
|
|
|
into absolute URLs. |
749
|
|
|
|
|
|
|
|
750
|
|
|
|
|
|
|
The default value is C<http://example.com>, so you'll probably want to |
751
|
|
|
|
|
|
|
set this to something more interesting, such as the absolute URL of the |
752
|
|
|
|
|
|
|
HTML that we are parsing. |
753
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
=back |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
=head1 AUTHOR |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
Jason McIntosh (jmac@jmac.org) |
759
|
|
|
|
|
|
|
|
760
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
This software is Copyright (c) 2018 by Jason McIntosh. |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
This is free software, licensed under: |
765
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
The MIT (X11) License |