File Coverage

blib/lib/Web/PageMeta.pm

Criterion	Covered	Total	%
statement	83	83	100.0
branch	13	20	65.0
condition	6	6	100.0
subroutine	17	17	100.0
pod			n/a
total	119	126	94.4

line	stmt	bran	cond	sub	time	code
1						package Web::PageMeta;
2
3						our $VERSION = '0.01';
4
5	3			3	479307	use 5.010;
	3				28
6	3			3	1776	use Moose;
	3				1497985
	3				21
7	3			3	25672	use MooseX::Types::URI qw(Uri);
	3				687737
	3				21
8	3			3	6293	use URI;
	3				7
	3				77
9	3			3	18	use URI::QueryParam;
	3				6
	3				79
10	3			3	2155	use Log::Any qw($log);
	3				27483
	3				18
11	3			3	9585	use Future 0.44;
	3				32910
	3				104
12	3			3	1575	use Future::AsyncAwait;
	3				6953
	3				18
13	3			3	1647	use Future::HTTP;
	3				60618
	3				124
14	3			3	1874	use Web::Scraper;
	3				237872
	3				28
15	3			3	2167	use Encode qw(find_mime_encoding);
	3				45932
	3				386
16	3			3	27	use Time::HiRes qw(time);
	3				7
	3				28
17
18	3			3	373	use namespace::autoclean;
	3				7
	3				41
19
20						has 'url' => (
21						isa => Uri,
22						is => 'ro',
23						required => 1,
24						coerce => 1,
25						);
26
27						has 'title' => (
28						isa => 'Str',
29						is => 'ro',
30						lazy => 1,
31						default => sub {return $_[0]->page_meta->{title} // ''},
32						);
33						has 'image' => (
34						isa => 'Str',
35						is => 'ro',
36						lazy => 1,
37						default => sub {return $_[0]->page_meta->{image} // ''},
38						);
39						has 'description' => (
40						isa => 'Str',
41						is => 'ro',
42						lazy => 1,
43						default => sub {return $_[0]->page_meta->{description} // ''},
44						);
45
46						has 'image_data' => (
47						isa => 'Str',
48						is => 'ro',
49						lazy => 1,
50						default => sub {$_[0]->fetch_image_data_ft->get},
51						);
52
53						has 'page_meta' => (
54						isa => 'HashRef',
55						is => 'rw',
56						lazy => 1,
57						default => sub {$_[0]->fetch_page_meta_ft->get},
58						);
59
60						has 'fetch_page_meta_ft' => (
61						isa => 'Future',
62						is => 'ro',
63						lazy => 1,
64						builder => '_build__fetch_page_meta_ft',
65						);
66
67						has 'fetch_image_data_ft' => (
68						isa => 'Future',
69						is => 'ro',
70						lazy => 1,
71						builder => '_build__fetch_image_data_ft',
72						);
73
74						has '_ua' => (
75						is => 'ro',
76						lazy => 1,
77						default => sub {Future::HTTP->new()},
78						);
79
80						has '_html_meta_scraper' => (
81						is => 'ro',
82						lazy => 1,
83						builder => '_build__html_meta_scraper',
84						);
85
86						has 'extra_scraper' => (
87						is => 'ro',
88						predicate => 'has_extra_scraper',
89						);
90
91						sub _build__html_meta_scraper {
92						state $html_meta_scraper = scraper {
93	11			11	77942	process '/html/head/meta[contains(@property, "og:")]',
94						'head_meta_og[]' => {
95						key => '@property',
96						val => '@content',
97						};
98	11				34247	process '/html/head/title', 'title' => 'TEXT';
99	11				11200	process '/html/head/meta[@name="description"]', 'description' => '@content';
100	11				24402	process '/html/head/base', 'base_href' => '@href';
101	11			11	34	};
102	11				327	return $html_meta_scraper;
103						}
104
105	11			11	21	async sub _build__fetch_page_meta_ft {
106	11				22	my ($self) = @_;
107
108	11				34	my $timer = time();
109	11				265	my ($body, $headers) =
110						await $self->_ua->http_get($self->url, headers => {'Accept' => 'text/html',},);
111	11				20604	my $status = $headers->{Status};
112	11				350	$log->debugf('page meta fetch %d %s finished in %.3fs', $status, $self->url, time() - $timer);
113
114	11	50			65	die 'failed to fetch ' . $self->url
115						unless $status == 200;
116
117	11	100			34	if (my $content_type = $headers->{'content-type'}) {
118	3	50			21	if (my ($charset) = ($content_type =~ m/\bcharset=(.+)/)) {
119	3	50			16	if (my $decoder = find_mime_encoding($charset)) {
120	3				483	$body = $decoder->decode($body);
121						}
122						}
123						}
124
125	11				387	my $scraper_data = $self->_html_meta_scraper->scrape(\$body);
126						my %page_meta = (
127						title => $scraper_data->{title} // '',
128	11		100		14548	description => $scraper_data->{description} // '',
			100
129						);
130	11		100		23	foreach my $attr (@{$scraper_data->{head_meta_og} // []}) {
	11				95
131	19				71	my $key = $attr->{key};
132	19				34	my $val = $attr->{val};
133	19	50			91	next unless $key =~ m/^og:(.+)$/;
134	19				64	$page_meta{$1} = $val;
135						}
136
137	11	100			452	if ($self->has_extra_scraper) {
138	1				27	my $escraper_data = $self->extra_scraper->scrape(\$body);
139	1				152556	foreach my $key (keys %{$escraper_data}) {
	1				6
140	1				4	$page_meta{$key} = $escraper_data->{$key};
141						}
142						}
143
144	11	50			36	if ($page_meta{image}) {
145						my $base_url = (
146						$scraper_data->{base_href}
147	11	100			303	? URI::WithBase->new($scraper_data->{base_href}, $self->url)->abs->as_string
148						: $self->url
149						);
150	11				594	$page_meta{image} = URI::WithBase->new($page_meta{image}, $base_url)->abs->as_string;
151						}
152
153	11				4085	return $self->page_meta(\%page_meta);
154						}
155
156	8			8	16	async sub _build__fetch_image_data_ft {
157	8				16	my ($self) = @_;
158
159	8				206	await $self->fetch_page_meta_ft;
160	8				684	my $fetch_url = $self->image;
161	8	50			30	return $self->{image_data} = ''
162						unless $fetch_url;
163
164	8				29	my $timer = time();
165	8				204	my ($body, $headers) = await $self->_ua->http_get($fetch_url);
166	8				19264	my $status = $headers->{Status};
167	8				270	$log->debugf('img fetch %d %s for %s finished in %.3fs',
168						$status, $fetch_url, $self->url, time() - $timer);
169
170	8	50			44	die 'failed to fetch ' . $fetch_url
171						unless $status == 200;
172	8				64	return $self->{image_data} = $body;
173						}
174
175						__PACKAGE__->meta->make_immutable;
176
177						1;
178
179						__END__
180
181						=head1 NAME
182
183						Web::PageMeta - get page open-graph / meta data
184
185						=head1 SYNOPSIS
186
187						use Web::PageMeta;
188						my $page = Web::PageMeta->new(url => "https://www.apa.at/");
189						say $page->title;
190						say $page->image;
191
192						async fetch previews and images:
193
194						use Web::PageMeta;
195						my @urls = qw(
196						https://www.apa.at/
197						http://www.diepresse.at/
198						https://metacpan.org/
199						https://github.com/
200						);
201						my @page_views = map { Web::PageMeta->new( url => $_ ) }
202						@urls;
203						Future->wait_all( map { $_->fetch_image_data_ft, } @page_views )->get;
204						foreach my $pv (@page_views) {
205						say 'title> '.$pv->title;
206						say 'img_size> '.length($pv->image_data);
207						}
208
209						# alternativelly instead of Future->wait_all()
210						use Future::Utils qw( fmap_void );
211						fmap_void(
212						sub { return $_[0]->fetch_image_data_ft },
213						foreach => [@page_views],
214						concurrent => 3
215						)->get;
216
217						=head1 DESCRIPTION
218
219						Get (not only) open-graph web page meta data. can be used in both normal
220						and async code.
221
222						=head1 ACCESSORS
223
224						=head2 new
225
226						Constructor, only L</url> is required.
227
228						=head2 url
229
230						HTTP url to fetch data from.
231
232						=head2 title
233
234						Returns title of the page.
235
236						=head2 description
237
238						Returns description of the page.
239
240						=head2 image
241
242						Returns image location of the page.
243
244						=head2 image_data
245
246						Returns image binary data of L</image> link.
247
248						=head2 page_meta
249
250						Returns hash ref with all open-graph data.
251
252						=head2 extra_scraper
253
254						L<Web::Scrape> object to fetch image, title or description from different
255						than default location.
256
257						use Web::Scrape;
258						use Web::PageMeta;
259						my $escraper = scraper {
260						process_first '.slider .camera_wrap div', 'image' => '@data-src';
261						};
262						my $wmeta = Web::PageMeta->new(
263						url => 'https://www.meon.eu/',
264						extra_scraper => $escraper,
265						);
266
267						=head2 fetch_page_meta_ft
268
269						Returns future object for fetching paga meta data. See L</"ASYNC USE">.
270						On done L</page_meta> hash is returned.
271
272						has fetch_image_data_ft
273
274						Returns future object for fetching image data. See L</"ASYNC USE">
275						On done L</image_data> scalar is returned.
276
277						=head1 ASYNC USE
278
279						To run multiple page meta data or image http requests in parallel or
280						to be used in async programs L</fetch_page_meta_ft> and L<fetch_image_data_ft>
281						returning L<Future> object can be used. See L</SYNOPSIS> or F<t/02_async.t>
282						for sample use.
283
284						=head1 SEE ALSO
285
286						L<https://ogp.me/>
287
288						=head1 AUTHOR
289
290						Jozef Kutej, C<< <jkutej at cpan.org> >>
291
292						=head1 LICENSE AND COPYRIGHT
293
294						Copyright 2021 jkutej@cpan.org
295
296						This program is free software; you can redistribute it and/or modify it
297						under the terms of either: the GNU General Public License as published
298						by the Free Software Foundation; or the Artistic License.
299
300						See http://dev.perl.org/licenses/ for more information.
301
302						=cut