File Coverage

blib/lib/HTML/Spelling/Site/Checker.pm

Criterion	Covered	Total	%
statement	71	136	52.2
branch	3	24	12.5
condition	3	18	16.6
subroutine	15	22	68.1
pod	3	3	100.0
total	95	203	46.8

line	stmt	bran	cond	sub	pod	time	code
1							package HTML::Spelling::Site::Checker;
2							$HTML::Spelling::Site::Checker::VERSION = '0.10.2';
3	1			1		28528	use strict;
	1					3
	1					29
4	1			1		6	use warnings;
	1					2
	1					24
5	1			1		5	use autodie;
	1					2
	1					6
6	1			1		5430	use utf8;
	1					3
	1					7
7
8	1			1		58	use 5.014;
	1					6
9
10	1			1		7	use MooX qw/late/;
	1					3
	1					6
11
12	1			1		2033	use HTML::Parser 3.00 ();
	1					7235
	1					33
13	1			1		7	use List::MoreUtils qw(any);
	1					2
	1					15
14	1			1		1573	use JSON::MaybeXS qw(decode_json);
	1					5681
	1					66
15	1			1		899	use Path::Tiny 0.144 qw/ path /;
	1					12602
	1					58
16	1			1		472	use Digest ();
	1					589
	1					206
17
18							has '_inside' =>
19							( is => 'rw', isa => 'HashRef', default => sub { return +{}; } );
20							has 'whitelist_parser' => ( is => 'ro', required => 1 );
21							has 'check_word_cb' => ( is => 'ro', isa => 'CodeRef', required => 1 );
22							has 'timestamp_cache_fn' => ( is => 'ro', isa => 'Str', required => 1 );
23
24							sub _tag
25							{
26	0			0		0	my ( $self, $tag, $num ) = @_;
27
28	0					0	$self->_inside->{$tag} += $num;
29
30	0					0	return;
31							}
32
33							sub should_check
34							{
35	0			0	1	0	my ( $self, $args ) = @_;
36	1			1		8	return ( $args->{word} !~ m#\A[\p{Hebrew}\-'’]+\z# );
	1					1
	1					16
	0					0
37							}
38
39							sub _calc_mispellings
40							{
41	1			1		3	my ( $self, $args ) = @_;
42
43	1					2	my @ret;
44
45	1					5	my $filenames = $args->{files};
46
47	1					3	my $whitelist = $self->whitelist_parser;
48	1					12	$whitelist->parse;
49
50	1					15	binmode STDOUT, ":encoding(utf8)";
51
52	1					1030	my $cache_fh = path( $self->timestamp_cache_fn );
53
54	1					72	my $app_key = 'HTML-Spelling-Site';
55	1					3	my $data_key = 'timestamp_cache';
56	1					3	my $DIGEST_NAME = 'SHA-256';
57	1					2	my $digest_key = 'digest_cache';
58	1					6	my $digest = Digest->new($DIGEST_NAME);
59
60							my $write_cache = sub {
61	2			2		10	my ( $ref, $ddata ) = @_;
62	2					10	$cache_fh->parent()->mkdir();
63	2					389	$cache_fh->spew_raw(
64							JSON::MaybeXS->new( canonical => 1 )->encode(
65							{
66							$app_key => {
67							$data_key => $ref,
68							$digest_key => { $DIGEST_NAME => $ddata, },
69							},
70							},
71							)
72							);
73
74	2					4735	return;
75	1					3415	};
76
77	1	50				8	if ( !$cache_fh->exists() )
78							{
79	1					88	$write_cache->( +{}, +{} );
80							}
81
82	1					6	my $main_json = decode_json( scalar( $cache_fh->slurp_raw() ) );
83	1					215	my $timestamp_cache = $main_json->{$app_key}->{$data_key};
84							my $digest_cache =
85	1		50			25	( $main_json->{$app_key}->{$digest_key}->{$DIGEST_NAME} // +{} );
86
87	1					7	my $check_word = $self->check_word_cb;
88
89							FILENAMES_LOOP:
90	1					5	foreach my $filename (@$filenames)
91							{
92	0					0	my $fp = path($filename);
93	0					0	my $mtime = $fp->stat->mtime;
94	0	0	0			0	if ( exists( $timestamp_cache->{$filename} )
95							and $timestamp_cache->{$filename} >= $mtime )
96							{
97	0					0	next FILENAMES_LOOP;
98							}
99	0					0	my $d = $digest->clone()->addfile( $fp->openr_raw )->b64digest;
100	0	0	0			0	if ( exists( $digest_cache->{$filename} )
101							and $digest_cache->{$filename} eq $d )
102							{
103	0					0	$timestamp_cache->{$filename} = $mtime;
104
105	0					0	next FILENAMES_LOOP;
106							}
107
108	0					0	my $file_is_ok = 1;
109
110							my $process_text = sub {
111	0	0		0		0	if (
112							any
113							{
114	0	0				0	exists( $self->_inside->{$_} ) and $self->_inside->{$_} > 0
115							} qw(script style)
116							)
117							{
118	0					0	return;
119							}
120
121	0					0	my $text = shift;
122
123	0					0	my @lines = split /\n/, $text, -1;
124
125	0					0	foreach my $l (@lines)
126							{
127
128	0					0	my $mispelling_found = 0;
129
130							my $mark_word = sub {
131	0					0	my ($word) = @_;
132
133	0					0	$word =~ s{’(ve\|s\|m\|d\|t\|ll\|re)\z}{'$1};
134	0					0	$word =~ s{[’']\z}{};
135	0	0				0	if ( $word =~ /[A-Za-z]/ )
136							{
137	0					0	$word =~
138							s{\A(?:(?:ֹו?(?:ש\|ל\|מ\|ב\|כש\|לכש\|מה\|שה\|לכשה\|ב-))\|ו)-?}{};
139	0					0	$word =~ s{'?ים\z}{};
140							}
141
142	0		0			0	my $verdict = (
143							(
144							!$whitelist->check_word(
145							{ filename => $filename, word => $word }
146							)
147							)
148							&& $self->should_check( { word => $word } )
149							&& ( !( $check_word->($word) ) )
150							);
151
152	0		0			0	$mispelling_found \|\|= $verdict;
153
154	0	0				0	return $verdict ? "«$word»" : $word;
155	0					0	};
156
157	0					0	$l =~ s/
158							# Not sure this regex to match a word is fully
159							# idiot-proof, but we can amend it later.
160							([\w'’-]+)
161	0					0	/$mark_word->($1)/egx;
162
163	0	0				0	if ($mispelling_found)
164							{
165	0					0	$file_is_ok = 0;
166	0					0	push @ret,
167							{
168							filename => $filename,
169							line_num => 1,
170							line_with_context => $l,
171							};
172							}
173							}
174	0					0	};
175
176							HTML::Parser->new(
177							api_version => 3,
178							handlers => [
179	0			0		0	start => [ sub { return $self->_tag(@_); }, "tagname, '+1'" ],
180	0			0		0	end => [ sub { return $self->_tag(@_); }, "tagname, '-1'" ],
	0					0
181							text => [ $process_text, "dtext" ],
182							],
183							marked_sections => 1,
184							)->parse_file( $fp->openr_utf8() );
185
186	0	0				0	if ($file_is_ok)
187							{
188	0					0	$timestamp_cache->{$filename} = $mtime;
189	0					0	$digest_cache->{$filename} = $d;
190							}
191							}
192
193	1					5	$write_cache->( $timestamp_cache, $digest_cache );
194
195	1					16	return { misspellings => \@ret, };
196							}
197
198							sub _format_error
199							{
200	0			0		0	my ( $self, $error ) = @_;
201
202							return sprintf( "%s:%d:%s",
203							$error->{filename}, $error->{line_num}, $error->{line_with_context},
204	0					0	);
205							}
206
207							sub spell_check
208							{
209	0			0	1	0	my ( $self, $args ) = @_;
210
211	0					0	my $misspellings = $self->_calc_mispellings($args);
212
213	0					0	foreach my $error ( @{ $misspellings->{misspellings} } )
	0					0
214							{
215	0					0	printf {*STDOUT} "%s\n", $self->_format_error($error);
	0					0
216							}
217
218	0					0	print "\n";
219							}
220
221							sub test_spelling
222							{
223	1			1	1	91	my ( $self, $args ) = @_;
224
225	1		50			9	my $MAXLEN = ( $args->{'MAXLEN'} \|\| 1000 );
226	1		50			5	my $MAXSIZE = ( $args->{'MAXSIZE'} \|\| 20 );
227	1					4	my $misspellings = $self->_calc_mispellings($args);
228
229	1	50				7	if ( $args->{light} )
230							{
231	0					0	require Test::More;
232
233	0					0	my $ret = Test::More::is( scalar( @{ $misspellings->{misspellings} } ),
234	0					0	0, $args->{blurb} );
235
236	0					0	my $output_text = '';
237							DIAGLOOP:
238	0					0	foreach my $error ( @{ $misspellings->{misspellings} } )
	0					0
239							{
240	0					0	$output_text .= $self->_format_error($error) . "\n";
241	0	0				0	if ( length($output_text) >= $MAXLEN )
242							{
243	0					0	$output_text = substr( $output_text, 0, $MAXLEN );
244	0					0	last DIAGLOOP;
245							}
246							}
247	0					0	Test::More::diag($output_text);
248	0					0	return $ret;
249							}
250	1					1147	require Test::Differences;
251	1					22823	my @arr = @{ $misspellings->{misspellings} };
	1					10
252	1	50				7	if ( @arr > $MAXSIZE )
253							{
254	0					0	$#arr = $MAXSIZE - 1;
255							}
256	1					7	return Test::Differences::eq_or_diff( ( \@arr ), [], $args->{blurb}, );
257							}
258							1;
259
260							__END__
261
262							=pod
263
264							=encoding UTF-8
265
266							=head1 NAME
267
268							HTML::Spelling::Site::Checker - does the actual checking.
269
270							=head1 VERSION
271
272							version 0.10.2
273
274							=head1 SYNOPSIS
275
276							In lib/Shlomif/Spelling/FindFiles.pm :
277
278							package Shlomif::Spelling::FindFiles;
279
280							use strict;
281							use warnings;
282
283							use MooX qw/late/;
284							use List::MoreUtils qw/any/;
285
286							use HTML::Spelling::Site::Finder;
287
288							my @prunes =
289							(
290							qr#^\Qdest/t2/humour/by-others/how-to-make-square-corners-with-CSS/#,
291							);
292
293							sub list_htmls
294							{
295							my ($self) = @_;
296
297							return HTML::Spelling::Site::Finder->new(
298							{
299							root_dir => 'dest/t2',
300							prune_cb => sub {
301							my ($path) = @_;
302							return any { $path =~ $_ } @prunes;
303							},
304							}
305							)->list_all_htmls;
306							}
307
308							1;
309
310							In lib/Shlomif/Spelling/Whitelist.pm :
311
312							package Shlomif::Spelling::Whitelist;
313
314							use strict;
315							use warnings;
316
317							use MooX qw/late/;
318
319							extends('HTML::Spelling::Site::Whitelist');
320
321							has '+filename' => (default => 'lib/hunspell/whitelist1.txt');
322
323							1;
324
325							In lib/Shlomif/Spelling/Check.pm :
326
327							package Shlomif::Spelling::Check;
328
329							use strict;
330							use warnings;
331							use autodie;
332							use utf8;
333
334							use MooX qw/late/;
335
336							use Text::Hunspell;
337							use Shlomif::Spelling::Whitelist;
338							use HTML::Spelling::Site::Checker;
339
340							sub spell_check
341							{
342							my ($self, $args) = @_;
343
344							my $speller = Text::Hunspell->new(
345							'/usr/share/hunspell/en_GB.aff',
346							'/usr/share/hunspell/en_GB.dic',
347							);
348
349							if (not $speller)
350							{
351							die "Could not initialize speller!";
352							}
353
354							my $files = $args->{files};
355
356							return HTML::Spelling::Site::Checker->new(
357							{
358							timestamp_cache_fn => './Tests/data/cache/spelling-timestamp.json',
359							whitelist_parser => scalar( Shlomif::Spelling::Whitelist->new() ),
360							check_word_cb => sub {
361							my ($word) = @_;
362							return $speller->check($word);
363							},
364							}
365							)->spell_check(
366							{
367							files => $args->{files}
368							}
369							);
370							}
371
372							1;
373
374							In lib/Shlomif/Spelling/Iface.pm :
375
376							package Shlomif::Spelling::Iface;
377
378							use strict;
379							use warnings;
380
381							use MooX (qw( late ));
382
383							use Shlomif::Spelling::Check;
384							use Shlomif::Spelling::FindFiles;
385
386							sub run
387							{
388							return Shlomif::Spelling::Check->new()->spell_check(
389							{
390							files => Shlomif::Spelling::FindFiles->new->list_htmls(),
391							},
392							);
393							}
394
395							1;
396
397							In bin/spell-checker-iface :
398
399							#!/usr/bin/env perl
400
401							use strict;
402							use warnings;
403
404							use lib './lib';
405
406							use Shlomif::Spelling::Iface;
407
408							Shlomif::Spelling::Iface->new->run;
409
410							In t/html-spell-check.t :
411
412							#!/usr/bin/perl
413
414							use strict;
415							use warnings;
416
417							use Test::More tests => 1;
418
419							{
420							my $output = `./bin/spell-checker-iface 2>&1`;
421							chomp($output);
422
423							# TEST
424							is ($output, '', "No spelling errors.");
425							}
426
427							=head1 DESCRIPTION
428
429							The instances of this class can be used to do the actual scanning of
430							local HTML files.
431
432							=head1 METHODS
433
434							=head2 my $obj = HTML::Spelling::Site::Checker->new({ whitelist_parser => $parser_obj, check_word_cb => sub { ... }, timestamp_cache_fn => '/path/to/timestamp-cache.json' })
435
436							Initialises a new object. C<whitelist_parser> is normally an instance of
437							L<HTML::Spelling::Site::Whitelist>. C<check_word_cb> is a subroutine to check
438							a word for correctness. C<timestamp_cache_fn> points to the path where the
439							cache of the last-checked timestamps of the files is stored in JSON format.
440
441							=head2 $finder->spell_check();
442
443							Performs the spell check and prints the erroneous words to stdout.
444
445							=head2 $bool = $finder->should_check({word=>$word_string})
446
447							Whether the word should be checked for being misspelled or not. Can be
448							overridden in subclasses. (Was added in version 0.4.0).
449
450							=head2 $finder->test_spelling({ files => [@files], blurb => $blurb, });
451
452							A spell check function compatible with L<Test::More> . Emits one assertion.
453
454							Since version 0.2.0, if a C<<< light => 1 >>> key is specified and is true, it
455							will not use L<Test::Differences>, which tends to consume a lot of RAM when
456							there are many messages.
457
458							Since version 0.10.0, C<'MAXLEN'> argument was added.
459
460							Since version 0.10.0, C<'MAXSIZE'> argument was added.
461
462							=head2 $finder->whitelist_parser()
463
464							For internal use.
465
466							=head2 $finder->check_word_cb()
467
468							For internal use.
469
470							=head2 $finder->timestamp_cache_fn()
471
472							For internal use.
473
474							=for :stopwords cpan testmatrix url bugtracker rt cpants kwalitee diff irc mailto metadata placeholders metacpan
475
476							=head1 SUPPORT
477
478							=head2 Websites
479
480							The following websites have more information about this module, and may be of help to you. As always,
481							in addition to those websites please use your favorite search engine to discover more resources.
482
483							=over 4
484
485							=item *
486
487							MetaCPAN
488
489							A modern, open-source CPAN search engine, useful to view POD in HTML format.
490
491							L<https://metacpan.org/release/HTML-Spelling-Site>
492
493							=item *
494
495							RT: CPAN's Bug Tracker
496
497							The RT ( Request Tracker ) website is the default bug/issue tracking system for CPAN.
498
499							L<https://rt.cpan.org/Public/Dist/Display.html?Name=HTML-Spelling-Site>
500
501							=item *
502
503							CPANTS
504
505							The CPANTS is a website that analyzes the Kwalitee ( code metrics ) of a distribution.
506
507							L<http://cpants.cpanauthors.org/dist/HTML-Spelling-Site>
508
509							=item *
510
511							CPAN Testers
512
513							The CPAN Testers is a network of smoke testers who run automated tests on uploaded CPAN distributions.
514
515							L<http://www.cpantesters.org/distro/H/HTML-Spelling-Site>
516
517							=item *
518
519							CPAN Testers Matrix
520
521							The CPAN Testers Matrix is a website that provides a visual overview of the test results for a distribution on various Perls/platforms.
522
523							L<http://matrix.cpantesters.org/?dist=HTML-Spelling-Site>
524
525							=item *
526
527							CPAN Testers Dependencies
528
529							The CPAN Testers Dependencies is a website that shows a chart of the test results of all dependencies for a distribution.
530
531							L<http://deps.cpantesters.org/?module=HTML::Spelling::Site>
532
533							=back
534
535							=head2 Bugs / Feature Requests
536
537							Please report any bugs or feature requests by email to C<bug-html-spelling-site at rt.cpan.org>, or through
538							the web interface at L<https://rt.cpan.org/Public/Bug/Report.html?Queue=HTML-Spelling-Site>. You will be automatically notified of any
539							progress on the request by the system.
540
541							=head2 Source Code
542
543							The code is open to the world, and available for you to hack on. Please feel free to browse it and play
544							with it, or whatever. If you want to contribute patches, please send me a diff or prod me to pull
545							from your repository :)
546
547							L<https://github.com/shlomif/HTML-Spelling-Site>
548
549							git clone https://github.com/shlomif/HTML-Spelling-Site.git
550
551							=head1 AUTHOR
552
553							Shlomi Fish <shlomif@cpan.org>
554
555							=head1 BUGS
556
557							Please report any bugs or feature requests on the bugtracker website
558							L<https://github.com/shlomif/html-spelling-site/issues>
559
560							When submitting a bug or request, please include a test-file or a
561							patch to an existing test-file that illustrates the bug or desired
562							feature.
563
564							=head1 COPYRIGHT AND LICENSE
565
566							This software is Copyright (c) 2016 by Shlomi Fish.
567
568							This is free software, licensed under:
569
570							The MIT (X11) License
571
572							=cut