File Coverage

blib/lib/Markdown/Compiler/Lexer.pm

Criterion	Covered	Total	%
statement	237	257	92.2
branch	24	32	75.0
condition			n/a
subroutine	72	77	93.5
pod	0	37	0.0
total	333	403	82.6

line	stmt	bran	sub	pod	time	code
1						package Markdown::Compiler::Lexer;
2						BEGIN {
3						{
4						package Markdown::Compiler::Lexer::Token;
5	17		17		133	use Moo;
	17				45
	17				93
6
7	17				136	has source => (
8						is => 'ro',
9						required => 1,
10						);
11
12	17				379524	has start => (
13						is => 'ro',
14						required => 1,
15						);
16
17	17				4714	has end => (
18						is => 'ro',
19						required => 1,
20						);
21
22						has line => (
23						is => 'ro',
24						lazy => 1,
25						builder => sub {
26	0		0		0	my $self = shift;
27
28	0				0	my $lines = grep { $_ eq "\n" } (split(//, substr(${$self->source}, 0, $self->start)));
	0				0
	0				0
29	0				0	return $lines;
30						},
31	17				4521	);
32
33						has content => (
34						is => 'ro',
35						lazy => 1,
36						builder => sub {
37	408		408		3275	my $self = shift;
38	408				579	return substr( ${$self->source}, $self->start, ( $self->end - $self->start ) );
	408				2929
39						},
40	17				14909	);
41
42						# Allow subclasses to override this, for example to return multiple tokens.
43						sub tokens {
44	412		412	0	168828	return shift;
45						}
46
47	17				13891	1;
48						}
49						{
50	17		17		7359	package Markdown::Compiler::Lexer::Token::EscapedChar;
51	17		17		9680	use Moo;
	17				116
	17				226
52	17				81	extends 'Markdown::Compiler::Lexer::Token';
53
54	294		294	0	618	sub type { 'EscapedChar' }
55	438		438	0	1539	sub match { [ qr/\G(\\\\\|\\\`\|\\\*\|\\\_\|\\\{\|\\\}\|\\\[\|\\\]\|\\$\|\\$\|\\\#\|\\\+\|\\\-\|\\\.\|\\\!)/ ] }
56
57						# Delete the first \
58						around content => sub {
59	14				307	my $orig = shift;
60	14				211	my $value = $orig->(@_);
61
62	14				61	return substr($value,1);
63	17				3060	};
64
65	17				45089	1;
66						}
67						{
68	17				44	package Markdown::Compiler::Lexer::Token::CodeBlock;
69	17		17		9216	use Moo;
	17				42
	17				78
70	17				93	extends 'Markdown::Compiler::Lexer::Token';
71
72	30		30	0	85	sub type { 'CodeBlock' }
73						sub match {[
74	424		424	0	1399	qr\|\G\`\`\`(?:\n\\|$)\|,
75						qr\|\G\`\`\`[ ]\S+[ ]\n\|,
76						]}
77
78
79
80						has language => (
81						is => 'ro',
82						lazy => 1,
83						builder => sub {
84	2		2		52	my $content = shift->content;
85
86	2	50			9	if ( $content =~ m\|\`\`\`[ ](\S+)[ ]\n\| ) {
87	0				0	return $1;
88						}
89	2				10	return undef;
90						}
91	17				2900	);
92
93	17				23454	1;
94						}
95
96						{
97	17				47	package Markdown::Compiler::Lexer::Token::HR;
98	17		17		8930	use Moo;
	17				58
	17				90
99	17				85	extends 'Markdown::Compiler::Lexer::Token';
100
101	12		12	0	35	sub type { 'HR' }
102	420		420	0	1119	sub match { [ qr/\G(?:(?<=^)\|(?<=\n))((?:(\\s\\s\)\|(-\s-\s-)\|(_\s_\s_))[-_\s]*)\n/ ] }
103	17				2533	1;
104						}
105
106						{
107	17				46	package Markdown::Compiler::Lexer::Token::Image;
108	17		17		7980	use Moo;
	17				38
	17				71
109	17				76	extends 'Markdown::Compiler::Lexer::Token';
110	17		17		16107	use Regexp::Common qw( URI );
	17				47914
	17				74
111
112						# Regexp::Common::URI doesn't support fragments, I should make a patch for it.
113	17				2368	my $url_match = qr/$RE{URI}{HTTP}{ -scheme => 'https?' }(?:\#[A-z0-9-_]+)?/;
114
115	62		62	0	169	sub type { 'Image' }
116						# sub match {[
117						# qr/\G\!\[(.)\]$($url_match)\s+"([^"]+)"\s$/,
118						# qr/\G\!\[(.)\]$($url_match\s)$/,
119						# qr/\G\!($url_match)/,
120						# ]}
121						sub match {[
122	418		418	0	5803	qr/\G\!\[(.)\]$([^ ]+)\s+"([^"]+)"\s$/,
123						qr/\G\!\[(.)\]$([^ ]+\s)$/,
124						qr/\G\!($url_match)/,
125						]}
126
127						has text => (
128						is => 'ro',
129						lazy => 1,
130	4		4		87	builder => sub { shift->data->{text} },
131	17				8487	);
132
133						has href => (
134						is => 'ro',
135						lazy => 1,
136	4		4		155	builder => sub { shift->data->{href} },
137	17				22544	);
138
139						has title => (
140						is => 'ro',
141						lazy => 1,
142	4		4		86	builder => sub { shift->data->{title} },
143	17				14392	);
144
145						has data => (
146						is => 'ro',
147						lazy => 1,
148						builder => sub {
149	4		4		87	my $content = shift->content;
150
151	4	100			214	if ( $content =~ /!\[(.)\]$([^ ]+)\s+"([^"]+)"\s$/ ) {
		100
		50
152						return {
153	1				23	text => $1,
154						href => $2,
155						title => $3,
156						}
157						} elsif ( $content =~ /!\[(.)\]$([^ ]+\s)$/ ) {
158						return {
159	1				23	text => $1,
160						href => $2,
161						title => undef,
162						}
163						} elsif ( $content =~ /!($url_match)/ ) {
164						return {
165	2				55	text => undef,
166						href => $1,
167						title => undef,
168						};
169						}
170						}
171	17				14019	);
172
173	17				14108	1;
174						}
175						{
176	17				40	package Markdown::Compiler::Lexer::Token::Link;
177	17		17		453259	use Moo;
	17				51
	17				153
178	17				100	extends 'Markdown::Compiler::Lexer::Token';
179	17		17		6401	use Regexp::Common qw( URI );
	17				43
	17				77
180
181						# Regexp::Common::URI doesn't support fragments, I should make a patch for it.
182	17				2838	my $url_match = qr/$RE{URI}{HTTP}{ -scheme => 'https?' }(?:#[A-z0-9-_]+)?(?=[ )])/;
183
184	132		132	0	316	sub type { 'Link' }
185						# qr/\G\[.\]$$url_match\s+"([^"]+)"\s$/,
186						# qr/\G\[.*\]$$url_match$/,
187						# qr/\G$url_match/,
188						sub match {[
189	414		414	0	12316	qr/\G\[.?\]$$url_match\s+"([^"]+)"\s$/,
190						qr/\G\[.*?\]$$url_match$/,
191						qr/\G$url_match/,
192						qr/\G$RE{URI}{HTTP}{ -scheme => 'https?' }/,
193						]}
194
195						has text => (
196						is => 'ro',
197						lazy => 1,
198	10		10		222	builder => sub { shift->data->{text} },
199	17				6207	);
200
201						has title => (
202						is => 'ro',
203						lazy => 1,
204	10		10		218	builder => sub { shift->data->{title} },
205	17				23182	);
206
207						has href => (
208						is => 'ro',
209						lazy => 1,
210	10		10		388	builder => sub { shift->data->{href} },
211	17				14031	);
212
213						has data => (
214						is => 'ro',
215						lazy => 1,
216						builder => sub {
217	10		10		205	my $content = shift->content;
218
219	10	100			4279	if ( $content =~ /\[(.)\]$($url_match)\s+"([^"]+)"\s$/ ) {
		100
		50
		50
220						return {
221	1				26	text => $1,
222						href => $2,
223						title => $3,
224						};
225						} elsif ( $content =~ /\[(.)\]$($url_match\s)$/ ) {
226						return {
227	7				183	text => $1,
228						href => $2,
229						title => undef,
230						};
231						} elsif ( $content =~ /($url_match)/ ) {
232						return {
233	0				0	text => undef,
234						href => $1,
235						title => undef,
236						};
237						} elsif ( $content =~ /($RE{URI}{HTTP}{ -scheme => 'https?' })/ ) {
238						return {
239	2				534	text => undef,
240						href => $1,
241						title => undef,
242						};
243						}
244						},
245	17				14168	);
246
247	17				14491	1;
248						}
249
250						{
251	17				47	package Markdown::Compiler::Lexer::Token::Item;
252	17		17		9564	use Moo;
	17				74
	17				124
253	17				92	extends 'Markdown::Compiler::Lexer::Token';
254
255	126		126	0	470	sub type { 'Item' }
256						sub match { [
257						# Unordered / Beginning of line, then * + or -
258	404		404	0	1457	qr/\G(?:(?<=^)\|(?<=\n))(?:\*\|\+\|\-) /,
259
260						# Numbered / Beginning of line, [number].[space]
261						qr/\G(?:(?<=^)\|(?<=\n))\d+\.\s+/,
262						]}
263
264						# Note: I also have the following version of this; I should work out why I wrote it:
265						# $str =~ /\G(?:(?=^)\|(?=\n))(?:\\|\+\|\-) /gc or ( exists $tokens[-1] and $tokens[-1]->{type} eq 'line_break' and $str =~ /\G(?:\\|\+\|\-) /gc
266
267	17				2578	1;
268						}
269
270						{
271	17				44	package Markdown::Compiler::Lexer::Token::TableStart;
272	17		17		8423	use Moo;
	17				44
	17				81
273	17				79	extends 'Markdown::Compiler::Lexer::Token';
274
275	57		57	0	570	sub type { 'TableStart' }
276	392		392	0	1045	sub match { [ qr/\G(?:(?<=^)\|(?<=\n))\\| / ] }
277
278	17				2294	1;
279						}
280
281						{
282	17				46	package Markdown::Compiler::Lexer::Token::TableHeaderSep;
283	17		17		7317	use Moo;
	17				42
	17				73
284	17				69	extends 'Markdown::Compiler::Lexer::Token';
285
286	0		0	0	0	sub type { 'TableHeaderSep' }
287						# sub match { [ qr/\G(?:(?<=^)\|(?<=\n))\\| / ] }
288
289						sub match { return [
290	386		386	0	1425	qr/\G:---:/,
291						qr/\G:--/,
292						qr/\G--:/,
293						];
294						}
295
296	17				2161	1;
297						}
298
299						{
300	17				42	package Markdown::Compiler::Lexer::Token::BlockQuote;
301	17		17		7634	use Moo;
	17				38
	17				74
302	17				96	extends 'Markdown::Compiler::Lexer::Token';
303
304	18		18	0	63	sub type { 'BlockQuote' }
305	382		382	0	976	sub match { [ qr/\G(?:(?=^)\|(?=\n)\|(?=>\s))> / ] }
306
307	17				2301	1;
308						}
309
310						{
311	17				36	package Markdown::Compiler::Lexer::Token::Header;
312	17		17		7223	use Moo;
	17				318
	17				86
313	17				85	extends 'Markdown::Compiler::Lexer::Token';
314
315	2		2	0	8	sub type { 'Header' }
316	379		379	0	976	sub match { [ qr/\G([\#]+) (.+?)(?=\n\|$)/ ] }
317
318						has size => (
319						is => 'ro',
320						lazy => 1,
321	1				28	default => sub { length(shift->data->{header}) },
322	17				2409	);
323
324						has title => (
325						is => 'ro',
326						lazy => 1,
327	1				25	default => sub { shift->data->{title} },
328	17				23742	);
329
330						has data => (
331						is => 'ro',
332						lazy => 1,
333						builder => sub {
334	1		1		28	my $content = shift->content;
335
336	1	50			10	if ( $content =~ /^([\#]+)\s+(.+?)$/ ) {
337						return {
338	1				25	header => $1,
339						title => $2,
340						};
341						}
342						},
343	17				14359	);
344
345
346	17				13873	1;
347						}
348
349						{
350	17				39	package Markdown::Compiler::Lexer::Token::InlineCode;
351	17		17		10689	use Moo;
	17				48
	17				82
352	17				70	extends 'Markdown::Compiler::Lexer::Token';
353
354	40		40	0	88	sub type { 'InlineCode' }
355	386		386	0	981	sub match { [ qr/\G`/ ] }
356
357	17				2592	1;
358						}
359
360						{
361	17				52	package Markdown::Compiler::Lexer::Token::Bold;
362	17		17		6994	use Moo;
	17				37
	17				103
363	17				79	extends 'Markdown::Compiler::Lexer::Token';
364
365	132		132	0	304	sub type { 'Bold' }
366
367	17				2349	1;
368						}
369
370						{
371	17				40	package Markdown::Compiler::Lexer::Token::Italic;
372	17		17		5786	use Moo;
	17				52
	17				74
373	17				75	extends 'Markdown::Compiler::Lexer::Token';
374
375	156		156	0	376	sub type { 'Italic' }
376
377	17				2145	1;
378						}
379
380						{
381	17				51	package Markdown::Compiler::Lexer::Token::BoldItalic;
382	17		17		5884	use Moo;
	17				36
	17				115
383	17				91	extends 'Markdown::Compiler::Lexer::Token';
384
385	0		0	0	0	sub type { 'BoldItalic' }
386
387	17				2194	1;
388						}
389
390						{
391	17				39	package Markdown::Compiler::Lexer::Token::BoldItalicMaker;
392	17		17		6460	use Moo;
	17				57
	17				118
393	17				111	extends 'Markdown::Compiler::Lexer::Token';
394
395	0		0	0	0	sub type { 'ShortAttribute' }
396						sub match {
397						return [
398	377		377	0	2207	qr/\G\\\*/,
399						qr/\G___/,
400						qr/\G\\/,
401						qr/\G(?:(?<=^)\|(?<=[\s]))\*(?=\S\|$)/,
402						qr/\G(?<=[\S])\*/,
403						qr/\G__/,
404						qr/\G_/,
405						]
406						}
407
408						sub tokens {
409	26		26	0	6337	my ( $self ) = @_;
410	26				476	my $content = $self->content;
411
412	26	50			169	if ( $content =~ /^___/ ) {
		50
		100
		100
		100
		50
413	0				0	return Markdown::Compiler::Lexer::Token::BoldItalic->new(
414						start => $self->start,
415						end => $self->end,
416						source => $self->source
417						);
418						} elsif ( $content =~ /^\\\*/ ) {
419	0				0	return Markdown::Compiler::Lexer::Token::BoldItalic->new(
420						start => $self->start,
421						end => $self->end,
422						source => $self->source
423						);
424						} elsif ( $content =~ /^\\/ ) {
425	8				149	return Markdown::Compiler::Lexer::Token::Bold->new(
426						start => $self->start,
427						end => $self->end,
428						source => $self->source
429						);
430						} elsif ( $content =~ /^__/ ) {
431	4				82	return Markdown::Compiler::Lexer::Token::Bold->new(
432						start => $self->start,
433						end => $self->end,
434						source => $self->source
435						);
436						} elsif ( $content =~ /^_/ ) {
437	6				121	return Markdown::Compiler::Lexer::Token::Italic->new(
438						start => $self->start,
439						end => $self->end,
440						source => $self->source
441						);
442						} elsif ( $content =~ /^\*/ ) {
443	8				143	return Markdown::Compiler::Lexer::Token::Italic->new(
444						start => $self->start,
445						end => $self->end,
446						source => $self->source
447						);
448
449						}
450						};
451
452
453
454	17				2216	1;
455						}
456
457						{
458	17				48	package Markdown::Compiler::Lexer::Token::LineBreak;
459	17		17		13447	use Moo;
	17				46
	17				104
460	17				71	extends 'Markdown::Compiler::Lexer::Token';
461
462	219		219	0	789	sub type { 'LineBreak' }
463	351		351	0	944	sub match { [ qr/\G\n/ ] }
464
465	17				2167	1;
466						}
467
468						{
469	17				43	package Markdown::Compiler::Lexer::Token::Space;
470	17		17		6901	use Moo;
	17				38
	17				70
471	17				69	extends 'Markdown::Compiler::Lexer::Token';
472
473	1192		1192	0	2854	sub type { 'Space' }
474	310		310	0	807	sub match { [ qr/\G\s+/ ] }
475
476						has length => (
477						is => 'ro',
478						lazy => 1,
479	1		1		30	builder => sub { length(shift->content) },
480	17				2500	);
481
482	17				23339	1;
483						}
484
485						{
486	17				41	package Markdown::Compiler::Lexer::Token::Word;
487	17		17		7237	use Moo;
	17				35
	17				70
488	17				75	extends 'Markdown::Compiler::Lexer::Token';
489
490						# We match whole words to avoid creating too many objects;
491						# otherwise "Hello World" would become 11 objects.
492	1744		1744	0	4134	sub type { 'Word' }
493	203		203	0	808	sub match { [ qr\|\G[a-zA-Z]+\|, qr\|\G\d+\.\d+\|, qr\|\G\d+\| ] }
494
495	17				2432	1;
496						}
497
498						{
499	17				42	package Markdown::Compiler::Lexer::Token::Char;
	17				43
500	17		17		8050	use Moo;
	17				49
	17				132
501	17				74	extends 'Markdown::Compiler::Lexer::Token';
502
503	518		518	0	1458	sub type { 'Char' }
504	58		58	0	185	sub match { [ qr/\G./s ] }
505
506	17				2699	1;
507						}
508						}
509	17		17		143	use Moo;
	17				35
	17				78
510	17		17		5227	use v5.10;
	17				74
511
512						has source => (
513						is => 'ro',
514						required => 1,
515
516						);
517
518						has tokens => (
519						is => 'ro',
520						builder => '_build_tokens',
521						init_arg => undef,
522
523						);
524
525						has token_table => (
526						is => 'ro',
527						lazy => 1,
528						builder => sub {
529	0		0		0	my ( $self ) = @_;
530
531	0				0	my $str;
532
533	0				0	foreach my $token ( @{$self->tokens} ) {
	0				0
534	0				0	( my $content = $token->content ) =~ s/\n//g;
535	0				0	$str .= sprintf( "%20s \| %s\n", $content, $token->type );
536						}
537
538	0				0	return $str;
539						}
540						);
541
542						has hooks => (
543						is => 'ro',
544						default => sub { [] },
545						);
546
547						has lexer_tokens => (
548						is => 'ro',
549						default => sub {
550						return [qw(
551						Markdown::Compiler::Lexer::Token::EscapedChar
552						Markdown::Compiler::Lexer::Token::CodeBlock
553						Markdown::Compiler::Lexer::Token::HR
554						Markdown::Compiler::Lexer::Token::Image
555						Markdown::Compiler::Lexer::Token::Link
556						Markdown::Compiler::Lexer::Token::Item
557						Markdown::Compiler::Lexer::Token::TableStart
558						Markdown::Compiler::Lexer::Token::TableHeaderSep
559						Markdown::Compiler::Lexer::Token::InlineCode
560						Markdown::Compiler::Lexer::Token::BlockQuote
561						Markdown::Compiler::Lexer::Token::Header
562						Markdown::Compiler::Lexer::Token::BoldItalicMaker
563						Markdown::Compiler::Lexer::Token::LineBreak
564						Markdown::Compiler::Lexer::Token::Space
565						Markdown::Compiler::Lexer::Token::Word
566						Markdown::Compiler::Lexer::Token::Char
567						)];
568						# Removed from between Space and Char, might have been
569						# more trouble than it's worth.
570						# Markdown::Compiler::Lexer::Token::Word
571						}
572						);
573
574						sub _build_tokens {
575	66		66		343	my ( $self ) = @_;
576
577	66				183	my $str = $self->source;
578
579	66				217	pos($str) = 0;
580	66				151	my @tokens;
581
582	66				217	PARSE: while ( length($str) != pos($str) ) {
583	438				781	my $start_pos = pos($str);
584
585	438				653	TOKEN: foreach my $token_class ( @{$self->lexer_tokens} ) {
	438				1273
586	5742				14323	my $matches = $token_class->match;
587
588	5742				70414	foreach my $match ( @{$matches} ) {
	5742				9394
589	11737	100			48742	if ( $str =~ m\|$match\|gc ) {
590	438				9517	push @tokens, $token_class->new(
591						source => \$self->source,
592						start => $start_pos,
593						end => pos($str),
594						)->tokens;
595	438				8552	next PARSE;
596						}
597						}
598						}
599						# We were not able to match the content, so we're blowing up now.
600	0				0	die "Error at offset $start_pos of document: next 10 chars" . substr($self->source, $start_pos, 10 );
601						}
602
603	66				1553	return [ @tokens ];
604						}
605
606						1;