File Coverage

blib/lib/Markdown/Compiler/Lexer.pm

Criterion	Covered	Total	%
statement	236	257	91.8
branch	24	32	75.0
condition			n/a
subroutine	71	77	92.2
pod	0	37	0.0
total	331	403	82.1

line	stmt	bran	sub	pod	time	code
1						package Markdown::Compiler::Lexer;
2						BEGIN {
3						{
4						package Markdown::Compiler::Lexer::Token;
5	18		18		120	use Moo;
	18				35
	18				97
6
7	18				83	has source => (
8						is => 'ro',
9						required => 1,
10						);
11
12	18				12837	has start => (
13						is => 'ro',
14						required => 1,
15						);
16
17	18				4086	has end => (
18						is => 'ro',
19						required => 1,
20						);
21
22						has line => (
23						is => 'ro',
24						lazy => 1,
25						builder => sub {
26	0		0		0	my $self = shift;
27
28	0				0	my $lines = grep { $_ eq "\n" } (split(//, substr(${$self->source}, 0, $self->start)));
	0				0
	0				0
29	0				0	return $lines;
30						},
31	18				3938	);
32
33						has content => (
34						is => 'ro',
35						lazy => 1,
36						builder => sub {
37	398		398		2548	my $self = shift;
38	398				465	return substr( ${$self->source}, $self->start, ( $self->end - $self->start ) );
	398				2387
39						},
40	18				14856	);
41
42						# Allow subclasses to override, for example to return multiple tokens.
43						sub tokens {
44	398		398	0	130479	return shift;
45						}
46
47	18				12315	1;
48						}
49						{
50	18		18		5987	package Markdown::Compiler::Lexer::Token::EscapedChar;
51	18		18		9098	use Moo;
	18				37
	18				68
52	18				63	extends 'Markdown::Compiler::Lexer::Token';
53
54	294		294	0	517	sub type { 'EscapedChar' }
55	424		424	0	1185	sub match { [ qr/\G(\\\\\|\\\`\|\\\*\|\\\_\|\\\{\|\\\}\|\\\[\|\\\]\|\\$\|\\$\|\\\#\|\\\+\|\\\-\|\\\.\|\\\!)/ ] }
56
57						# Delete the first \
58						around content => sub {
59	14				252	my $orig = shift;
60	14				170	my $value = $orig->(@_);
61
62	14				47	return substr($value,1);
63	18				2817	};
64
65	18				42979	1;
66						}
67						{
68	18				37	package Markdown::Compiler::Lexer::Token::CodeBlock;
69	18		18		7458	use Moo;
	18				43
	18				69
70	18				142	extends 'Markdown::Compiler::Lexer::Token';
71
72	30		30	0	78	sub type { 'CodeBlock' }
73						sub match {[
74	410		410	0	1120	qr\|\G\`\`\`(?:\n\\|$)\|,
75						qr\|\G\`\`\`[ ]\S+[ ]\n\|,
76						]}
77
78
79
80						has language => (
81						is => 'ro',
82						lazy => 1,
83						builder => sub {
84	2		2		48	my $content = shift->content;
85
86	2	50			6	if ( $content =~ m\|\`\`\`[ ](\S+)[ ]\n\| ) {
87	0				0	return $1;
88						}
89	2				16	return undef;
90						}
91	18				2563	);
92
93	18				21894	1;
94						}
95
96						{
97	18				37	package Markdown::Compiler::Lexer::Token::HR;
98	18		18		8148	use Moo;
	18				33
	18				65
99	18				88	extends 'Markdown::Compiler::Lexer::Token';
100
101	0		0	0	0	sub type { 'HR' }
102	406		406	0	941	sub match { [ qr/\G(?:(?<=^)\|(?<=\n))((?:(\\s\\s\)\|(-\s-\s-)\|(_\s_\s_))[-_\s]*)\n/ ] }
103	18				2142	1;
104						}
105
106						{
107	18				43	package Markdown::Compiler::Lexer::Token::Image;
108	18		18		6681	use Moo;
	18				38
	18				83
109	18				64	extends 'Markdown::Compiler::Lexer::Token';
110	18		18		13907	use Regexp::Common qw( URI );
	18				41989
	18				97
111
112						# Regexp::Common::URI doesn't support fragments; I should make a patch for it.
113	18				2027	my $url_match = qr/$RE{URI}{HTTP}{ -scheme => 'https?' }(?:\#[A-z0-9-_]+)?/;
114
115	62		62	0	156	sub type { 'Image' }
116						# sub match {[
117						# qr/\G\!\[(.)\]$($url_match)\s+"([^"]+)"\s$/,
118						# qr/\G\!\[(.)\]$($url_match\s)$/,
119						# qr/\G\!($url_match)/,
120						# ]}
121						sub match {[
122	406		406	0	4600	qr/\G\!\[(.)\]$([^ ]+)\s+"([^"]+)"\s$/,
123						qr/\G\!\[(.)\]$([^ ]+\s)$/,
124						qr/\G\!($url_match)/,
125						]}
126
127						has text => (
128						is => 'ro',
129						lazy => 1,
130	4		4		95	builder => sub { shift->data->{text} },
131	18				6960	);
132
133						has href => (
134						is => 'ro',
135						lazy => 1,
136	4		4		148	builder => sub { shift->data->{href} },
137	18				21320	);
138
139						has title => (
140						is => 'ro',
141						lazy => 1,
142	4		4		83	builder => sub { shift->data->{title} },
143	18				12624	);
144
145						has data => (
146						is => 'ro',
147						lazy => 1,
148						builder => sub {
149	4		4		104	my $content = shift->content;
150
151	4	100			220	if ( $content =~ /!\[(.)\]$([^ ]+)\s+"([^"]+)"\s$/ ) {
		100
		50
152						return {
153	1				23	text => $1,
154						href => $2,
155						title => $3,
156						}
157						} elsif ( $content =~ /!\[(.)\]$([^ ]+\s)$/ ) {
158						return {
159	1				22	text => $1,
160						href => $2,
161						title => undef,
162						}
163						} elsif ( $content =~ /!($url_match)/ ) {
164						return {
165	2				52	text => undef,
166						href => $1,
167						title => undef,
168						};
169						}
170						}
171	18				12560	);
172
173	18				12280	1;
174						}
175						{
176	18				38	package Markdown::Compiler::Lexer::Token::Link;
177	18		18		386188	use Moo;
	18				59
	18				135
178	18				83	extends 'Markdown::Compiler::Lexer::Token';
179	18		18		5555	use Regexp::Common qw( URI );
	18				78
	18				374
180
181						# Regexp::Common::URI doesn't support fragments; I should make a patch for it.
182	18				2630	my $url_match = qr/$RE{URI}{HTTP}{ -scheme => 'https?' }(?:#[A-z0-9-_]+)?(?=[ )])/;
183
184	132		132	0	267	sub type { 'Link' }
185						# qr/\G\[.\]$$url_match\s+"([^"]+)"\s$/,
186						# qr/\G\[.*\]$$url_match$/,
187						# qr/\G$url_match/,
188						sub match {[
189	402		402	0	9672	qr/\G\[.?\]$$url_match\s+"([^"]+)"\s$/,
190						qr/\G\[.*?\]$$url_match$/,
191						qr/\G$url_match/,
192						qr/\G$RE{URI}{HTTP}{ -scheme => 'https?' }/,
193						]}
194
195						has text => (
196						is => 'ro',
197						lazy => 1,
198	10		10		185	builder => sub { shift->data->{text} },
199	18				5973	);
200
201						has title => (
202						is => 'ro',
203						lazy => 1,
204	10		10		180	builder => sub { shift->data->{title} },
205	18				20417	);
206
207						has href => (
208						is => 'ro',
209						lazy => 1,
210	10		10		354	builder => sub { shift->data->{href} },
211	18				12550	);
212
213						has data => (
214						is => 'ro',
215						lazy => 1,
216						builder => sub {
217	10		10		171	my $content = shift->content;
218
219	10	100			3431	if ( $content =~ /\[(.)\]$($url_match)\s+"([^"]+)"\s$/ ) {
		100
		50
		50
220						return {
221	1				19	text => $1,
222						href => $2,
223						title => $3,
224						};
225						} elsif ( $content =~ /\[(.)\]$($url_match\s)$/ ) {
226						return {
227	7				145	text => $1,
228						href => $2,
229						title => undef,
230						};
231						} elsif ( $content =~ /($url_match)/ ) {
232						return {
233	0				0	text => undef,
234						href => $1,
235						title => undef,
236						};
237						} elsif ( $content =~ /($RE{URI}{HTTP}{ -scheme => 'https?' })/ ) {
238						return {
239	2				469	text => undef,
240						href => $1,
241						title => undef,
242						};
243						}
244						},
245	18				12203	);
246
247	18				11986	1;
248						}
249
250						{
251	18				39	package Markdown::Compiler::Lexer::Token::Item;
252	18		18		8453	use Moo;
	18				308
	18				94
253	18				64	extends 'Markdown::Compiler::Lexer::Token';
254
255	126		126	0	490	sub type { 'Item' }
256						sub match { [
257						# Unordered / Beginning of line, then * + or -
258	392		392	0	1239	qr/\G(?:(?<=^)\|(?<=\n))(?:\*\|\+\|\-) /,
259
260						# Numbered / Beginning of line, [number].[space]
261						qr/\G(?:(?<=^)\|(?<=\n))\d+\.\s+/,
262						]}
263
264						# Note: I also have the following version of this; I should work out why I wrote it:
265						# $str =~ /\G(?:(?=^)\|(?=\n))(?:\\|\+\|\-) /gc or ( exists $tokens[-1] and $tokens[-1]->{type} eq 'line_break' and $str =~ /\G(?:\\|\+\|\-) /gc
266
267	18				2426	1;
268						}
269
270						{
271	18				41	package Markdown::Compiler::Lexer::Token::TableStart;
272	18		18		7044	use Moo;
	18				35
	18				58
273	18				72	extends 'Markdown::Compiler::Lexer::Token';
274
275	57		57	0	792	sub type { 'TableStart' }
276	380		380	0	830	sub match { [ qr/\G(?:(?<=^)\|(?<=\n))\\| / ] }
277
278	18				2390	1;
279						}
280
281						{
282	18				33	package Markdown::Compiler::Lexer::Token::TableHeaderSep;
283	18		18		6140	use Moo;
	18				54
	18				71
284	18				74	extends 'Markdown::Compiler::Lexer::Token';
285
286	0		0	0	0	sub type { 'TableHeaderSep' }
287						# sub match { [ qr/\G(?:(?<=^)\|(?<=\n))\\| / ] }
288
289						sub match { return [
290	374		374	0	1153	qr/\G:---:/,
291						qr/\G:--/,
292						qr/\G--:/,
293						];
294						}
295
296	18				1954	1;
297						}
298
299						{
300	18				39	package Markdown::Compiler::Lexer::Token::BlockQuote;
301	18		18		7185	use Moo;
	18				35
	18				66
302	18				78	extends 'Markdown::Compiler::Lexer::Token';
303
304	18		18	0	56	sub type { 'BlockQuote' }
305	370		370	0	788	sub match { [ qr/\G(?:(?=^)\|(?=\n)\|(?=>\s))> / ] }
306
307	18				2004	1;
308						}
309
310						{
311	18				43	package Markdown::Compiler::Lexer::Token::Header;
312	18		18		5895	use Moo;
	18				48
	18				84
313	18				90	extends 'Markdown::Compiler::Lexer::Token';
314
315	2		2	0	46	sub type { 'Header' }
316	367		367	0	805	sub match { [ qr/\G([\#]+) (.+?)(?=\n\|$)/ ] }
317
318						has size => (
319						is => 'ro',
320						lazy => 1,
321	1				26	default => sub { length(shift->data->{header}) },
322	18				2242	);
323
324						has title => (
325						is => 'ro',
326						lazy => 1,
327	1				25	default => sub { shift->data->{title} },
328	18				20835	);
329
330						has data => (
331						is => 'ro',
332						lazy => 1,
333						builder => sub {
334	1		1		28	my $content = shift->content;
335
336	1	50			9	if ( $content =~ /^([\#]+)\s+(.+?)$/ ) {
337						return {
338	1				22	header => $1,
339						title => $2,
340						};
341						}
342						},
343	18				12521	);
344
345
346	18				12134	1;
347						}
348
349						{
350	18				67	package Markdown::Compiler::Lexer::Token::InlineCode;
351	18		18		9093	use Moo;
	18				52
	18				149
352	18				62	extends 'Markdown::Compiler::Lexer::Token';
353
354	40		40	0	93	sub type { 'InlineCode' }
355	374		374	0	789	sub match { [ qr/\G`/ ] }
356
357	18				2287	1;
358						}
359
360						{
361	18				36	package Markdown::Compiler::Lexer::Token::Bold;
362	18		18		5962	use Moo;
	18				76
	18				79
363	18				60	extends 'Markdown::Compiler::Lexer::Token';
364
365	132		132	0	240	sub type { 'Bold' }
366
367	18				2077	1;
368						}
369
370						{
371	18				44	package Markdown::Compiler::Lexer::Token::Italic;
372	18		18		5744	use Moo;
	18				35
	18				67
373	18				66	extends 'Markdown::Compiler::Lexer::Token';
374
375	156		156	0	350	sub type { 'Italic' }
376
377	18				1907	1;
378						}
379
380						{
381	18				36	package Markdown::Compiler::Lexer::Token::BoldItalic;
382	18		18		5931	use Moo;
	18				60
	18				91
383	18				72	extends 'Markdown::Compiler::Lexer::Token';
384
385	0		0	0	0	sub type { 'BoldItalic' }
386
387	18				1858	1;
388						}
389
390						{
391	18				38	package Markdown::Compiler::Lexer::Token::BoldItalicMaker;
392	18		18		5298	use Moo;
	18				47
	18				82
393	18				112	extends 'Markdown::Compiler::Lexer::Token';
394
395	0		0	0	0	sub type { 'ShortAttribute' }
396						sub match {
397						return [
398	365		365	0	1802	qr/\G\\\*/,
399						qr/\G___/,
400						qr/\G\\/,
401						qr/\G(?:(?<=^)\|(?<=[\s]))\*(?=\S\|$)/,
402						qr/\G(?<=[\S])\*/,
403						qr/\G__/,
404						qr/\G_/,
405						]
406						}
407
408						sub tokens {
409	26		26	0	5448	my ( $self ) = @_;
410	26				373	my $content = $self->content;
411
412	26	50			162	if ( $content =~ /^___/ ) {
		50
		100
		100
		100
		50
413	0				0	return Markdown::Compiler::Lexer::Token::BoldItalic->new(
414						start => $self->start,
415						end => $self->end,
416						source => $self->source
417						);
418						} elsif ( $content =~ /^\\\*/ ) {
419	0				0	return Markdown::Compiler::Lexer::Token::BoldItalic->new(
420						start => $self->start,
421						end => $self->end,
422						source => $self->source
423						);
424						} elsif ( $content =~ /^\\/ ) {
425	8				119	return Markdown::Compiler::Lexer::Token::Bold->new(
426						start => $self->start,
427						end => $self->end,
428						source => $self->source
429						);
430						} elsif ( $content =~ /^__/ ) {
431	4				65	return Markdown::Compiler::Lexer::Token::Bold->new(
432						start => $self->start,
433						end => $self->end,
434						source => $self->source
435						);
436						} elsif ( $content =~ /^_/ ) {
437	6				118	return Markdown::Compiler::Lexer::Token::Italic->new(
438						start => $self->start,
439						end => $self->end,
440						source => $self->source
441						);
442						} elsif ( $content =~ /^\*/ ) {
443	8				135	return Markdown::Compiler::Lexer::Token::Italic->new(
444						start => $self->start,
445						end => $self->end,
446						source => $self->source
447						);
448
449						}
450						};
451
452
453
454	18				1989	1;
455						}
456
457						{
458	18				47	package Markdown::Compiler::Lexer::Token::LineBreak;
459	18		18		11770	use Moo;
	18				41
	18				110
460	18				151	extends 'Markdown::Compiler::Lexer::Token';
461
462	195		195	0	972	sub type { 'LineBreak' }
463	339		339	0	765	sub match { [ qr/\G\n/ ] }
464
465	18				2054	1;
466						}
467
468						{
469	18				81	package Markdown::Compiler::Lexer::Token::Space;
470	18		18		6431	use Moo;
	18				37
	18				66
471	18				81	extends 'Markdown::Compiler::Lexer::Token';
472
473	1175		1175	0	2549	sub type { 'Space' }
474	301		301	0	645	sub match { [ qr/\G\s+/ ] }
475
476						has length => (
477						is => 'ro',
478						lazy => 1,
479	1		1		24	builder => sub { length(shift->content) },
480	18				2002	);
481
482	18				21197	1;
483						}
484
485						{
486	18				56	package Markdown::Compiler::Lexer::Token::Word;
487	18		18		6764	use Moo;
	18				34
	18				110
488	18				77	extends 'Markdown::Compiler::Lexer::Token';
489
490						# We match whole words to avoid making too many objects; otherwise
491						# "Hello World" would become 11 objects.
492	1700		1700	0	3635	sub type { 'Word' }
493	196		196	0	636	sub match { [ qr\|\G[a-zA-Z]+\|, qr\|\G\d+\.\d+\|, qr\|\G\d+\| ] }
494
495	18				2245	1;
496						}
497
498						{
499	18				41	package Markdown::Compiler::Lexer::Token::Char;
	18				79
500	18		18		6898	use Moo;
	18				60
	18				95
501	18				72	extends 'Markdown::Compiler::Lexer::Token';
502
503	490		490	0	1310	sub type { 'Char' }
504	55		55	0	144	sub match { [ qr/\G./s ] }
505
506	18				2370	1;
507						}
508						}
509	18		18		122	use Moo;
	18				36
	18				66
510	18		18		4736	use v5.10;
	18				73
511
512						has source => (
513						is => 'ro',
514						required => 1,
515
516						);
517
518						has tokens => (
519						is => 'ro',
520						builder => '_build_tokens',
521						init_arg => undef,
522
523						);
524
525						has token_table => (
526						is => 'ro',
527						lazy => 1,
528						builder => sub {
529	0		0		0	my ( $self ) = @_;
530
531	0				0	my $str;
532
533	0				0	foreach my $token ( @{$self->tokens} ) {
	0				0
534	0				0	( my $content = $token->content ) =~ s/\n//g;
535	0				0	$str .= sprintf( "%20s \| %s\n", $content, $token->type );
536						}
537
538	0				0	return $str;
539						}
540						);
541
542						has hooks => (
543						is => 'ro',
544						default => sub { [] },
545						);
546
547						has lexer_tokens => (
548						is => 'ro',
549						default => sub {
550						return [qw(
551						Markdown::Compiler::Lexer::Token::EscapedChar
552						Markdown::Compiler::Lexer::Token::CodeBlock
553						Markdown::Compiler::Lexer::Token::HR
554						Markdown::Compiler::Lexer::Token::Image
555						Markdown::Compiler::Lexer::Token::Link
556						Markdown::Compiler::Lexer::Token::Item
557						Markdown::Compiler::Lexer::Token::TableStart
558						Markdown::Compiler::Lexer::Token::TableHeaderSep
559						Markdown::Compiler::Lexer::Token::InlineCode
560						Markdown::Compiler::Lexer::Token::BlockQuote
561						Markdown::Compiler::Lexer::Token::Header
562						Markdown::Compiler::Lexer::Token::BoldItalicMaker
563						Markdown::Compiler::Lexer::Token::LineBreak
564						Markdown::Compiler::Lexer::Token::Space
565						Markdown::Compiler::Lexer::Token::Word
566						Markdown::Compiler::Lexer::Token::Char
567						)];
568						# Removed from between Space and Char, might have been
569						# more trouble than it's worth.
570						# Markdown::Compiler::Lexer::Token::Word
571						}
572						);
573
574						sub _build_tokens {
575	65		65		322	my ( $self ) = @_;
576
577	65				165	my $str = $self->source;
578
579	65				184	pos($str) = 0;
580	65				122	my @tokens;
581
582	65				206	PARSE: while ( length($str) != pos($str) ) {
583	424				641	my $start_pos = pos($str);
584
585	424				504	TOKEN: foreach my $token_class ( @{$self->lexer_tokens} ) {
	424				959
586	5561				11703	my $matches = $token_class->match;
587
588	5561				57440	foreach my $match ( @{$matches} ) {
	5561				7239
589	11368	100			38543	if ( $str =~ m\|$match\|gc ) {
590	424				7523	push @tokens, $token_class->new(
591						source => \$self->source,
592						start => $start_pos,
593						end => pos($str),
594						)->tokens;
595	424				7119	next PARSE;
596						}
597						}
598						}
599						# We were not able to match the content, so we're blowing up now.
600	0				0	die "Error at offset $start_pos of document: next 10 chars" . substr($self->source, $start_pos, 10 );
601						}
602
603	65				1356	return [ @tokens ];
604						}
605
606						1;