File Coverage

blib/lib/Text/Lorem/JA.pm

Criterion	Covered	Total	%
statement	184	226	81.4
branch	45	82	54.8
condition	13	24	54.1
subroutine	22	28	78.5
pod	5	5	100.0
total	269	365	73.7

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Lorem::JA;
2	5			5		95145	use 5.008005;
	5					19
	5					236
3	5			5		30	use strict;
	5					12
	5					213
4	5			5		35	use warnings;
	5					11
	5					158
5	5			5		5011	use utf8;
	5					42
	5					39
6	5			5		124	use Carp;
	5					9
	5					516
7	5			5		24	use File::Basename qw( dirname );
	5					8
	5					477
8	5			5		5123	use File::Spec::Functions qw( catdir catfile canonpath );
	5					6567
	5					441
9	5			5		30	use Cwd qw( abs_path );
	5					9
	5					627
10
11							our $VERSION = "0.04";
12
13	5					1110	use constant DICTIONARY_DIR =>
14	5			5		36	canonpath(catdir(dirname(__FILE__), '..', '..', '..', 'auto', 'share', 'dist', 'Text-Lorem-JA'));
	5					10
15	5					15619	use constant DEFAULT_DICTIONARY =>
16	5			5		86	catdir(DICTIONARY_DIR, 'dict.txt');
	5					11
17
18							sub new {
19	5			5	1	2241	my ($class, %options) = @_;
20	5					25	my $self = bless {}, $class;
21
22	5					32	$self->{dictionary} = $options{dictionary};
23	5		50			39	$self->{chain} = $options{chain} \|\| 1;
24
25	5					11	my $lazy = $options{lazy};
26
27	5					14	$self->{dict} = [];
28	5					12	$self->{tree} = {};
29	5					11	$self->{loaded} = 0;
30
31	5	50				19	if (! $lazy) {
32	5					26	$self->_load_dict($self->{dictionary});
33							}
34
35	5					40	return $self;
36							}
37
38							my $_singleton;
39							sub _singleton {
40	0			0		0	my ($class) = @_;
41
42	0	0				0	unless (defined $_singleton) {
43	0					0	$_singleton = $class->new();
44							}
45
46	0					0	return $_singleton;
47							}
48
49							sub sentences {
50	3			3	1	12	my ($self, $count, %options) = @_;
51
52	3	50				10	unless (ref $self) {
53	0					0	$self = $self->_singleton();
54							}
55
56	3					8	return join("", map { $self->sentence(%options) } ( 1 .. $count ));
	6					17
57							}
58
59							sub sentence {
60	16			16	1	3767	my ($self, %options) = @_;
61
62	16	50				42	unless (ref $self) {
63	0					0	$self = $self->_singleton();
64							}
65
66	16	50				38	unless ($self->{loaded}) {
67	0					0	$self->_load_dict($self->{dictionary});
68							}
69
70	16		33			74	my $chain = $options{chain} \|\| $self->{chain};
71
72	16	50				37	unless ($chain > 0) {
73	0					0	croak "invalid chain option value ($chain)";
74							}
75	16	50				37	unless ($chain <= $self->{chain}) {
76	0					0	croak "chain option value ($chain) exceeds dict's chain ($self->{chain})";
77							}
78
79	16					19	my @tokens;
80	16					35	my @stack = ( 0 ) x $chain;
81
82	16					18	my $limitter = 100;
83	16					42	while ($limitter -- > 0) {
84	64					137	my @cands = $self->_lookup_candidates(@stack);
85	64					269	my $cand = $cands[int(rand(scalar @cands))];
86	64	100				154	last if $cand < 0; # EOS
87
88	48					101	push @tokens, $self->{dict}->[$cand];
89
90	48					207	shift @stack;
91	48					144	push @stack, $cand;
92							}
93
94	16					121	return join("", @tokens);
95							}
96
97							sub words {
98	6			6	1	16	my ($self, $count) = @_;
99
100	6	50				15	unless (ref $self) {
101	0					0	$self = $self->_singleton();
102							}
103
104	6	50				12	unless ($self->{loaded}) {
105	0					0	$self->_load_dict($self->{dictionary});
106							}
107
108	6					8	my @tokens;
109	6					12	while ($count > @tokens) {
110	6					15	push @tokens, $self->_words($count - @tokens);
111							}
112
113	6					36	return join("", @tokens);
114							}
115
116							sub _words {
117	6			6		7	my ($self, $count) = @_;
118
119	6					6	my @tokens;
120	6					9	my @stack = ( 0 );
121
122	6					11	while ($count > 0) {
123	24					48	my @cands = grep { $_ >= 0 } $self->_lookup_candidates(@stack);
	28					74
124
125	24	50	66			772	last if @cands == 0 \|\| @cands == 1 && $cands[0] < 0; # EOS only
			33
126
127	24					26	my @new_cands;
128	24					28	foreach my $cand (@cands) {
129	28					50	foreach my $next_cand ($self->_lookup_candidates($cand)) {
130	28	100				67	if ($next_cand >= 0) {
131	24					26	push @new_cands, $cand;
132	24					49	last;
133							}
134							}
135							}
136	24	50				42	last unless @new_cands;
137
138	24					82	my $cand = $new_cands[int(rand(scalar @new_cands))];
139
140	24					46	push @tokens, $self->{dict}->[$cand];
141	24					21	$count --;
142
143	24					25	shift @stack;
144	24					65	push @stack, $cand;
145							}
146
147	6					27	return @tokens;
148							}
149
150							sub word {
151	6			6	1	17	my ($self, $length) = @_;
152
153	6	50				18	unless (ref $self) {
154	0					0	$self = $self->_singleton();
155							}
156
157	6	50				17	unless ($self->{loaded}) {
158	0					0	$self->_load_dict($self->{dictionary});
159							}
160
161	6					7	my $word = "";
162	6					19	while ($length > length $word) {
163	6					18	$word .= $self->_word($length - length $word);
164							}
165
166	6					26	return $word;
167							}
168
169							sub _word {
170	6			6		8	my ($self, $length) = @_;
171
172	6					9	my $dict = $self->{dict};
173	6					8	my $word = "";
174	6					9	my @stack = ( 0 );
175
176	6					13	while ($length > 0) {
177							my @cands
178	16					72	= grep { $dict->[$_] !~ m{\A[。、．，]\z}xmso }
	16					46
179	14					37	grep { $_ >= 0 }
180							$self->_lookup_candidates(@stack);
181
182	14	50	66			83	last if @cands == 0 \|\| @cands == 1 && $cands[0] < 0; # EOS only
			33
183
184	14					19	my @new_cands;
185	14					19	foreach my $cand (@cands) {
186	16					34	foreach my $next_cand ($self->_lookup_candidates($cand)) {
187	16	100	66			89	if ($next_cand >= 0
188							&& $dict->[$next_cand] !~ m{\A[。、．，]\z}xmso) {
189	14					21	push @new_cands, $cand;
190	14					33	last;
191							}
192							}
193							}
194	14	50				30	last unless @new_cands;
195
196							my @short_cands
197	14					19	= grep { $length >= length $dict->[$_] } @new_cands;
	14					41
198
199	14	100				31	if (! @short_cands) {
200							@short_cands
201	5					11	= sort { length $dict->[$a] <=> length $dict->[$b] }
	0					0
202							@new_cands;
203	5					7	my $shortest = length $dict->[$short_cands[0]];
204							@short_cands
205	5					6	= grep { $shortest >= length $dict->[$_] } @short_cands;
	5					15
206							}
207
208	14					74	my $cand = $short_cands[int(rand(scalar @short_cands))];
209	14					21	my $token = $dict->[$cand];
210	14	100				37	if (length $token > $length) {
211	5					12	$token = substr $token, 0, $length;
212							}
213
214	14					18	$word .= $token;
215	14					16	$length -= length $token;
216
217	14					17	shift @stack;
218	14					51	push @stack, $cand;
219							}
220
221	6					25	return $word;
222							}
223
224							sub _load_dict {
225	5			5		11	my ($self, $dictionary) = @_;
226
227	5	50				34	if (! ref $dictionary) {
		50
		0
		0
228	0					0	$self->_load_dict_from_file(_dictionary_file($dictionary));
229							}
230							elsif (ref $dictionary eq 'SCALAR') {
231	5					152	my @lines = split /(\r?\n)/, $$dictionary;
232							$self->_load_dict_from_stream(
233							sub {
234	70			70		133	my ($line, $lf) = splice @lines, 0, 2;
235	70	100				152	return unless defined $line;
236	65		50			542	$line . ($lf \|\| "");
237							}
238	5					50	);
239							}
240							elsif (ref $dictionary eq 'IO') {
241	0					0	$self->_load_dict_from_handle($dictionary);
242							}
243	0					0	elsif (eval { $dictionary->can('getline') }) {
244							# IO::Handle like interface
245	0			0		0	$self->_load_dict_from_stream(sub { $dictionary->getline() });
	0					0
246							}
247							else {
248	0					0	croak "Unsupported type for dictionary ($dictionary).";
249							}
250							}
251
252							sub _dictionary_file {
253	0			0		0	my ($filename) = @_;
254
255	0					0	my $pathname;
256	0	0				0	if ($filename) {
257	0					0	$pathname = abs_path($filename);
258	0	0				0	unless (-f $pathname) {
259	0					0	$pathname = catfile(DICTIONARY_DIR, $filename);
260	0	0				0	unless (-f $pathname) {
261	0					0	$pathname = undef;
262							}
263							}
264							} else {
265	0					0	$filename = DEFAULT_DICTIONARY;
266	0					0	$pathname = $filename;
267							}
268
269	0	0				0	unless ($pathname) {
270	0					0	croak "dictionary file ($filename) not found";
271							}
272
273	0					0	return $pathname;
274							}
275
276							sub _load_dict_from_file {
277	0			0		0	my ($self, $filename) = @_;
278
279	0	0				0	open my $handle, '<:encoding(UTF-8)', $filename
280							or croak "open $filename error: $!";
281
282	0					0	$self->_load_dict_from_handle($handle);
283
284	0					0	close $handle;
285							}
286
287							sub _load_dict_from_handle {
288	0			0		0	my ($self, $handle) = @_;
289
290	0			0		0	$self->_load_dict_from_stream(sub { <$handle> });
	0					0
291							}
292
293							sub _load_dict_from_stream {
294	5			5		11	my ($self, $sub_getline) = @_;
295
296	5					10	my $step = 0;
297	5					9	my @stack;
298
299	5					15	while (my $line = &$sub_getline()) {
300	65					90	chomp $line;
301	65	50				679	next if $line =~ /^#/o; # comment line
302
303	65	100				1407	if ($step == 0) {
		100
		100
304							# chain
305	5					17	$self->{chain} = +$line;
306	5					73	$step = 1;
307							}
308							elsif ($step == 1) {
309							# first word dict entry must be "" (empty)
310	5					8	push @{$self->{dict}}, $line;
	5					14
311	5					13	$step = 2;
312							}
313							elsif ($step == 2) {
314							# word dictionary
315	24	100				41	if ($line eq "") { # separator
316	5					14	$step = 3;
317							} else {
318	19					23	push @{$self->{dict}}, $line;
	19					58
319							}
320							}
321							else {
322							# probability tree
323
324							# turn heading spaces into preceding stack
325	31					34	my @new_stack;
326	31					82	my @tokens = split / /o, $line;
327	31					65	while (@tokens) {
328	39	100				72	if ($tokens[0] eq "") {
329	8					8	shift @tokens; # trim first (empty) token
330	8					17	push @new_stack, shift @stack;
331							} else {
332	31					55	push @new_stack, join("", @tokens);
333	31					81	@tokens = ();
334							}
335							}
336	31					53	@stack = @new_stack;
337
338	31					73	$self->_insert_tree_node(@stack);
339							}
340							}
341
342	5					18	$self->{loaded} = 1;
343							}
344
345							sub _insert_tree_node {
346	31			31		42	my ($self, @stack) = @_;
347
348	31					48	my $node = $self->{tree};
349	31					64	while (@stack) {
350	39					48	my $token = shift @stack;
351
352	39	100				104	if ($token =~ /=/o) {
353	27					71	my ($child, $cands) = split '=', $token, 2;
354	27					37	my $word_id = +$child;
355	33					104	$node->{$word_id}
356	27					67	= [ map { +$_ } split(/,\s*/o, $cands) ];
357	27					103	last;
358							} else {
359	12					11	my $word_id = +$token;
360	12		100			39	$node->{$word_id} \|\|= {};
361	12					37	$node = $node->{$word_id};
362							}
363							}
364							}
365
366							sub _lookup_candidates {
367	146			146		281	my ($self, @stack) = @_;
368
369	146					226	my $node = $self->{tree};
370	146					279	while (@stack) {
371	166	50				320	last unless $node;
372	166	50				326	return @$node if ref $node eq 'ARRAY';
373
374	166					212	my $word = shift @stack;
375	166					454	$node = $node->{$word};
376							}
377
378	146	50				377	if (ref($node) eq 'HASH') {
		50
379	0					0	return keys %$node;
380							} elsif (ref($node) eq 'ARRAY') {
381	146					396	return @$node;
382							} else {
383	0						return ( -1 ); # EOS
384							}
385							}
386
387							1;
388							__END__