File Coverage

blib/lib/Acme/Lingua/ZH/Remix.pm

Criterion	Covered	Total	%
statement	116	118	98.3
branch	16	20	80.0
condition	24	34	70.5
subroutine	14	14	100.0
pod	3	7	42.8
total	173	193	89.6

line	stmt	bran	cond	sub	pod	time	code
1							package Acme::Lingua::ZH::Remix;
2	4			4		329689	use v5.10;
	4					41
3							our $VERSION = "0.99";
4
5							=pod
6
7							=encoding utf8
8
9							=head1 NAME
10
11							Acme::Lingua::ZH::Remix - The Chinese sentence generator.
12
13							=head1 SYNOPSIS
14
15							use Acme::Lingua::ZH::Remix;
16
17							my $x = Acme::Lingua::ZH::Remix->new;
18
19							# Generate a random sentance
20							say $x->random_sentence;
21
22							=head1 DESCRIPTION
23
24							Because lipsum is not funny enough, that is the reason to write this
25							module.
26
27							This module is a L-based, with C method being the constructor.
28
29							The C method returns a string of one sentence
30							of Chinese like:
31
32							真是完全失敗，孩子！怎麼不動了呢？
33
34							By default, it uses small corpus data from Project Gutenberg. The generated
35							sentences are remixes of the corpus.
36
37							You can feed you own corpus data to the `feed` method:
38
39							my $x = Acme::Lingua::ZH::Remix->new;
40							$x->feed($my_corpus);
41
42							# Say something based on $my_corpus
43							say $x->random_santence;
44
45							The corpus should use full-width punctuation characters.
46
47							=cut
48
49	4			4		1013	use utf8;
	4					28
	4					19
50	4			4		1810	use Moo;
	4					53757
	4					20
51	4			4		6858	use Types::Standard qw(HashRef Int);
	4					275920
	4					43
52	4			4		5322	use List::MoreUtils qw(uniq);
	4					29327
	4					30
53	4			4		5264	use Hash::Merge qw(merge);
	4					28664
	4					5048
54
55							has phrases => (is => "rw", isa => HashRef, lazy => 1, builder => "_build_phrases");
56
57							sub _build_phrases {
58	4			4		58	my $self = shift;
59	4					24	local $/ = undef;
60	4					307	my $corpus = ;
61	4					14	my %phrase;
62	4					17	my @phrases = $self->split_corpus($corpus);
63	4					22	for (@phrases) {
64	2619					4411	my $p = substr($_, -1);
65	2619		100			2761	push @{$phrase{$p} \|\|=[]}, \$_;
	2619					4911
66							}
67	4					153	return \%phrase;
68							}
69
70							sub phrase_count {
71	316886			316886	0	402850	my $self = shift;
72	316886					359715	my %p = %{$self->phrases};
	316886					3964868
73	316886					2049978	my $count = 0;
74	316886					631217	for(keys %p) {
75	2218168					2439015	$count += scalar @{$p{$_}};
	2218168					2879916
76							}
77	316886					647859	return $count;
78							}
79
80	397063			397063	0	1473036	sub random(@) { $_[ rand @_ ] }
81
82							=head1 METHODS
83
84							=head2 split_corpus($corpus_text)
85
86							Takes a scalar, returns an list.
87
88							This is an utility method that does not change the internal state of
89							the topic object.
90
91							=cut
92
93							sub split_corpus {
94	6			6	1	2791	my ($self, $corpus) = @_;
95	6	100				19	return () unless $corpus;
96
97	5					276	$corpus =~ s/^\#.*$//gm;
98
99							# Squeeze whitespaces
100	5					10019	$corpus =~ s/(\s\|　)*//gs;
101
102							# Ignore certain punctuations
103	5					436	$corpus =~ s/(——\|──)//gs;
104
105	5					3904	my @xc = split /(?:（(.+?)）\|：?「(.+?)」\|〔(.+?)〕\|“(.+?)”)/, $corpus;
106							my @phrases = uniq sort grep /.(，\|。\|？\|！)$/,
107							map {
108	479					4024	my @x = split /(，\|。\|？\|！)/, $_;
109	479					661	my @r = ();
110	479					738	while (@x) {
111	2846					3797	my $s = shift @x;
112	2846	100				4151	my $p = shift @x or next;
113
114	2735					4147	$s =~ s/^(，\|。\|？\|！\|\s)+//;
115	2735					5516	push @r, "$s$p";
116							}
117	479					6253	@r;
118							} map {
119	479					708	s/^\s+//;
120	479					820	s/\s+$//;
121	479					640	s/^(.+?) //;
122	479					608	$_;
123	5					25	} grep { $_ } @xc;
	1415					1565
124
125	5					555	return @phrases;
126							}
127
128							=head2 feed($corpus_text)
129
130							Instance method. Takes a scalar, return the topic object.
131
132							Merge C<$corpus_text> into the internal phrases corpus of the object.
133
134							=cut
135
136							sub feed {
137	1			1	1	2877	my $self = shift;
138	1					2	my $corpus = shift;
139
140	1					2	my %phrase;
141	1					3	my @phrases = $self->split_corpus($corpus);
142
143	1					4	for (@phrases) {
144	4					11	my $p = substr($_, -1);
145	4		50			6	push @{$phrase{$p} \|\|=[]}, \$_;
	4					17
146							}
147
148	1					20	$self->phrases(merge($self->phrases, \%phrase));
149	1					315	return $self;
150							}
151
152							sub phrase_ratio {
153	316884			316884	0	404893	my $self = shift;
154	316884					363599	my $type = shift;
155	316884		100			4339334	my $phrases = $self->phrases->{$type}\|\|=[];
156	316884					1843753	my $count = $self->phrase_count;
157	316884	50				542821	return 0 if $count == 0;
158	316884					355383	return @{$phrases} / $count;
	316884					727842
159							}
160
161							sub random_phrase {
162	79806			79806	0	102195	my $self = shift;
163	79806					114307	my $type = shift;
164
165	79806	50	50			89331	return ${ random(@{ $self->phrases->{$type}\|\|=[] }) \|\| \'' };
	79806					109073
	79806					1081226
166							}
167
168							=head2 random_sentence( min => $min, max => $max )
169
170							Instance method. Optionally takes "min" or "max" parameter as the constraint of
171							sentence length (number of characters).
172
173							Both min and max values are required to be integers greater or equal to
174							zero. The value of max should be greater then the value of min. If any of these
175							values are invalidate, it is treated as if they are not passed.
176
177							The default values of min, max are 0 and 140, respectively.
178
179							The implementation random algorthm based, thus it needs indefinite time to
180							generate the result. If it takes more then 1000 iterations, it aborts and return
181							the results anyway, regardless the length constraint. This can happen when the
182							lengths of phrases from corpus do no adds up to a value within the given range.
183
184							The returned scalar is the generate sentence string of wide characters. (Which
185							makes Encode::is_utf8 return true.)
186
187							=cut
188
189							sub random_sentence {
190	301			301	1	184853	my ($self, %options) = @_;
191
192	301					686	for my $p (qw(min max)) {
193	602					966	my $x = $options{$p};
194	602	50	66			2229	unless (defined($x) && int($x) eq $x && $x >= 0) {
			66
195	402					663	delete $options{$p}
196							}
197							}
198
199	301	50	66			1088	if (defined($options{max}) && defined($options{min}) && $options{max} < $options{min}) {
			66
200	0					0	delete $options{max};
201	0					0	delete $options{min};
202							}
203
204	301		100			1269	$options{min} \|\|= 0;
205	301		100			1108	$options{max} \|\|= 140;
206
207	301					494	my $str = "";
208	301					818	my @phrases;
209
210	301		50			698	my $ending = $self->random_phrase(random(qw/。！？/)) \|\| "…";
211
212	301					1099	while ( length($ending) > $options{max} ) {
213	72		50			136	$ending = $self->random_phrase(random(qw/。！？/)) \|\| "…";
214							}
215
216	301					779	unshift @phrases, $ending;
217
218	301					470	my $l = length($ending);
219
220	301					455	my $iterations = 0;
221	301					458	my $max_iterations = 1000;
222	301					706	my $average = ($options{min} + $options{max}) / 2;
223	301		33			1202	my $desired = int(rand($options{max} - $options{min}) + $options{min}) \|\| $average \|\| $options{max};
224
225	301					691	while ($iterations++ < $max_iterations) {
226	79433					91149	my $x;
227	79433					88448	do {
228	316884					490208	$x = random('，', '」', '）', '/')
229							} while ($self->phrase_ratio($x) == 0);
230
231	79433					145349	my $p = $self->random_phrase($x);
232
233	79433	100				201687	if ($l + length($p) < $options{max}) {
234	3353					6437	unshift @phrases, $p;
235	3353					4379	$l += length($p);
236							}
237
238	79433					139014	my $r = abs(1 - $l/$desired);
239	79433	100				128174	last if $r < 0.1;
240	79230	100	100			221516	last if $r < 0.2 && $iterations >= $max_iterations/2;
241							}
242
243	301					1254	$str = join "", @phrases;
244	301					1060	$str =~ s/，$//;
245	301					625	$str =~ s/^「(.+)」$/$1/;
246
247	301	100				715	if (rand > 0.5) {
248	152					403	$str =~ s/(，)……/$1/gs;
249							} else {
250	149					393	$str =~ s/，(……)/$1/gs;
251							}
252
253	301					1474	return $str;
254							}
255
256							1;
257
258							=head1 AUTHOR
259
260							Kang-min Liu
261
262							=head1 COPYRIGHT
263
264							Copyright 2010- by Kang-min Liu,
265
266							This program is free software; you can redistribute it a nd/or modify
267							it under the same terms as Perl itself.
268
269							See L
270
271							=cut
272
273							# Data coming from Wikisource
274							# http://zh.wikisource.org/zh-hant/%E5%BF%98%E4%B8%8D%E4%BA%86%E7%9A%84%E9%81%8E%E5%B9%B4
275
276							__DATA__