File Coverage

blib/lib/Text/Shirasu.pm

Criterion	Covered	Total	%
statement	13	15	86.6
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	18	20	90.0

line	stmt	sub	time	code
1				package Text::Shirasu;
2
3	5	5	135044	use strict;
	5		15
	5		150
4	5	5	32	use warnings;
	5		12
	5		139
5	5	5	510	use utf8;
	5		23
	5		35
6
7	5	5	144	use Exporter 'import';
	5		18
	5		157
8	5	5	7269	use Text::MeCab;
	0
	0
9				use Carp 'croak';
10				use Text::Shirasu::Node;
11				use Text::Shirasu::Tree;
12				use Lingua::JA::NormalizeText;
13				use Encode qw/encode_utf8 decode_utf8/;
14
15				our $VERSION = "0.0.4";
16				our @EXPORT_OK = (@Lingua::JA::NormalizeText::EXPORT_OK, qw/normalize_hyphen normalize_symbols/);
17
18				*nfkc = \&Lingua::JA::NormalizeText::nfkc;
19				*nfkd = \&Lingua::JA::NormalizeText::nfkd;
20				*nfc = \&Lingua::JA::NormalizeText::nfc;
21				*nfd = \&Lingua::JA::NormalizeText::nfd;
22				*decode_entities = \&Lingua::JA::NormalizeText::decode_entities;
23				*alnum_z2h = \&Lingua::JA::NormalizeText::alnum_z2h;
24				*alnum_h2z = \&Lingua::JA::NormalizeText::alnum_h2z;
25				*space_z2h = \&Lingua::JA::NormalizeText::space_z2h;
26				*space_h2z = \&Lingua::JA::NormalizeText::space_h2z;
27				*katakana_z2h = \&Lingua::JA::NormalizeText::katakana_z2h;
28				*katakana_h2z = \&Lingua::JA::NormalizeText::katakana_h2z;
29				*katakana2hiragana = \&Lingua::JA::NormalizeText::katakana2hiragana;
30				*hiragana2katakana = \&Lingua::JA::NormalizeText::hiragana2katakana;
31				*dakuon_normalize = \&Lingua::JA::NormalizeText::dakuon_normalize;
32				*handakuon_normalize = \&Lingua::JA::NormalizeText::handakuon_normalize;
33				*all_dakuon_normalize = \&Lingua::JA::NormalizeText::all_dakuon_normalize;
34				*square2katakana = \&Lingua::JA::NormalizeText::square2katakana;
35				*circled2kana = \&Lingua::JA::NormalizeText::circled2kana;
36				*circled2kanji = \&Lingua::JA::NormalizeText::circled2kanji;
37				*strip_html = \&Lingua::JA::NormalizeText::strip_html;
38				*wave2tilde = \&Lingua::JA::NormalizeText::wave2long;
39				*tilde2wave = \&Lingua::JA::NormalizeText::tilde2wave;
40				*wavetilde2long = \&Lingua::JA::NormalizeText::wavetilde2long;
41				*wave2long = \&Lingua::JA::NormalizeText::wave2long;
42				*tilde2long = \&Lingua::JA::NormalizeText::tilde2long;
43				*fullminus2long = \&Lingua::JA::NormalizeText::fullminus2long;
44				*dashes2long = \&Lingua::JA::NormalizeText::dashes2long;
45				*drawing_lines2long = \&Lingua::JA::NormalizeText::drawing_lines2long;
46				*unify_long_repeats = \&Lingua::JA::NormalizeText::unify_long_repeats;
47				*unify_long_spaces = \&Lingua::JA::NormalizeText::unify_long_spaces;
48				*unify_whitespaces = \&Lingua::JA::NormalizeText::unify_whitespaces;
49				*trim = \&Lingua::JA::NormalizeText::trim;
50				*ltrim = \&Lingua::JA::NormalizeText::ltrim;
51				*rtrim = \&Lingua::JA::NormalizeText::rtrim;
52				*nl2space = \&Lingua::JA::NormalizeText::nl2space;
53				*unify_nl = \&Lingua::JA::NormalizeText::unify_nl;
54				*tab2space = \&Lingua::JA::NormalizeText::tab2space;
55				*old2new_kana = \&Lingua::JA::NormalizeText::old2new_kana;
56				*remove_controls = \&Lingua::JA::NormalizeText::remove_controls;
57				*remove_spaces = \&Lingua::JA::NormalizeText::remove_spaces;
58				*remove_DFC = \&Lingua::JA::NormalizeText::remove_DFC;
59				*old2new_kanji = \&Lingua::JA::NormalizeText::old2new_kanji;
60				*decompose_parenthesized_kanji
61				= \&Lingua::JA::NormalizeText::decompose_parenthesized_kanji;
62
63				=encoding utf-8
64
65				=head1 NAME
66
67				Text::Shirasu - Text::MeCab wrapped for natural language processing
68
69				=head1 SYNOPSIS
70
71				use utf8;
72				use feature ':5.10';
73				use Text::Shirasu;
74				my $ts = Text::Shirasu->new(cabocha => 1); # you can use Text::CaboCha
75				my $normalize = $ts->normalize("昨日の晩御飯は「鮭のふりかけ」と「味噌汁」だけでした。");
76				$ts->parse($normalize);
77
78				for my $node (@{ $ts->nodes }) {
79				say $node->surface;
80				}
81
82				say $ts->join_surface;
83
84				my $filter = $ts->filter(type => [qw/名詞助動詞/], 記号 => [qw/括弧開括弧閉/]);
85				say $filter->join_surface;
86
87				for my $tree (@{ $ts->trees }) {
88				say $tree->surface;
89				}
90
91				=head1 DESCRIPTION
92
93				Text::Shirasu is wrapped L.
94				This module is easy to normalize text and filter part of speech.
95				Also to use L by setting the cabocha option to true.
96
97				=cut
98
99				=head1 METHODS
100				=cut
101				=head2 new
102
103				Text::Shirasu->new(
104				# If you want to use cabocha
105				cabocha => 1,
106				# Text::MeCab arguments
107				rcfile => $rcfile, # Also it will be ailias as mecabrc for Text::CaboCha
108				dicdir => $dicdir, # Also it will be ailias as mecab_dicdir for Text::CaboCha
109				userdic => $userdic, # Also it will be ailias as mecab_userdic for Text::CaboCha
110				lattice_level => $lattice_level,
111				all_morphs => $all_morphs,
112				output_format_type => $output_format_type,
113				partial => $partial,
114				node_format => $node_format,
115				unk_format => $unk_format,
116				bos_format => $bos_format,
117				eos_format => $eos_format,
118				input_buffer_size => $input_buffer_size,
119				allocate_sentence => $allocate_sentence,
120				nbest => $nbest,
121				theta => $theta,
122
123				# Text::CaboCha arguments
124				ne => $ne,
125				parser_model => $parser_model_file,
126				chunker_model => $chunker_model_file,
127				ne_model => $ne_tagger_model_file,
128				);
129
130				=cut
131
132				sub new {
133				my $class = shift;
134				my %args = ref $_[0] eq 'HASH' ? %{ $_[0] } : @_;
135				my %cabocha_opts;
136				my $use_cabocha = delete $args{cabocha};
137				if ($use_cabocha) {
138				local $@;
139				eval { require Text::CaboCha };
140				if ($@ \|\| $Text::CaboCha::VERSION < "0.04") {
141				croak("If you want to use some functions of Text::CaboCha, you need to install Text::CaboCha >= 0.04");
142				}
143				# Arguments for Text::Cabocha
144				for my $opt (qw/ne parser_model chunker_model ne_model/) {
145				if (exists $args{$opt}) {
146				$cabocha_opts{$opt} = delete $args{$opt};
147				}
148				}
149				# Get from arguments of Text::MeCab
150				for my $opt (qw/rcfile dicdir userdic/) {
151				if (exists $args{$opt}) {
152				if ($opt eq 'rcfile') {
153				$cabocha_opts{mecabrc} = $args{$opt};
154				} else {
155				$cabocha_opts{"mecab_${opt}"} = $args{$opt};
156				}
157				}
158				}
159				}
160
161				my $self = bless {
162				mecab => Text::MeCab->new(%args),
163				nodes => +[],
164				normalize => +[qw/
165				nfkc
166				nfkd
167				nfc
168				nfd
169				alnum_z2h
170				space_z2h
171				katakana_h2z
172				decode_entities
173				unify_nl
174				unify_whitespaces
175				unify_long_spaces
176				trim
177				old2new_kana
178				old2new_kanji
179				tab2space
180				all_dakuon_normalize
181				square2katakana
182				circled2kana
183				circled2kanji
184				decompose_parenthesized_kanji
185				/, \&normalize_hyphen, \&normalize_symbols
186				],
187				} => $class;
188
189				if ($use_cabocha) {
190				$self->{trees} = +[];
191				$self->{cabocha} = Text::CaboCha->new(%cabocha_opts);
192				}
193
194				return $self;
195				}
196
197				=head2 parse
198
199				This method wraps the parse method of Text::MeCab.
200				The analysis result is saved as array reference of Text::Shirasu::Node instance in the Text::Shirasu instance.
201				Also, If you used cabocha mode, it save as array reference of Text::Shirasu::Tree instance in the Text::Shirasu instance when used this method.
202				It return Text::Shirasu instance.
203
204				$ts->parse("このおにぎりは「母」が握ってくれたものです。");
205
206				=cut
207
208				sub parse {
209				my $self = shift;
210				my $sentence = $_[0];
211
212				croak "Sentence has not been inputted" unless $sentence;
213
214				my $mt = $self->{mecab};
215
216				# initialize
217				$self->{nodes} = [];
218				my $node = $mt->parse($sentence);
219
220				# when cabocha mode
221				if (exists $self->{cabocha}) {
222				my $ct = $self->{cabocha};
223				my $tree = $ct->parse_from_node($node);
224				my $cid = 0;
225				for my $token (@{ $tree->tokens }) {
226				if ($token->chunk) {
227				push @{ $self->{trees} }, bless {
228				cid => $cid++,
229				link => $token->chunk->link,
230				head_pos => $token->chunk->head_pos,
231				func_pos => $token->chunk->func_pos,
232				score => $token->chunk->score,
233				surface => $token->surface,
234				feature => [ split /,/, $token->feature ],
235				ne => $token->ne,
236				}, 'Text::Shirasu::Tree';
237				}
238				}
239				}
240
241				for (; $node && $node->surface; $node = $node->next) {
242				push @{ $self->{nodes} }, bless {
243				id => $node->id,
244				surface => $node->surface,
245				feature => [ split /,/, $node->feature ],
246				length => $node->length,
247				rlength => $node->rlength,
248				rcattr => $node->rcattr,
249				lcattr => $node->lcattr,
250				stat => $node->stat,
251				isbest => $node->isbest,
252				alpha => $node->alpha,
253				beta => $node->beta,
254				prob => $node->prob,
255				wcost => $node->wcost,
256				cost => $node->cost,
257				}, 'Text::Shirasu::Node';
258				}
259
260				return $self;
261				}
262
263				=head2 normalize
264
265				It will normalize text using L.
266
267				$ts->normalize("あ━ ”（＊）” を〰〰 ’＋１’")
268				$ts->normalize("テキスト〰〰", qw/nfkc, alnum_z2h/, \&your_create_routine)
269
270				It accepts a string as the first argument, and receives the Lingua::JA::NormalizeText options and subroutines after the second argument.
271				If you do not specify a subroutine to be used in normalization, use the following Lingua::JA::NormalizeText options and subroutines by default.
272
273				Please read the documentation of L for details on how each Lingua::JA::NormalizeText option works.
274
275				Lingua::JA::NormalizeText options
276
277				C
278
279				Subroutines
280
281				C
282
283				=cut
284
285				sub normalize {
286				my $self = shift;
287				my $text = shift;
288				my $normalizer = Lingua::JA::NormalizeText->new(@_ ? @_ : @{ $self->{normalize} });
289				$normalizer->normalize(utf8::is_utf8($text) ? $text : decode_utf8($text));
290				}
291
292				=head2 filter
293
294				Please use after parse method execution.
295				Filter the surface based on the features stored in the Text::Shirasu instance.
296				Passing subtype to value with part of speech name as key allows you to more filter the string.
297
298				# filtering nodes only
299				$ts->filter(type => [qw/名詞/]);
300				$ts->filter(type => [qw/名詞記号/], 記号 => [qw/括弧開括弧閉/]);
301
302				# filtering trees only
303				$ts->filter(tree => 1, node => 0, type => [qw/名詞/]);
304				$ts->filter(tree => 1, node => 0, type => [qw/名詞記号/], 記号 => [qw/括弧開括弧閉/]);
305
306				# filtering nodes and trees
307				$ts->filter(tree => 1, type => [qw/名詞/]);
308				$ts->filter(tree => 1, type => [qw/名詞記号/], 記号 => [qw/括弧開括弧閉/]);
309
310				=cut
311
312				sub filter {
313				my $self = shift;
314				my %params = ref $_[0] eq 'HASH' ? %{ $_[0] } : @_;
315
316				# and search filter
317				my @type = @{ delete $params{type} }
318				or croak 'Query has not been inputted: "type"';
319
320				# create parameter as /名詞\|動詞/ or /名詞/
321				my $query = encode_utf8 join '\|', map { $_ } @type;
322
323				# filtering trees
324				if (delete $params{tree}) {
325				$self->{trees} = [
326				grep {
327				$_->{feature}->[0] =~ /($query)/
328				and _sub_query( $_->{feature}->[1], $params{decode_utf8($1)} )
329				} @{ $self->{trees} }
330				];
331				}
332
333				# filtering nodes if unset "node" argument or "node => true value"
334				if (!exists $params{node} \|\| delete $params{node}) {
335				$self->{nodes} = [
336				grep {
337				$_->{feature}->[0] =~ /($query)/
338				and _sub_query( $_->{feature}->[1], $params{decode_utf8($1)} )
339				} @{ $self->{nodes} }
340				];
341				}
342
343				return $self;
344				}
345
346
347				=head2 join_surface
348
349				Returns a string that combined the surfaces stored in the instance.
350
351				$ts->join_surface
352
353				=cut
354
355				sub join_surface {
356				my $self = shift;
357				croak "Does not exist parsed nodes" unless exists $self->{nodes};
358				return join '', map { $_->{surface} } @{ $self->{nodes} };
359				}
360
361				=head2 nodes
362
363				Return the array reference of the Text::Shirasu::Node instance.
364
365				$ts->nodes
366
367				=cut
368
369				sub nodes { $_[0]->{nodes} }
370
371				=head2 trees
372
373				Return the array reference of the Text::Shirasu::Tree instance.
374
375				$ts->trees
376
377				=cut
378
379				sub trees { $_[0]->{trees} }
380
381				=head2 mecab
382
383				Return the Text::MeCab instance.
384
385				$ts->mecab
386
387				=cut
388
389				sub mecab { $_[0]->{mecab} }
390
391				=head2 cabocha
392
393				Return the Text::CaboCha instance.
394
395				$ts->cabocha
396
397				=cut
398
399				sub cabocha { $_[0]->{cabocha} }
400
401				# private
402				sub _sub_query {
403				my ( $subtype, $query ) = @_;
404
405				return 1 unless ref $query eq 'ARRAY';
406
407				my $judge = join '\|', map { encode_utf8($_) } @$query;
408
409				return $subtype =~ /$judge/;
410				}
411
412				1;
413
414				=head1 SUBROUTINES
415
416				These subroutines perform the following substitution.
417
418				=head2 normalize_hyphen
419
420				s/[˗֊‐‑‒–⁃⁻₋−]/-/g;
421				s/[﹣－ｰ—―─━ー]/ー/g;
422				s/[~∼∾〜〰～]//g;
423				s/ー+/ー/g;
424
425				=head2 normalize_symbols
426
427				tr/。、・「」/｡､･｢｣/;
428
429				=cut
430
431				sub normalize_hyphen {
432				local $_ = shift;
433				return undef unless defined $_;
434				s/[˗֊‐‑‒–⁃⁻₋−]/-/g;
435				s/[﹣－ｰ—―─━ー]/ー/g;
436				s/[~∼∾〜〰～]//g;
437				s/ー+/ー/g;
438				$_;
439				}
440
441				sub normalize_symbols {
442				local $_ = shift;
443				return undef unless defined $_;
444				tr/。、・「」/｡､･｢｣/;
445				$_;
446				}
447
448				=head1 LICENSE
449
450				Copyright (C) Kei Kamikawa(Code-Hex).
451
452				This library is free software; you can redistribute it and/or modify
453				it under the same terms as Perl itself.
454
455				=head1 AUTHOR
456
457				Kei Kamikawa Ex00.x7f@gmail.comE
458
459				=cut