File Coverage

blib/lib/Treex/Block/W2A/JA/TagMeCab.pm

Criterion	Covered	Total	%
statement	13	15	86.6
branch			n/a
condition			n/a
subroutine	5	5	100.0
pod			n/a
total	18	20	90.0

line	stmt	sub	time	code
1				package Treex::Block::W2A::JA::TagMeCab;
2				$Treex::Block::W2A::JA::TagMeCab::VERSION = '0.13095';
3	1	1	37319	use strict;
	1		3
	1		85
4	1	1	6	use warnings;
	1		3
	1		38
5
6	1	1	1149	use Moose;
	1		565043
	1		9
7	1	1	16065	use Encode;
	1		13675
	1		105
8	1	1	429	use Treex::Core::Common;
	0
	0
9				use Treex::Tool::Tagger::MeCab;
10
11				extends 'Treex::Core::Block';
12
13				has _form_corrections => (
14				is => 'ro',
15				isa => 'HashRef[Str]',
16				default => sub {
17				{
18				q(``) => q("),
19				q('') => q("),
20				}
21				},
22				documentation => q{Possible changes in forms done by tagger},
23				);
24
25				has tagger => ( is => 'rw' );
26
27				sub BUILD {
28				my ($self) = @_;
29
30				return;
31				}
32
33				sub process_start {
34				my ($self) = @_;
35
36				my $tagger = Treex::Tool::Tagger::MeCab->new();
37				$self->set_tagger( $tagger );
38
39				return;
40				}
41
42				sub process_zone {
43
44
45				my ( $self, $zone ) = @_;
46
47				# get the source sentence
48				my $sentence = $zone->sentence;
49				log_fatal("No sentence in zone") if !defined $sentence;
50				log_fatal(qq{There's already atree in zone}) if $zone->has_atree();
51				log_debug("Processing sentence: $sentence");
52
53				my $result = "";
54
55				my $debug = "";
56
57				my ( @tokens ) = $self->tagger->process_sentence( $sentence );
58
59				# modifies the output format of MeCab wrapper
60				foreach my $token ( @tokens ) {
61				my @features = split /\t/, $token;
62				my $wordform = $features[0];
63
64				my $bTag = $features[1].'-'.$features[2].'-'.$features[3].'-'.$features[4];
65				my $lemma = $features[7];
66
67				if ($bTag !~ "BOS" && $bTag !~ "ç©ºç™½") {
68
69				$lemma = $wordform if $lemma =~ m/\*/;
70				$bTag =~ s{^(.+)$}{<$1>};
71				my $eTag = $bTag;
72				$eTag =~ s{<}{</};
73
74				$result .= ' '.$bTag.$wordform.$eTag.$lemma;
75
76				}
77				}
78				$result =~ s{^\s+}{};
79
80				# split on whitespace, tags nor tokens doesn't contain spaces
81				my @tagged = split /\s+/, $result;
82
83				# create a-tree
84				my $a_root = $zone->create_atree();
85				my $tag_regex = qr{
86				<([^\-]+\-[^\-]+\-[^\-]+\-[^\-]+)> #<tag>
87				([^<]+) #form
88				</\1> #</tag>
89				(.+) #lemma
90				}x;
91				my $space_start = qr{^\s+};
92				my $ord = 1;
93				foreach my $tag_pair (@tagged) {
94				if ( $tag_pair =~ $tag_regex ) {
95				my $form = $2;
96
97				my $tag = $1;
98				$tag =~ s{^<(\w+)>.$}{$1};
99
100				my $lemma = $3;
101
102				$debug .= "$form $tag; ";
103
104				if ( $sentence =~ s/^\Q$form\E// ) {
105
106				# check if there is space after word
107				my $no_space_after = $sentence =~ m/$space_start/ ? 0 : 1;
108				if ( $sentence eq q{} ) {
109				$no_space_after = 0;
110				}
111
112				# delete it
113				$sentence =~ s{$space_start}{};
114
115				# and create node under root
116				$a_root->create_child(
117				form => $form,
118				tag => $tag,
119				lemma => $lemma,
120				no_space_after => $no_space_after,
121				ord => $ord++,
122				);
123				}
124				else {
125				log_fatal("Mismatch between tagged word and original sentence: Tagged: $form; $debug. Original: $sentence.");
126				}
127				}
128				else {
129				log_fatal("Incorrect output format from MeCab: $tag_pair debug: $debug");
130				}
131
132				}
133				return 1;
134				}
135
136				1;
137
138				__END__
139
140				=pod
141
142				=encoding utf-8
143
144				=head1 NAME
145
146				Treex::Block::W2A::JA::TagMeCab
147
148				=head1 VERSION
149
150				version 0.13095
151
152				=head1 DESCRIPTION
153
154				Each sentence is tokenized and tagged using C<MeCab> (Ipadic POS tags).
155				Ipadic tagset uses hierarchical tags. There are four levels of hierarchy,
156				each level is separated by "-". Empty kategories are marked as "*".
157				Tags are in kanji, in the future they should be replaced by Romanized tags or their abbreviations (other japanese treex modules should be modified accordingly).
158
159				=head1 SEE ALSO
160
161				L<MeCab Home Page\|http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html>
162
163				=cut