File Coverage

blib/lib/Treex/Tool/Tagger/MeCab.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package Treex::Tool::Tagger::MeCab;
2				$Treex::Tool::Tagger::MeCab::VERSION = '0.13095';
3	1	1	35055	use strict;
	1		3
	1		42
4	1	1	5	use warnings;
	1		2
	1		37
5
6	1	1	7429	use Moose;
	1		566703
	1		8
7	1	1	7200	use Treex::Core::Common;
	0
	0
8				use Treex::Core::Config;
9				use Treex::Tool::ProcessUtils;
10				use Treex::Core::Resource;
11
12				sub BUILD {    # Moose BUILD hook: locates the MeCab binary, starts it, and stores the pipe handles and pid on the object
13				my ($self) = @_;
14
15				# TODO: find an architecture-independent solution
16				my $bin_path = require_file_from_share(
17				'installed_tools/tagger/MeCab/bin/mecab',
18				ref($self)
19				);
20
21				my $cmd = "$bin_path".' 2>/dev/null';    # the tagger's stderr is discarded
22
23				# start the MeCab tagger (presumably ':encoding(utf-8)' is applied to both handles - confirm in ProcessUtils::bipipe)
24				my ( $reader, $writer, $pid ) = Treex::Tool::ProcessUtils::bipipe( $cmd, ':encoding(utf-8)' );
25
26				$self->{reader} = $reader;    # handle process_sentence reads the tagger output from
27				$self->{writer} = $writer;    # handle process_sentence prints sentences to
28				$self->{pid} = $pid;    # stored, but not read by any code visible in this file
29
30				return;
31				}
32
33				sub process_sentence {    # sends one sentence to MeCab and returns the tagger's output lines (one per token) as a list
34				my ( $self, $sentence ) = @_;
35
36				my @tokens;
37				my $writer = $self->{writer};
38				my $reader = $self->{reader};
39
40				print $writer $sentence."\n";
41
42				my $line = <$reader>;
43
44				# Each output line (wordform plus features) is stored in @tokens as one string whose fields are separated by '\t'.
45				# Other blocks should edit this output as needed.
46				# A line matching "EOS" marks the end of the sentence.
47				while ( $line !~ "EOS" ) {    # NOTE(review): unanchored match, so any line containing "EOS" ends the loop; it is also evaluated before the defined check below
48
49				log_fatal("Unitialized line (perhaps MeCab was not initialized correctly).") if (!defined $line); # even for an empty input string we should get at least an "EOS" line; otherwise the tagger was not initialized correctly
50
51				# we don't want to substitute actual commas from the sentence: a comma directly followed by a tab is temporarily replaced by '#comma'
52				$line =~ s{^(.*),\t}{$1#comma\t};
53
54				$line =~ s{(.),}{$1\t}g;    # every remaining comma that follows some character becomes a tab
55
56				$line =~ s{#comma}{,};    # turn the first '#comma' placeholder back into a comma
57
58				push @tokens, $line;    # no chomp is applied, so a trailing newline stays in the stored string
59				$line = <$reader>;
60				}
61
62				return @tokens;
63
64				}
65
66				# ----------------- cleaning up ------------------
67				# # TODO : cleanup
68
69				1;
70
71				__END__
72
73				=pod
74
75				=encoding utf-8
76
77				=head1 NAME
78
79				Treex::Tool::Tagger::MeCab - Perl wrapper for MeCab, a Japanese morphological analyzer implemented in C++
80
81				=head1 VERSION
82
83				version 0.13095
84
85				=head1 SYNOPSIS
86
87				use Treex::Tool::Tagger::MeCab;
88				my $tagger = Treex::Tool::Tagger::MeCab->new();
89				my $sentence = qw(わたしは日本語を話します);
90				my @tokens = $tagger->process_sentence($sentence);
91
92				=head1 DESCRIPTION
93
94				This is a Perl wrapper for MeCab tagger and tokenizer implemented in C++.
95				Generates a string of features (the first one is the wordform) for each token and returns the array of tokens for further use.
96
97				=head1 INSTALLATION
98
99				Before installing MeCab, make sure you have properly installed the Treex-Core package (see L<Treex Installation|http://ufal.mff.cuni.cz/treex/install.html>), since it is a prerequisite for this module anyway.
100				After installing Treex-Core, you can install MeCab using this L<Makefile|https://svn.ms.mff.cuni.cz/svn/tectomt_devel/trunk/install/tool_installation/MeCab/Makefile> (username "public", password "public"). Prior to running the Makefile, you must set the environment variable "$TMT_ROOT" to the location of your .treex directory.
101
102				You can also install MeCab manually, but then you must link the installation directory to ${TMT_ROOT}/share/installed_tools/tagger/MeCab/ (its location within the Treex share); otherwise the modules will not be able to use the program.
103
104				=head1 METHODS
105
106				=over
107
108				=item @tokens = $tagger->process_sentence($sentence);
109
110				Returns a list of "tokens" for the tokenized input; each token is a string holding the wordform and its morphological categories, separated by \t.
111
112				=back
113
114				=head1 SEE ALSO
115
116				L<MeCab Home Page|http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html>
117
118				=head1 AUTHOR
119
120				Dušan Variš <dvaris@seznam.cz>
121
122				=head1 COPYRIGHT AND LICENSE
123
124				Copyright © 2014 by Institute of Formal and Applied Linguistics, Charles University in Prague
125
126				This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
127
128				=cut