File Coverage

blib/lib/Treex/Tool/Tagger/MeCab.pm
Criterion Covered Total %
statement 10 12 83.3
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 14 16 87.5


line stmt bran cond sub pod time code
1             package Treex::Tool::Tagger::MeCab;
2             $Treex::Tool::Tagger::MeCab::VERSION = '0.13095';
3 1     1   35055 use strict;
  1         3  
  1         42  
4 1     1   5 use warnings;
  1         2  
  1         37  
5              
6 1     1   7429 use Moose;
  1         566703  
  1         8  
7 1     1   7200 use Treex::Core::Common;
  0            
  0            
8             use Treex::Core::Config;
9             use Treex::Tool::ProcessUtils;
10             use Treex::Core::Resource;
11              
12             sub BUILD {
               # Moose construction hook: locates the MeCab binary, starts it as a
               # child process and keeps the pipe handles and the process id on
               # the object so that process_sentence() can talk to it.
               # Takes only the freshly constructed object and returns nothing.
13             my ($self) = @_;
14              
15             # TODO: find an architecture-independent solution (the binary path below is fixed)
               # NOTE(review): require_file_from_share is presumably provided by
               # Treex::Core::Resource and returns a local path -- confirm there.
16             my $bin_path = require_file_from_share(
17             'installed_tools/tagger/MeCab/bin/mecab',
18             ref($self)
19             );
20            
               # The trailing ' 2>/dev/null' discards everything the tagger writes to stderr.
21             my $cmd = "$bin_path".' 2>/dev/null';
22              
23             # start the MeCab tagger; bipipe() hands back a read handle, a write handle and a pid
               # NOTE(review): ':encoding(utf-8)' looks like the I/O layer for the
               # handles -- confirm against Treex::Tool::ProcessUtils::bipipe.
24             my ( $reader, $writer, $pid ) = Treex::Tool::ProcessUtils::bipipe( $cmd, ':encoding(utf-8)' );
25              
               # Stored directly in the object hash (no Moose attributes are declared for them).
26             $self->{reader} = $reader;
27             $self->{writer} = $writer;
28             $self->{pid} = $pid;
29              
30             return;
31             }
32              
33             sub process_sentence {
               # Sends one sentence to the running tagger and collects its answer.
               #   $sentence - text to analyse; a newline is appended before sending
               # Returns the list of output lines read before the end-of-sentence
               # marker, with commas converted to tabs. Lines are not chomped here.
               # Calls log_fatal when the reader yields no line at all.
34             my ( $self, $sentence ) = @_;
35              
36             my @tokens;
37             my $writer = $self->{writer};
38             my $reader = $self->{reader};
39              
40             print $writer $sentence."\n";
41              
42             my $line = <$reader>;
43              
44             # Each output line (word form followed by its features) is stored in @tokens as one string in which the word form and every feature are separated by '\t'.
45             # Other blocks are expected to post-process this output as needed.
46             # A line matching "EOS" marks the end of the sentence.
               # NOTE(review): the pattern is unanchored, so any output line that merely
               # contains "EOS" also ends the loop; the rest of that sentence's output
               # would then still be waiting in the pipe. Confirm whether /^EOS$/ is meant.
               # NOTE(review): $line is matched here before the defined() check below runs.
47             while ( $line !~ "EOS" ) {
48            
49             log_fatal("Unitialized line (perhaps MeCab was not initialized correctly).") if (!defined $line); # even for an empty input string at least the "EOS" line is expected; no line at all suggests the tagger did not start correctly
50            
51             # A comma that is itself (the end of) the word form must survive the comma-to-tab conversion below,
               # so the comma directly in front of a tab is first replaced by the placeholder "#comma".
52             $line =~ s{^(.*),\t}{$1#comma\t};
53              
               # Every comma that follows some character becomes a tab (this splits the feature list).
54             $line =~ s{(.),}{$1\t}g;
55              
               # Put the protected comma back (only the first "#comma" in the line is replaced).
               # NOTE(review): input text that itself contains "#comma" would be altered here.
56             $line =~ s{#comma}{,};
57              
58             push @tokens, $line;
59             $line = <$reader>;
60             }
61              
62             return @tokens;
63              
64             }
65              
66             # ----------------- cleaning up ------------------
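67             # TODO: cleanup -- nothing in this module closes the reader/writer handles or waits for the tagger process started in BUILD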
68              
69             1;
70              
71             __END__
72              
73             =pod
74              
75             =encoding utf-8
76              
77             =head1 NAME
78              
79             Treex::Tool::Tagger::MeCab - Perl wrapper for MeCab, a Japanese morphological analyzer implemented in C++
80              
81             =head1 VERSION
82              
83             version 0.13095
84              
85             =head1 SYNOPSIS
86              
87             use Treex::Tool::Tagger::MeCab;
88             my $tagger = Treex::Tool::Tagger::MeCab->new();
89             my $sentence = 'わたしは日本語を話します';
90             my @tokens = $tagger->process_sentence($sentence);
91              
92             =head1 DESCRIPTION
93              
94             This is a Perl wrapper for MeCab tagger and tokenizer implemented in C++.
95             For each token it produces one string of tab-separated features (the first field is the word form) and returns the list of these strings for further processing.
96              
97             =head1 INSTALLATION
98              
99             Before installing MeCab, make sure you have properly installed the Treex-Core package (see L<Treex Installation|http://ufal.mff.cuni.cz/treex/install.html>), since it is a prerequisite for this module anyway.
100             After installing Treex-Core, you can install MeCab using this L<Makefile|https://svn.ms.mff.cuni.cz/svn/tectomt_devel/trunk/install/tool_installation/MeCab/Makefile> (username "public", password "public"). Prior to running the Makefile, you must set the environment variable "$TMT_ROOT" to the location of your .treex directory.
101              
102             You can also install MeCab manually, but then you must link the installation directory to ${TMT_ROOT}/share/installed_tools/tagger/MeCab/ (its location within the Treex share); otherwise the modules will not be able to use the program.
103              
104             =head1 METHODS
105              
106             =over
107              
108             =item @tokens = $tagger->process_sentence($sentence);
109              
110             Returns a list of "tokens" for the tokenized input; each token is a string holding the word form and its morphological categories, separated by \t.
111              
112             =back
113              
114             =head1 SEE ALSO
115              
116             L<MeCab Home Page|http://mecab.googlecode.com/svn/trunk/mecab/doc/index.html>
117              
118             =head1 AUTHOR
119              
120             Dušan Variš <dvaris@seznam.cz>
121              
122             =head1 COPYRIGHT AND LICENSE
123              
124             Copyright © 2014 by Institute of Formal and Applied Linguistics, Charles University in Prague
125              
126             This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
127              
128             =cut