File Coverage

blib/lib/Lingua/ZH/Toke.pm
Criterion Covered Total %
statement 4 6 66.6
branch n/a
condition n/a
subroutine 2 2 100.0
pod n/a
total 6 8 75.0


line stmt bran cond sub pod time code
1             # $File: //member/autrijus/Lingua-ZH-Toke/lib/Lingua/ZH/Toke.pm $ $Author: autrijus $
2             # $Revision: #2 $ $Change: 9667 $ $DateTime: 2004/01/11 12:56:49 $
3              
4             package Lingua::ZH::Toke;
5             $Lingua::ZH::Toke::VERSION = '0.02';
6              
7 3     3   2655 use strict;
  3         86  
  3         135  
8 3     3   2172 use Lingua::ZH::TaBE ();
  0            
  0            
9              
10             =encoding big5
11              
12             =head1 NAME
13              
14             Lingua::ZH::Toke - Chinese Tokenizer
15              
16             =head1 VERSION
17              
18             This document describes version 0.02 of Lingua::ZH::Toke, released
19             January 11, 2004.
20              
21             =head1 SYNOPSIS
22              
23             use Lingua::ZH::Toke;
24              
25             # -- if inputs are unicode strings, use the two lines below instead
26             # use utf8;
27             # use Lingua::ZH::Toke 'utf8';
28              
29             # Create Lingua::ZH::Toke::Sentence object (->Sentence also works)
30             my $token = Lingua::ZH::Toke->new( '那人卻在/燈火闌珊處/益發意興闌珊' );
31              
32             # Easy tokenization via array deferencing
33             print $token->[0] # Fragment - 那人卻在
34             ->[2] # Phrase - 卻在
35             ->[0] # Character - 卻
36             ->[0] # Pronounciation - ㄑㄩㄝˋ
37             ->[2]; # Phonetic - ㄝ
38              
39             # Magic histogram via hash deferencing
40             print $token->{'那人卻在'}; # 1 - One such fragment there
41             print $token->{'意興闌珊'}; # 1 - One such phrase there
42             print $token->{'發意興闌'}; # undef - That's not a phrase
43             print $token->{'珊'}; # 2 - Two such character there
44             print $token->{'ㄧˋ'}; # 2 - Two such pronounciation: 益意
45             print $token->{'ㄨ'}; # 3 - Three such phonetics: 那火處
46              
47             # Iteration over fragments
48             while (my $fragment = <$token>) {
49             # Iteration over phrases
50             while (my $phrase = <$fragment>) {
51             # ...
52             }
53             }
54              
55             =head1 DESCRIPTION
56              
57             This module puts a thin wrapper around L, by blessing
58             refereces to B's objects into its English counterparts.
59              
60             Besides offering more readable class names, this module also offers
61             various overloaded methods for tokenization; please see L for
62             the three major ones.
63              
64             Since L is a Big5-oriented module, we also provide a
65             simple utf8 layer around it; if you have Perl version 5.6.1 or later,
66             just use this:
67              
68             use utf8;
69             use Lingua::ZH::Toke 'utf8';
70              
71             With the C flag set, all B objects will stringify to unicode
72             strings, and constructors will take either unicode strings, or
73             big5-encoded bytestrings.
74              
75             Note that on Perl 5.6.x, L is needed for the C
76             feature to work.
77              
78             =head1 METHODS
79              
80             The constructor methods correspond to the six object levels:
81             C<-ESentence>, C<-EFragment>, C<-EPhrase>, C<-ECharacter>,
82             C<-EPronounciation> and C<-EPhonetic>. Each of them takes one
83             string argument, representing the string to be tokenized.
84              
85             The C<-Enew> method is an alias to C<-E>Sentence>.
86              
87             All object methods, except C<-Enew>, are passed to the underlying
88             B object.
89              
90             =head1 CAVEATS
91              
92             Under I mode, you may sometimes need to explicitly stringify
93             the return values, so their utf8 flag can be properly set:
94              
95             $value = $token->[0]; # this may or may not work
96             $value = "$token->[0]"; # this is guaranteed to work
97              
98             This module does not care about efficiency or memory consumption yet,
99             hence it's likely to fail miserably if you demand either of them.
100             Patches welcome.
101              
102             As the name suggests, the chosen interface is very bizzare. Use it at
103             the risk of your own sanity.
104              
105             =cut
106              
107             use vars '$AUTOLOAD';
108              
109             my @hier = qw(Chu Chunk Tsi Zhi Yin ZuYin);
110             my @name = qw(Sentence Fragment Phrase Character Pronounciation Phonetic);
111              
112             my %next; @next{'', @hier} = (@hier, '');
113             my %tabe; @tabe{@hier, @name} = (@hier, @hier);
114             my %toke; @toke{@name, @hier} = (@name, @name);
115              
116             for my $h (\%next, \%tabe, \%toke) {
117             $h->{_tabe($_)} = $h->{_toke($_)} = $h->{$_} for grep $_, keys %$h;
118             }
119              
120             { no strict 'refs'; @{_toke($_) . '::ISA'} = __PACKAGE__ for @name }
121              
122             my (%hist, %iter, $_b2u, $_u2b);
123              
124             BEGIN { $_b2u = $_u2b = sub { ${$_[0]} } }
125              
126             sub import {
127             my $class = shift;
128             my $encoding = shift;
129             if ($encoding eq 'utf8') {
130             if ($] < 5.007) {
131             eval { require Encode::compat; 1 }
132             or die "Pre-5.8 perls needs Encode::compat to use the 'utf8' feature";
133             }
134              
135             require Encode;
136              
137             $_b2u = sub {
138             Encode::decode( big5 => ${$_[0]} )
139             };
140             $_u2b = sub {
141             Encode::is_utf8(${$_[0]})
142             ? Encode::encode( big5 => ${$_[0]} )
143             : ${$_[0]};
144             };
145             }
146             }
147              
148             use overload (
149             '""' => sub { $_b2u->(@_) },
150             '0+' => sub {
151             # scalar @{$_[0]}
152             # ... somehow look up freq ...
153             },
154             '@{}' => sub {
155             my $meth = ${$_[0]}->can(lc("$next{_tabe($_[0])}s")) or return [];
156             [ map bless(\$_, _toke($_)), $meth->(${$_[0]}) ]
157             },
158             '%{}' => sub {
159             $hist{overload::StrVal(${$_[0]})} ||= do {
160             my %o; $o{"$_"}++ for @{$_[0]};
161             my %h;
162             for my $c (@{$_[0]}) {
163             $h{"$_"} += $c->{"$_"} for keys %$c;
164             }
165             +{ %o, %h };
166             };
167             },
168             '<>' => sub {
169             $_[0]->[$iter{overload::StrVal($_[0])}++];
170             },
171             'fallback' => 1,
172             );
173              
174             my $Tabe;
175              
176             sub new {
177             my $class = shift;
178             my $child = $_[1] || $class;
179             my $method = $tabe{ref($child) || $child} || $hier[0];
180             my $obj = ($Tabe ||= Lingua::ZH::TaBE->new)->$method($_u2b->(\$_[0]));
181             my $self = bless(\$obj, _toke($obj));
182             }
183              
184             sub AUTOLOAD {
185             no strict 'refs';
186             $AUTOLOAD =~ s/.*:://;
187              
188             my $name = _toke($AUTOLOAD)
189             or return ${$_[0]}->$AUTOLOAD(@_[1..$#_]);
190              
191             return $name->new(@_[1..$#_]);
192             }
193              
194             sub CLONE { }
195             sub DESTROY { }
196              
197             sub _tabe { 'Lingua::ZH::TaBE::' . ($tabe{ref($_[0]) || $_[0]} || die $_[0]) }
198             sub _toke { 'Lingua::ZH::Toke::' . ($toke{ref($_[0]) || $_[0]} || die $_[0]) }
199              
200             1;
201              
202             =head1 SEE ALSO
203              
204             L, L, L
205              
206             =head1 AUTHORS
207              
208             Autrijus Tang Eautrijus@autrijus.orgE
209              
210             =head1 COPYRIGHT
211              
212             Copyright 2003, 2004 by Autrijus Tang Eautrijus@autrijus.orgE.
213              
214             This program is free software; you can redistribute it and/or modify it
215             under the same terms as Perl itself.
216              
217             See L
218              
219             =cut