File Coverage

blib/lib/Regexp/Chinese/TradSimp.pm
Criterion Covered Total %
statement 28 28 100.0
branch 8 8 100.0
condition n/a
subroutine 6 6 100.0
pod 2 4 50.0
total 44 46 95.6


line stmt bran cond sub pod time code
1             package Regexp::Chinese::TradSimp;
2              
3 6     6   9497 use strict;
  6         10  
  6         201  
4              
5 6     6   5644 use Encode::HanConvert;
  6         141760  
  6         3440  
6              
7             our $VERSION = '0.01';
8              
9             =encoding utf8
10              
11             =head1 NAME
12              
13             Regexp::Chinese::TradSimp - Take a string containing Chinese text, and turn it
14             into a traditional-simplified-insensitive regexp.
15              
16             =head1 SYNOPSIS
17              
18             #!/usr/bin/perl -w
19             use strict;
20             use utf8;
21              
22             my $regexp = Regexp::Chinese::TradSimp->make_regexp( "鳳爪" );
23              
24             my $text = "豉汁蒸凤爪";
25             if ( $text =~ $regexp ) {
26             print "Chicken feet detected!\n";
27             }
28              
29             # Alternatively:
30             my $tradsimp = Regexp::Chinese::TradSimp->new;
31             my $regexp = $tradsimp->make_regexp( "鳳爪" );
32              
33             =head1 DESCRIPTION
34              
35             Given a string containing Chinese text, transforms it into a regexp that can
36             be used to match both the simplified and the traditional version of the
37             text. The distribution also includes a commandline tool, C
38             (Bsensitise Braditional-Bimplified).
39              
40             =head1 METHODS
41              
42             =over
43              
44             =item B
45              
46             # This returns /[凤鳳]爪/.
47             my $regexp = Regexp::Chinese::TradSimp->make_regexp( "鳳爪" );
48              
49             # This returns /[水虾蝦][饺餃]/
50             my $regexp = Regexp::Chinese::TradSimp->make_regexp( "[水蝦]餃" );
51              
52             # This returns /([虾蝦]|[带帶]子)[饺餃]/
53             my $regexp = Regexp::Chinese::TradSimp->make_regexp( "(虾|带子)饺" );
54              
55             C attempts to create a regular expression that will match its
56             argument in a traditional-simplified-insensitive way. The argument should
57             be a string of Chinese characters, but you can include certain other aspects
58             of regular expressions such as character classes and bracketed groupings.
59             Arguments of forms other than those shown above are not guaranteed to work.
60              
61             =item B
62              
63             Does exactly the same as C but returns a string instead of a
64             regexp, e.g. "[凤鳳]爪" rather than /[凤鳳]爪/.
65              
66             We are also -ise/-ize agnostic:
67              
68             # These do the same thing.
69             my $regexp = $tradsimp->desensitise( qr/叉燒包/ );
70             my $regexp = $tradsimp->desensitize( qr/叉燒包/ );
71              
72             =back
73              
74             =cut
75              
76             sub new {
77 5     5 0 444 my ( $class, @args ) = @_;
78 5         13 my $self = { };
79 5         18 bless $self, $class;
80 5         17 return $self;
81             }
82              
83             sub desensitise {
84 15     15 1 27 my $self = shift;
85 15         28 my $text = shift;
86 15         24 my ( @characters, $inclass );
87              
88 15         77 foreach my $character ( split //, $text ) {
89             # This is hairy and fragile.
90 89 100       258 if ( $character eq "[" ) {
    100          
91 1         3 $inclass++;
92             } elsif ( $character eq "]" ) {
93 1         1 $inclass--;
94             }
95 89         217 my $trad = trad_to_simp( $character );
96 89         2223 my $simp = simp_to_trad( $character );
97 89 100       1927 if ( $trad eq $simp ) {
    100          
98 72         151 push @characters, $character;
99             } elsif ( $inclass ) {
100 1         3 push @characters, "$trad$simp";
101             } else {
102 16         66 push @characters, "[$trad$simp]";
103             }
104             }
105              
106 15         56 $text = join( "", @characters );
107 15         48 return $text;
108             }
109              
110             sub desensitize {
111 1     1 0 18 return desensitise( @_ );
112             }
113              
114             sub make_regexp {
115 14     14 1 2372 my $text = desensitise( @_ );
116 14         296 return qr/$text/;
117             }
118              
119             =head1 AUTHOR
120              
121             Kake L Pugh
122              
123             =head1 COPYRIGHT
124              
125             Copyright (C) 2010 Kake L Pugh. All Rights Reserved.
126              
127             This is free software; you can redistribute it and/or modify it under
128             the same terms as Perl itself.
129              
130             =cut
131              
132             1;