File Coverage

blib/lib/Lingua/JA/Name/Splitter.pm
Criterion Covered Total %
statement 68 88 77.2
branch 23 38 60.5
condition 2 3 66.6
subroutine 9 9 100.0
pod 3 3 100.0
total 105 141 74.4


line stmt bran cond sub pod time code
1             package Lingua::JA::Name::Splitter;
2              
3 1     1   69170 use warnings;
  1         2  
  1         32  
4 1     1   6 use strict;
  1         5  
  1         85  
5             require Exporter;
6             our @ISA = qw(Exporter);
7             our @EXPORT_OK = qw/split_kanji_name split_romaji_name $kkre kkname/;
8             our %EXPORT_TAGS = ('all' => \@EXPORT_OK);
9             our $VERSION = '0.11';
10 1     1   666 use utf8;
  1         15  
  1         4  
11 1     1   31 use Carp;
  1         2  
  1         68  
12 1     1   604 use Lingua::JA::Moji ':all';
  1         50059  
  1         541  
13              
14             # The probabilities that these characters are part of the family name.
15              
16             my %known;
17              
18             my $file = __FILE__;
19             $file =~ s/Splitter\.pm/probabilities.txt/;
20 1     1   8 open my $in, "<:encoding(utf8)", $file or die $!;
  1         2  
  1         18  
21             while (<$in>) {
22             my ($kanji, $prob) = split /\s/, $_;
23             $known{$kanji} = $prob;
24             }
25             close $in or die $!;
26              
27             # The weight to give the position in the kanji if it is a known
28             # kanji.
29              
30             our $length_weight = 0.735; # 42030 successes
31              
32             # The cutoff for splitting the name
33              
34             our $split_cutoff = 0.5;
35              
36             # Set this to a true value to print debugging messages.
37              
38             my $debug;
39              
40             =head2 $kkre
41              
42             Kanji-kana regular expression. This is intended to match kanji and
43             kana names.
44              
45             =cut
46              
47             our $kkre = qr!
48             \p{InCJKUnifiedIdeographs}|
49             [々〆]|
50             \p{InKana}
51             !x;
52              
53             sub kkname
54             {
55 7     7 1 12 my ($kanji) = @_;
56 7 50       92 if ($kanji !~ /^($kkre)+$/) {
57 0         0 return undef;
58             }
59 7         2173 return 1;
60             }
61              
62             sub split_kanji_name
63             {
64 7     7 1 4230 my ($kanji) = @_;
65             # Validate the user's input
66 7 50       20 if (! $kanji) {
67 0         0 carp "No valid name was provided to split_kanji_name";
68 0         0 return undef;
69             }
70 7 50       24 if (length $kanji == 1) {
71 0         0 carp "$kanji is only one character long, so there is nothing to split";
72 0         0 return ($kanji, '');
73             }
74 7 50       17 if (! kkname ($kanji)) {
75 0         0 carp "$kanji does not look like a kanji/kana name";
76             }
77 7 50       16 if (! wantarray ()) {
78 0         0 carp "The return value of split_kanji_name is an array";
79             }
80             # If the name is only two characters, there is only one possibility.
81 7 50       18 if (length $kanji == 2) {
82 0         0 return split '', $kanji;
83             }
84              
85             # What we guess is the given name part of the name
86 7         13 my $given;
87             # What we guess is the family name part of the name
88             my $family;
89             # The characters in the name, which may not be kanji.
90 7         31 my @kanji = split '', $kanji;
91             # Probability this character is part of the family name.
92 7         12 my @probability;
93             # First character is definitely part of the family name.
94 7         13 $probability[0] = 1;
95             # Last character is definitely part of the given name.
96 7         14 $probability[$#kanji] = 0;
97 7         13 my $length = length $kanji;
98             # Loop from the second kanji to the second-from-last kanji
99 7         21 for my $i (1..$#kanji - 1) {
100 14         33 my $p = 1 - $i / ($length - 1);
101 14         19 my $moji = $kanji[$i];
102 14 100       35 if (is_kana ($moji)) {
    100          
    50          
103             # Assume that hiragana is not part of surname (not correct
104             # in practice).
105 2         21 $p = 0;
106             }
107             elsif ($known{$moji}) {
108 11         129 $p = $length_weight * $p + (1 - $length_weight) * $known{$moji};
109             }
110             elsif ($moji eq '々') {
111             # This repeated kanji has the same probability as the
112             # original kanji
113 1         18 $p = $probability[$i - 1];
114             }
115 14         34 $probability[$i] = $p;
116             }
117 7 50       23 if ($debug) {
118 0         0 print "@probability\n";
119 0         0 print "@kanji\n";
120             }
121 7         12 my $in_given;
122 7         17 for my $i (0..$#kanji) {
123 28 100       60 if ($probability[$i] < $split_cutoff) {
124 15         20 $in_given = 1;
125             }
126 28 100       45 if ($in_given) {
127 15         34 $given .= $kanji[$i];
128             }
129             else {
130 13         23 $family .= $kanji[$i];
131             }
132             }
133 7         33 return ($family, $given);
134             }
135              
136             sub split_romaji_name
137             {
138 2     2 1 1165 my ($name) = @_;
139 2 50       8 if (! $name) {
140 0         0 carp "No name given to split_romaji_name";
141 0         0 return undef;
142             }
143 2 50       5 if (! wantarray ()) {
144 0         0 carp "The return value of split_romaji_name is an array";
145             }
146              
147             # What we guess is the family name
148 2         5 my $last;
149             # What we guess is the personal name
150             my $first;
151 2 50       27 if ($name !~ /\s|,/) {
152 0 0       0 if ($name =~ /^([A-Z][a-z]+)([A-Z]+)$/) {
153 0         0 $first = $1;
154 0         0 $last = $2;
155             }
156             else {
157             # If there is no space or comma, assume that this is the last name.
158 0         0 $first = '';
159 0         0 $last = $name;
160             }
161             }
162             else {
163             # Remove leading and trailing spaces.
164 2         10 $name =~ s/^\s+|\s+$//g;
165 2         13 my @parts = split /,?\s+/, $name;
166 2         6 for (@parts) {
167 4 50       6635 if (! is_romaji_strict ($_)) {
168 0         0 carp "'$_' doesn't look like Japanese romaji";
169             }
170             }
171             # If there are more than two parts to the name after splitting by spaces
172 2 50       1040 if (@parts > 2) {
173 0         0 carp "Strange Japanese name '$name' with middle name?";
174             }
175             # If the last name is capitalized, or if there is a comma in the
176             # name.
177 2 100 66     20 if ($parts[0] =~ /^[A-Z]+$/ || $name =~ /,/) {
178 1         4 $last = $parts[0];
179 1         2 $first = $parts[1];
180             }
181             else {
182 1         3 $last = $parts[1];
183 1         3 $first = $parts[0];
184             }
185             }
186             # Regularise the name
187 2         6 $first = ucfirst lc $first;
188 2         5 $last = ucfirst lc $last;
189 2         8 return ($first, $last);
190             }
191              
192             1;