File Coverage

blib/lib/Lingua/JA/Name/Splitter.pm
Criterion Covered Total %
statement 79 79 100.0
branch 34 34 100.0
condition 3 3 100.0
subroutine 9 9 100.0
pod 3 3 100.0
total 128 128 100.0


line stmt bran cond sub pod time code
1             package Lingua::JA::Name::Splitter;
2              
3 1     1   86000 use warnings;
  1         8  
  1         34  
4 1     1   5 use strict;
  1         2  
  1         90  
5             require Exporter;
6             our @ISA = qw(Exporter);
7             our @EXPORT_OK = qw/split_kanji_name split_romaji_name $kkre kkname/;
8             our %EXPORT_TAGS = ('all' => \@EXPORT_OK);
9             our $VERSION = '0.12';
10 1     1   684 use utf8;
  1         14  
  1         4  
11 1     1   31 use Carp;
  1         2  
  1         57  
12 1     1   609 use Lingua::JA::Moji ':all';
  1         49626  
  1         545  
13              
14             # The probabilities that these characters are part of the family name.
15              
16             my %known;
17              
18             my $file = __FILE__;
19             $file =~ s/Splitter\.pm/probabilities.txt/;
20 1     1   6 open my $in, "<:encoding(utf8)", $file or die $!;
  1         1  
  1         8  
21             while (<$in>) {
22             my ($kanji, $prob) = split /\s/, $_;
23             $known{$kanji} = $prob;
24             }
25             close $in or die $!;
26              
27             # The weight to give the position in the kanji if it is a known
28             # kanji.
29              
30             our $length_weight = 0.735; # 42030 successes
31              
32             # The cutoff for splitting the name
33              
34             our $split_cutoff = 0.5;
35              
36             # Set this to a true value to print debugging messages.
37              
38             #my $debug = 1;
39             #use open qw(:std :encoding(UTF-8));a # when debugging
40              
41             =head2 $kkre
42              
43             Kanji-kana regular expression. This is intended to match kanji and
44             kana names.
45              
46             =cut
47              
48             our $kkre = qr!
49             \p{InCJKUnifiedIdeographs}|
50             [々〆]|
51             \p{InKana}
52             !x;
53              
54             sub kkname
55             {
56 11     11 1 21 my ($kanji) = @_;
57 11 100       129 if ($kanji !~ /^($kkre)+$/) {
58 1         4 return undef;
59             }
60 10         2091 return 1;
61             }
62              
63             sub split_kanji_name
64             {
65 13     13 1 20468 my ($kanji) = @_;
66             # Validate the user's input
67 13 100       38 if (! $kanji) {
68 1         171 carp "No valid name was provided to split_kanji_name";
69 1         8 return undef;
70             }
71 12 100       35 if (length $kanji == 1) {
72 1         189 carp "$kanji is only one character long, so there is nothing to split";
73 1         10 return ($kanji, '');
74             }
75 11 100       22 if (! kkname ($kanji)) {
76 1         114 carp "$kanji does not look like a kanji/kana name";
77             }
78 11 100       34 if (! wantarray ()) {
79 1         124 carp "The return value of split_kanji_name is an array";
80             }
81             # If the name is only two characters, there is only one possibility.
82 11 100       40 if (length $kanji == 2) {
83 1         9 return split '', $kanji;
84             }
85              
86             # The characters in the name, which may not be kanji.
87 10         38 my @kanji = split '', $kanji;
88             # Probability this character is part of the family name.
89 10         18 my @probability;
90             # First character is definitely part of the family name.
91 10         22 $probability[0] = 1;
92             # Last character is definitely part of the given name.
93 10         19 $probability[$#kanji] = 0;
94 10         19 my $length = length $kanji;
95             # Loop from the second kanji to the second-from-last kanji
96 10         27 for my $i (1..$#kanji - 1) {
97 18         41 my $p = 1 - $i / ($length - 1);
98 18         65 my $moji = $kanji[$i];
99 18 100       49 if (is_kana ($moji)) {
    100          
    100          
100             # Assume that hiragana is not part of surname (not correct
101             # in practice).
102 2         29 $p = 0;
103             }
104             elsif ($known{$moji}) {
105 13         143 $p = $length_weight * $p + (1 - $length_weight) * $known{$moji};
106             }
107             elsif ($moji eq '々') {
108             # This repeated kanji has the same probability as the
109             # original kanji
110 1         23 $p = $probability[$i - 1];
111             }
112 18         57 $probability[$i] = $p;
113             #if ($debug) { # Commented out to improve test coverage
114             # print STDERR "$kanji[$i] i=$i p=$p\n";
115             #}
116 18 100       55 if ($probability[$i] < $split_cutoff) {
117 9         64 return (substr ($kanji, 0, $i), substr ($kanji, $i));
118             }
119             }
120 1         7 return (substr ($kanji, 0, -1), substr ($kanji, -1));
121             }
122              
123             sub split_romaji_name
124             {
125 9     9 1 13164 my ($name) = @_;
126 9 100       26 if (! $name) {
127 1         105 carp "No name given to split_romaji_name";
128 1         18 return undef;
129             }
130 8 100       20 if (! wantarray ()) {
131 1         102 carp "The return value of split_romaji_name is an array";
132             }
133              
134             # What we guess is the family name
135 8         21 my $last;
136             # What we guess is the personal name
137             my $first;
138 8 100       43 if ($name !~ /\s|,/) {
139 2 100       20 if ($name =~ /^([A-Z][a-z]+)([A-Z]+)$/) {
140 1         3 $first = $1;
141 1         3 $last = $2;
142             }
143             else {
144             # If there is no space or comma, assume that this is the last name.
145 1         3 $first = '';
146 1         3 $last = $name;
147             }
148             }
149             else {
150             # Remove leading and trailing spaces.
151 6         30 $name =~ s/^\s+|\s+$//g;
152 6         38 my @parts = split /,?\s+/, $name;
153 6         16 for (@parts) {
154 13 100       8952 if (! is_romaji_strict ($_)) {
155 1         172 carp "'$_' doesn't look like Japanese romaji";
156             }
157             }
158             # If there are more than two parts to the name after splitting by spaces
159 6 100       2571 if (@parts > 2) {
160 1         105 carp "Strange Japanese name '$name' with middle name?";
161             }
162             # If the last name is capitalized, or if there is a comma in the
163             # name.
164 6 100 100     62 if ($parts[0] =~ /^[A-Z]+$/ || $name =~ /,/) {
165 2         5 $last = $parts[0];
166 2         4 $first = $parts[1];
167             }
168             else {
169 4         10 $last = $parts[1];
170 4         9 $first = $parts[0];
171             }
172             }
173             # Regularise the name
174 8         21 $first = ucfirst lc $first;
175 8         14 $last = ucfirst lc $last;
176 8         28 return ($first, $last);
177             }
178              
179             1;