File Coverage

blib/lib/Lingua/JA/Name/Splitter.pm

Criterion	Covered	Total	%
statement	68	88	77.2
branch	23	38	60.5
condition	2	3	66.6
subroutine	9	9	100.0
pod	3	3	100.0
total	105	141	74.4

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::JA::Name::Splitter;
2
3	1			1		73879	use warnings;
	1					3
	1					43
4	1			1		7	use strict;
	1					15
	1					112
5							require Exporter;
6							our @ISA = qw(Exporter);
7							our @EXPORT_OK = qw/split_kanji_name split_romaji_name $kkre kkname/;
8							our %EXPORT_TAGS = ('all' => \@EXPORT_OK);
9							our $VERSION = '0.10';
10	1			1		826	use utf8;
	1					20
	1					7
11	1			1		43	use Carp;
	1					3
	1					93
12	1			1		757	use Lingua::JA::Moji ':all';
	1					49884
	1					545
13
14							# The probabilities that these characters are part of the family name.
15
16							my %known;
17
18							my $file = __FILE__;
19							$file =~ s/Splitter\.pm/probabilities.txt/;
20	1			1		6	open my $in, "<:encoding(utf8)", $file or die $!;
	1					3
	1					19
21							while (<$in>) {
22							my ($kanji, $prob) = split /\s/, $_;
23							$known{$kanji} = $prob;
24							}
25							close $in or die $!;
26
27							# The weight to give the position in the kanji if it is a known
28							# kanji.
29
30							our $length_weight = 0.736; # 42030 successes
31
32							# The cutoff for splitting the name
33
34							our $split_cutoff = 0.5;
35
36							# Set this to a true value to print debugging messages.
37
38							my $debug;
39
40							=head2 $kkre
41
42							Kanji-kana regular expression. This is intended to match kanji and
43							kana names.
44
45							=cut
46
47							our $kkre = qr!
48							\p{InCJKUnifiedIdeographs}|
49							[々〆]|
50							\p{InKana}
51							!x;
52
53							sub kkname
54							{
55	7			7	1	15	my ($kanji) = @_;
56	7	50				101	if ($kanji !~ /^($kkre)+$/) {
57	0					0	return undef;
58							}
59	7					2148	return 1;
60							}
61
62							sub split_kanji_name
63							{
64	7			7	1	4192	my ($kanji) = @_;
65							# Validate the user's input
66	7	50				20	if (! $kanji) {
67	0					0	carp "No valid name was provided to split_kanji_name";
68	0					0	return undef;
69							}
70	7	50				22	if (length $kanji == 1) {
71	0					0	carp "$kanji is only one character long, so there is nothing to split";
72	0					0	return ($kanji, '');
73							}
74	7	50				15	if (! kkname ($kanji)) {
75	0					0	carp "$kanji does not look like a kanji/kana name";
76							}
77	7	50				17	if (! wantarray ()) {
78	0					0	carp "The return value of split_kanji_name is an array";
79							}
80							# If the name is only two characters, there is only one possibility.
81	7	50				18	if (length $kanji == 2) {
82	0					0	return split '', $kanji;
83							}
84
85							# What we guess is the given name part of the name
86	7					14	my $given;
87							# What we guess is the family name part of the name
88							my $family;
89							# The characters in the name, which may not be kanji.
90	7					42	my @kanji = split '', $kanji;
91							# Probability this character is part of the family name.
92	7					11	my @probability;
93							# First character is definitely part of the family name.
94	7					13	$probability[0] = 1;
95							# Last character is definitely part of the given name.
96	7					12	$probability[$#kanji] = 0;
97	7					13	my $length = length $kanji;
98							# Loop from the second kanji to the second-from-last kanji
99	7					18	for my $i (1..$#kanji - 1) {
100	14					30	my $p = 1 - $i / ($length - 1);
101	14					22	my $moji = $kanji[$i];
102	14	100				37	if (is_kana ($moji)) {
		100
		50
103							# Assume that hiragana is not part of surname (not correct
104							# in practice).
105	2					19	$p = 0;
106							}
107							elsif ($known{$moji}) {
108	11					145	$p = $length_weight * $p + (1 - $length_weight) * $known{$moji};
109							}
110							elsif ($moji eq '々') {
111							# This repeated kanji has the same probability as the
112							# original kanji
113	1					19	$p = $probability[$i - 1];
114							}
115	14					31	$probability[$i] = $p;
116							}
117	7	50				16	if ($debug) {
118	0					0	print "@probability\n";
119	0					0	print "@kanji\n";
120							}
121	7					9	my $in_given;
122	7					14	for my $i (0..$#kanji) {
123	28	100				54	if ($probability[$i] < $split_cutoff) {
124	15					28	$in_given = 1;
125							}
126	28	100				46	if ($in_given) {
127	15					28	$given .= $kanji[$i];
128							}
129							else {
130	13					24	$family .= $kanji[$i];
131							}
132							}
133	7					30	return ($family, $given);
134							}
135
136							sub split_romaji_name
137							{
138	2			2	1	1189	my ($name) = @_;
139	2	50				8	if (! $name) {
140	0					0	carp "No name given to split_romaji_name";
141	0					0	return undef;
142							}
143	2	50				6	if (! wantarray ()) {
144	0					0	carp "The return value of split_romaji_name is an array";
145							}
146
147							# What we guess is the family name
148	2					16	my $last;
149							# What we guess is the personal name
150							my $first;
151	2	50				17	if ($name !~ /\s|,/) {
152	0	0				0	if ($name =~ /^([A-Z][a-z]+)([A-Z]+)$/) {
153	0					0	$first = $1;
154	0					0	$last = $2;
155							}
156							else {
157							# If there is no space or comma, assume that this is the last name.
158	0					0	$first = '';
159	0					0	$last = $name;
160							}
161							}
162							else {
163							# Remove leading and trailing spaces.
164	2					14	$name =~ s/^\s+|\s+$//g;
165	2					14	my @parts = split /,?\s+/, $name;
166	2					7	for (@parts) {
167	4	50				6400	if (! is_romaji_strict ($_)) {
168	0					0	carp "'$_' doesn't look like Japanese romaji";
169							}
170							}
171							# If there are more than two parts to the name after splitting by spaces
172	2	50				1041	if (@parts > 2) {
173	0					0	carp "Strange Japanese name '$name' with middle name?";
174							}
175							# If the last name is capitalized, or if there is a comma in the
176							# name.
177	2	100	66			20	if ($parts[0] =~ /^[A-Z]+$/ || $name =~ /,/) {
178	1					3	$last = $parts[0];
179	1					3	$first = $parts[1];
180							}
181							else {
182	1					4	$last = $parts[1];
183	1					3	$first = $parts[0];
184							}
185							}
186							# Regularise the name
187	2					8	$first = ucfirst lc $first;
188	2					5	$last = ucfirst lc $last;
189	2					9	return ($first, $last);
190							}
191
192							1;