File Coverage

blib/lib/Geo/Postcodes/JP/Process.pm

Criterion	Covered	Total	%
statement	9	9	100.0
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	12	12	100.0

line	stmt	sub	time	code
1				=encoding utf8
2
3				=head1 NAME
4
5				Geo::Postcodes::JP::Process - process Japan Post Office postcode data
6
7				=cut
8
9				package Geo::Postcodes::JP::Process;
10
11				require Exporter;
12				@ISA = qw(Exporter);
13				@EXPORT_OK = qw/
14				read_ken_all
15				read_jigyosyo
16				find_duplicates
17				concatenate_multi_line
18				process_line
19				process_jigyosyo_line
20				improve_postcodes
21				/;
22				%EXPORT_TAGS = (
23				all => \@EXPORT_OK,
24				);
25	5	5	73757	use utf8;
	5		89
	5		30
26
27	5	5	187	use warnings;
	5		11
	5		154
28	5	5	24	use strict;
	5		10
	5		967
29				our $VERSION = '0.014';
30
31				#line 29 "Process.pm.tmpl"
32
33				# Lingua::JA::Moji supplies the routine to convert half-width katakana
34				# into full-width katakana.
35
36				use Convert::Moji 'make_regex';
37				use Lingua::JA::Moji ':all';
38				use Carp;
39
40				# The default name of the address file.
41
42				my $ken_all_default = 'KEN_ALL.CSV';
43
44				# The default name of the file of postal codes for places of business.
45
46				my $jigyosyo_default = 'jigyosyo.csv';
47
# Open the file $file_name for reading and return the file handle. The
# handle decodes its input from Shift-JIS and has the ":crlf" layer
# applied. A failure to open the file is a fatal error, reported from
# the point of view of the caller.

sub open_file
{
    my ($file_name) = @_;
    my $layers = "<:encoding(shift_jis):crlf";
    my $input;
    if (! open $input, $layers, $file_name) {
        croak "cannot open '$file_name': $!";
    }
    return $input;
}
55
56				# These are various types of geographical division in Japan, in kanji
57				# and kana.
58
59				my @divisions = (
60				[qw/市シ/],
61				[qw/町チョウ郡グン/],
62				[qw/町マチ郡グン/],
63				[qw/郡グン/],
64				[qw/村ムラ郡グン/],
65				[qw/村ソン郡グン/],
66				[qw/区ク市シ/],
67				);
68
69				# These are the fields of the postcode file, in order.
70
71				my @fields = qw/
72				number
73				old_postcode
74				new_postcode
75				ken_kana
76				city_kana
77				address_kana
78				ken_kanji
79				city_kanji
80				address_kanji
81				one-region-multiple-postcodes
82				numbering-start
83				has-choume
84				one-postcode-multiple-regions
85				koushin-no-hyouji
86				henkou-riyuu
87				/;
88				#line 74 "Process.pm.tmpl"
89
# These are the fields of the place-of-business ("jigyosyo") postcode
# file, in order. There are thirteen of them, one per column of the file.
#
# The names are given as an explicit list rather than with qw//, because
# the final name, "Alteration code", contains a space. qw// splits on
# whitespace, so it would turn that name into two separate names and
# leave the list one element longer than a line of the file.

my @jigyosyo_fields = (
    'number',
    'kana',
    'kanji',
    'ken_kanji',
    'city_kanji',
    'address_kanji',
    'street_number',
    'new_postcode',
    'old_postcode',
    'post-office',
    'type',
    'multiple-postcode',
    'Alteration code',
);
105				#line 81 "Process.pm.tmpl"
106
107				=head2 read_ken_all
108
109				my $postcodes_ref = read_ken_all ('KEN_ALL.CSV');
110
111				Read the postcode file C<$file_name>, or F<KEN_ALL.CSV> in the current directory if no name is given. The return value is an array reference
112				containing the lines of the postcode file in the same order as the
113				file itself. The routine issues a fatal error if a problem is
114				encountered.
115
116				The return value is a double indexed array.
117
118				=cut
119
# Read the postcode file $file_name and return a reference to an array
# holding one array reference per line of the file, in file order. Each
# inner array holds the comma-separated values of that line with all
# double quotes removed.

sub read_ken_all
{
    my ($file_name) = @_;
    # With no file name, fall back to the default name, looked up in
    # the current working directory.
    $file_name ||= $ken_all_default;
    # Refuse to continue if there is no such plain file.
    croak "a file called '$file_name' does not exist" unless -f $file_name;
    # open_file sets up the decoding of the Shift-JIS data.
    my $input = open_file ($file_name);
    # One array reference per line of the file.
    my @postcodes;
    while (defined (my $row = <$input>)) {
        chomp $row;
        # Delete every double quote, then split on the commas.
        $row =~ tr/"//d;
        push @postcodes, [split ",", $row];
    }
    close $input or croak "cannot close '$file_name': $!";
    return \@postcodes;
}
149
150				=head2 process_line
151
152				my %values = process_line ($line);
153
154				Turn a line of the postcode file into a hash of its values.
155
156				The values of the hash are
157
158				=over
159
160
161				=item number
162
163				The JIS code number for the region. The JIS standards for regions of
164				Japan are numbered JIS X 0401 (1973) for the prefecture identification
165				codes, and JIS X0402 (2003) identification codes for cities, towns and
166				villages.
167
168
169
170				=item old_postcode
171
172				The old three or five digit postcode.
173
174
175
176				=item new_postcode
177
178				The new seven digit postcode.
179
180
181
182				=item ken_kana
183
184				The kana version of the prefecture.
185
186
187
188				=item city_kana
189
190				The kana version of the city.
191
192
193
194				=item address_kana
195
196				The kana version of the address.
197
198
199
200				=item ken_kanji
201
202				The kanji version of the prefecture.
203
204
205
206				=item city_kanji
207
208				The kanji version of the city.
209
210
211
212				=item address_kanji
213
214				The kanji version of the address.
215
216
217
218				=item one-region-multiple-postcodes
219
220				This is 1 if the same address has more than one postcode, zero
221				otherwise.
222
223
224
225				=item numbering-start
226
227				Indicates if numbering starts, 1 if so.
228
229
230
231				=item has-choume
232
233				Indicates there is a division into "choume".
234
235
236
237				=item one-postcode-multiple-regions
238
239				This is 1 if the same postcode covers more than one region, zero
240				otherwise.
241
242
243
244				=item koushin-no-hyouji
245
246				0 = no change, 1 = change, 2 = delete
247
248
249
250				=item henkou-riyuu
251
252				Reason for change.
253
254
255
256				=back
257
258				See also Japan Post's own description of this file's format, which is in Japanese.
259
260				=cut
261
262				#line 150 "Process.pm.tmpl"
263
# Pair up the values in the array reference $line with the field names
# in @fields, position by position, and return the result as a hash.

sub process_line
{
    my ($line) = @_;
    my @row = @$line;
    # A field with no corresponding value in the line is set to undef.
    my %values = map { ($fields[$_] => $row[$_]) } 0 .. $#fields;
    return %values;
}
272
273				=head2 concatenate_multi_line
274
275				$postcodes = concatenate_multi_line ($postcodes, $duplicates);
276
277				Concatenate a single entry which is spread on multiple
278				lines. C<$duplicates> is the return value of L</find_duplicates>.
279
280				If you are wondering what "concatenate a single entry which is spread
281				on multiple lines" means, some of the entries in the CSV file are
282				actually single entries but broken into two or more lines if the
283				number of characters in one of the fields exceeds a maximum. This
284				routine attempts to put this broken data back together again.
285
286				At the moment there is no comprehensive check of correctness of the
287				result.
288
289				=cut
290
291				use constant NUMBER => 0;
292				use constant OLD_POSTCODE => 1;
293				use constant NEW_POSTCODE => 2;
294				use constant KEN_KANA => 3;
295				use constant CITY_KANA => 4;
296				use constant ADDRESS_KANA => 5;
297				use constant KEN_KANJI => 6;
298				use constant CITY_KANJI => 7;
299				use constant ADDRESS_KANJI => 8;
300				use constant ONE_REGION_MULTIPLE_POSTCODES => 9;
301				use constant NUMBERING_START => 10;
302				use constant HAS_CHOUME => 11;
303				use constant ONE_POSTCODE_MULTIPLE_REGIONS => 12;
304				use constant KOUSHIN_NO_HYOUJI => 13;
305				use constant HENKOU_RIYUU => 14;
306				#line 182 "Process.pm.tmpl"
307
# Fold one more line of the input file, $line, into $multi_lines, the
# accumulated fields of an entry which spans several lines. Both
# arguments are array references; $multi_lines is modified in place.
#
# A field which is not yet set in $multi_lines is copied from $line.
# After that, only the kana and kanji address fields change: the value
# from $line is appended to what is already there. Every other field
# keeps the value from the first line which supplied it.

sub add_more_data
{
    my ($multi_lines, $line) = @_;
    for my $index (0 .. $#$line) {
        my $is_address = ($index eq ADDRESS_KANA || $index eq ADDRESS_KANJI);
        if (! defined $multi_lines->[$index]) {
            # First value seen for this field.
            $multi_lines->[$index] = $line->[$index];
        }
        elsif ($is_address) {
            # Continuation of an address which was split over lines.
            $multi_lines->[$index] .= $line->[$index];
        }
    }
}
339
340
341				use utf8;
342
# Given the list of postcode lines $postcodes and the hash reference
# $duplicates (postcode => array reference of offsets into $postcodes),
# return a reference to a new array in which entries split over several
# lines of the file have been joined into single entries.
#
# A line whose postcode is not in $duplicates is passed through as it
# is. The lines sharing a duplicated postcode are handled together, at
# the position of the first of them:
#
# * A line whose kanji address opens a full-width bracket (U+FF08)
#   without closing it (U+FF09) starts a split entry. The following
#   lines with the same postcode are merged into it with add_more_data
#   until the bracket has been closed.
#
# * Any other line is a complete entry and is passed through as it is.
#
# Every input line therefore ends up in the output exactly once, either
# as itself or as part of a merged entry.

sub concatenate_multi_line
{
    my ($postcodes, $duplicates) = @_;
    my @concatenated;
    # Offsets of the duplicate lines which have already been output.
    my %done;
    for my $line (@$postcodes) {
        my $postcode = $line->[NEW_POSTCODE];
        my $dups = $duplicates->{$postcode};
        if (! $dups) {
            push @concatenated, $line;
            next;
        }
        # The entry being assembled, if we are inside an open bracket.
        my $partial;
        # The number of brackets opened but not yet closed.
        my $depth = 0;
        for my $ln (@$dups) {
            # All the lines with this postcode are dealt with the first
            # time the postcode is met, so skip them afterwards.
            next if $done{$ln};
            $done{$ln} = 1;
            my $dup_line = $postcodes->[$ln];
            my $address_kanji = $dup_line->[ADDRESS_KANJI];
            if (! defined $address_kanji) {
                $address_kanji = '';
            }
            # Count the opening and closing full-width brackets.
            my $opened = () = $address_kanji =~ /\x{FF08}/g;
            my $closed = () = $address_kanji =~ /\x{FF09}/g;
            $depth += $opened - $closed;
            if ($partial) {
                # A continuation line of a split entry.
                add_more_data ($partial, $dup_line);
            }
            elsif ($depth > 0) {
                # The first line of a split entry.
                $partial = [];
                add_more_data ($partial, $dup_line);
            }
            else {
                # A complete entry on a single line.
                push @concatenated, $dup_line;
                $depth = 0;
                next;
            }
            if ($depth <= 0) {
                # The bracket was closed, so the entry is complete.
                push @concatenated, $partial;
                undef $partial;
                $depth = 0;
            }
        }
        if ($partial) {
            # The bracket was never closed. Keep what was gathered
            # rather than lose the lines.
            push @concatenated, $partial;
        }
    }
    return \@concatenated;
}
402
403				=head2 find_duplicates
404
405				my $duplicates = find_duplicates ($postcodes);
406
407				Make a hash whose keys are postcodes which have duplicate references,
408				and whose values are array references to arrays of offsets in the
409				postcode file. The return value is the hash reference.
410
411				=cut
412
# Return a reference to a hash whose keys are the postcodes which occur
# on more than one line of $postcodes, and whose values are references
# to arrays of the offsets (counted from zero) of those lines, in
# ascending order. The postcode is taken from index 2 of each line.

sub find_duplicates
{
    my ($postcodes) = @_;
    # Every postcode seen, with the offsets of the lines containing it.
    my %offsets_of;
    my $offset = 0;
    for my $entry (@$postcodes) {
        push @{$offsets_of{$entry->[2]}}, $offset;
        $offset++;
    }
    # Keep only the postcodes which were seen more than once.
    my %duplicates;
    for my $code (keys %offsets_of) {
        if (@{$offsets_of{$code}} > 1) {
            $duplicates{$code} = $offsets_of{$code};
        }
    }
    return \%duplicates;
}
432
433				=head2 read_jigyosyo
434
435				my $jigyosyo_data = read_jigyosyo ('/path/to/jigyosyo/csv/file');
436
437				=cut
438
# Read the place-of-business postcode file $input_file, which is in the
# Shift-JIS encoding, and return a reference to an array holding one
# array reference per line of the file, in file order. Each inner array
# holds the thirteen comma-separated values of the line, with the
# double quotes around quoted values removed and without the line
# terminator.
#
# It is a fatal error if the file cannot be opened or closed, or if a
# line does not consist of exactly thirteen fields.

sub read_jigyosyo
{
    my ($input_file) = @_;

    my @jigyosho_postcodes;
    open my $input, "<:encoding(shift_jis)", $input_file
        or croak "cannot open '$input_file': $!";
    # NOTE(review): changing the encoding of STDOUT is a surprising
    # side effect for a routine which reads a file. It is retained
    # because existing callers may rely on it.
    binmode STDOUT, ":utf8";
    while (my $line = <$input>) {
        # Remove the line terminator, which may be either LF or CRLF,
        # so that it does not end up as part of the final field.
        $line =~ s/\r?\n\z//;
        # The limit of -1 keeps empty fields at the end of the line.
        my @fields = split /,\s*/, $line, -1;
        if (scalar @fields != 13) {
            die "$input_file:$.: $line\n";
        }
        for my $field (@fields) {
            $field =~ s/^"(.*)"$/$1/;
        }
        push @jigyosho_postcodes, \@fields;
    }
    close $input or croak "cannot close '$input_file': $!";
    return \@jigyosho_postcodes;
}
460
461				=head2 process_jigyosyo_line
462
463				my %values = process_jigyosyo_line ($line);
464
465				Turn the array reference C<$line> into a hash of its values using the
466				fields.
467
468				The values of the hash are
469
470				=over
471
472
473				=item number
474
475				As for the main postcode file.
476
477
478
479				=item kana
480
481				The name of the place of business in kana.
482
483
484
485				=item kanji
486
487				The name of the place of business in kanji.
488
489
490
491				=item ken_kanji
492
493				The kanji version of the prefecture name.
494
495
496
497				=item city_kanji
498
499				The kanji version of the city name.
500
501
502
503				=item address_kanji
504
505				The kanji version of the address name.
506
507
508
509				=item street_number
510
511				The exact street number of the place of business.
512
513
514
515				=item new_postcode
516
517				As for the "ken_all" fields.
518
519
520
521				=item old_postcode
522
523				As for the "ken_all" fields.
524
525
526
527				=item post-office
528
529				The post office which handles mail for this postcode.
530
531
532
533				=item type
534
535				0=Large company
536				1=Private
537
538
539
540				=item multiple-postcode
541
542				0=Not multiple, also 1,2,3.
543
544
545
546				=item Alteration code
547
548				0=No change
549				1=New addition
550				2=Deleted
551
552
553
554				=back
555
556				See also Japan Post's own
557				description of this file's format, which is
558				in Japanese.
559
560				=cut
561
562				#line 364 "Process.pm.tmpl"
563
# Pair up the values in the array reference $line with the field names
# in @jigyosyo_fields, position by position, and return the result as a
# hash. The "kana" value is passed through hw2katakana before returning.

sub process_jigyosyo_line
{
    my ($line) = @_;
    my @row = @$line;
    # A field with no corresponding value in the line is set to undef.
    my %values = map { ($jigyosyo_fields[$_] => $row[$_]) }
        0 .. $#jigyosyo_fields;
    $values{kana} = hw2katakana ($values{kana});
    return %values;
}
572
573				=head2 remove_bad_addresses
574
575				$postcodes = remove_bad_addresses ($postcodes);
576
577				=cut
578
# Clean up the address fields of each line in the array reference
# $postcodes. The lines are altered in place, and $postcodes itself is
# returned.
#
# * If the kanji address consists solely of one of the "bad addresses"
#   below, which are notes rather than addresses, both the kanji and
#   the kana address are set to the empty string.
#
# * Otherwise, the first bracketed remark of @remove_stuff which is
#   found in both the kanji and the kana address is deleted from both.

sub remove_bad_addresses
{
    my ($postcodes) = @_;

    # The following array contains "bad addresses", text which is not
    # an address.
    my @bad_addresses = (
        '以下に記載がない場合',
        # 9013700
        '以下に掲載がない場合',
    );
    my $ba_re = make_regex (@bad_addresses);
    # These bits should be removed from the kanji and kana addresses.
    # Each pair is the pattern for the kanji address followed by the
    # pattern for the kana address. The kanji uses full-width brackets;
    # the kana patterns use escaped ASCII brackets. A list of pairs is
    # used so that the patterns stay compiled and are tried in a fixed
    # order.
    my @remove_stuff = (
        [qr/(（その他）)/, qr/(\(ｿﾉﾀ\))/],
        [qr/(（次のビルを除く）)/, qr/(\(ﾂｷﾞﾉﾋﾞﾙｦﾉｿﾞｸ\))/],
        # Don't remove this because
        # 23223,"474 ","4740002","ｱｲﾁｹﾝ","ｵｵﾌﾞｼ","ｷﾀｻｷﾁｮｳ(ﾁｮｳﾒ)","愛知県","大府市","北崎町（丁目）",1,0,1,0,0,0
        # 23223,"474 ","4740001","ｱｲﾁｹﾝ","ｵｵﾌﾞｼ","ｷﾀｻｷﾏﾁ","愛知県","大府市","北崎町",1,1,0,0,0,0
        # [qr/(（.丁目）)/, qr/(\(.ﾁｮｳﾒ\))/],
    );
    for my $postcode (@$postcodes) {
        my $address_kanji = $postcode->[ADDRESS_KANJI];
        my $address_kana = $postcode->[ADDRESS_KANA];
        if ($address_kanji =~ /^($ba_re)$/) {
            $postcode->[ADDRESS_KANJI] = '';
            $postcode->[ADDRESS_KANA] = '';
            next;
        }
        for my $pair (@remove_stuff) {
            my ($kanji_re, $kana_re) = @$pair;
            # The remark has to be present in both forms of the
            # address, otherwise nothing is removed.
            my ($remove_kanji) = $address_kanji =~ $kanji_re;
            next unless defined $remove_kanji;
            my ($remove_kana) = $address_kana =~ $kana_re;
            next unless defined $remove_kana;
            $postcode->[ADDRESS_KANJI] =~ s/\Q$remove_kanji\E//;
            $postcode->[ADDRESS_KANA] =~ s/\Q$remove_kana\E//;
            last;
        }
    }
    return $postcodes;
}
625
626				=head2 improve_postcodes
627
628				$postcodes = improve_postcodes ($postcodes);
629
630				Improve the postcodes as much as possible by unifying lines etc.
631
632				=cut
633
# Run the postcode lines in the array reference $postcodes through each
# of the clean-up stages in turn, and return the result of the last one.

sub improve_postcodes
{
    my ($postcodes) = @_;
    # Stage one: join up the entries which were split over lines.
    my $offsets_of_duplicates = find_duplicates ($postcodes);
    my $joined = concatenate_multi_line ($postcodes, $offsets_of_duplicates);
    # Stage two: strip the text which is not part of an address.
    my $cleaned = remove_bad_addresses ($joined);
    return $cleaned;
}
642
643				1;
644
645				__END__