File Coverage

lib/Geo/Address/Formatter.pm
Criterion Covered Total %
statement 448 508 88.1
branch 180 228 78.9
condition 32 46 69.5
subroutine 42 47 89.3
pod 3 3 100.0
total 705 832 84.7


line stmt bran cond sub pod time code
1             package Geo::Address::Formatter;
2             $Geo::Address::Formatter::VERSION = '1.997';
3             # ABSTRACT: take structured address data and format it according to the various global/country rules
4              
5 6     6   32098 use strict;
  6         12  
  6         170  
6 6     6   27 use warnings;
  6         13  
  6         179  
7 6     6   28 use feature qw(say);
  6         10  
  6         562  
8 6     6   2112 use Clone qw(clone);
  6         11683  
  6         326  
9 6     6   1160 use Data::Dumper;
  6         13219  
  6         376  
10             $Data::Dumper::Sortkeys = 1;
11 6     6   42 use File::Basename qw(dirname);
  6         11  
  6         294  
12 6     6   2045 use File::Find::Rule;
  6         33035  
  6         44  
13 6     6   2702 use Ref::Util qw(is_hashref);
  6         8051  
  6         437  
14 6     6   41 use Scalar::Util qw(looks_like_number);
  6         11  
  6         324  
15 6     6   2630 use Text::Hogan::Compiler;
  6         39795  
  6         214  
16 6     6   2354 use Try::Catch;
  6         4185  
  6         342  
17 6     6   1611 use YAML::XS qw(LoadFile);
  6         11049  
  6         361  
18 6     6   39 use utf8;
  6         59  
  6         48  
19              
20             my $THC = Text::Hogan::Compiler->new;
21              
22             # optional params
23             my $show_warnings = 1;
24             my $debug = 0;
25             my $only_address = 0;
26              
27              
28             sub new {
29 17     17 1 17683 my ($class, %params) = @_;
30              
31 17         39 my $self = {};
32 17   50     62 my $conf_path = $params{conf_path} || die "no conf_path set";
33              
34             # optional params
35 17 50 66     56 if ( defined($params{no_warnings}) && ($params{no_warnings})){
36 1         3 $show_warnings = 0;
37             }
38 17   66     74 $only_address = (defined($params{only_address}) && $params{only_address}) // 0;
      50        
39 17   66     73 $debug = (defined($params{debug}) && $params{debug}) // 0;
      50        
40              
41 17         39 $self->{final_components} = undef;
42 17         36 $self->{set_district_alias} = {};
43              
44 17         36 bless($self, $class);
45              
46 17 50       56 say STDERR "************* in Geo::Address::Formatter::new ***" if ($debug);
47            
48 17 100       52 if ($self->_read_configuration($conf_path)){
49 16         77 return $self;
50             }
51 1         16 die 'unable to read configuration';
52             }
53              
54             sub _read_configuration {
55 17     17   29 my $self = shift;
56 17         21 my $path = shift;
57              
58 17 50       410 return if (! -e $path);
59              
60 17         592 my @a_filenames = File::Find::Rule->file()->name('*.yaml')->in($path . '/countries');
61              
62 17         17781 $self->{templates} = {};
63 17         44 $self->{component_aliases} = {};
64 17         40 $self->{component2type} = {};
65 17         37 $self->{ordered_components} = [];
66              
67             # read the config file(s)
68 17         27 my $loaded = 0;
69 17         48 foreach my $filename (sort @a_filenames) {
70             try {
71 16     16   430 my $rh_templates = LoadFile($filename);
72              
73             # if file 00-default.yaml defines 'DE' (Germany) and
74             # file 01-germany.yaml does as well, then the second
75             # occurance of the key overwrites the first.
76 16         32198 foreach (keys %$rh_templates) {
77 2910         3882 $self->{templates}{$_} = $rh_templates->{$_};
78             }
79 16         307 $loaded = 1;
80             } catch {
81 0     0   0 warn "error parsing country configuration in $filename: $_";
82 16         176 };
83             }
84 17 100       319 return if ($loaded == 0);
85              
86             # see if we can load the components
87             try {
88 16 50   16   383 say STDERR "loading components" if ($debug);
89 16         78 my @c = LoadFile($path . '/components.yaml');
90              
91 16 50       3586 if ($debug){
92 0         0 say STDERR Dumper \@c;
93             }
94              
95 16         38 foreach my $rh_c (@c) {
96 234 50       383 if (defined($rh_c->{name})){
97 234 100       312 if (defined($rh_c->{aliases})){
98 128         256 $self->{component_aliases}{$rh_c->{name}} = $rh_c->{aliases};
99             } else {
100 106         248 $self->{component_aliases}{$rh_c->{name}} = [];
101             }
102             }
103             }
104              
105 16         31 foreach my $rh_c (@c) {
106 234         230 push(@{$self->{ordered_components}}, $rh_c->{name});
  234         345  
107 234         362 $self->{component2type}->{$rh_c->{name}} = $rh_c->{name};
108              
109 234 100       391 if (defined($rh_c->{aliases})) {
110 128         126 foreach my $alias (@{$rh_c->{aliases}}) {
  128         177  
111 480         454 push(@{$self->{ordered_components}}, $alias);
  480         609  
112 480         922 $self->{component2type}->{$alias} = $rh_c->{name};
113             }
114             }
115             }
116 16 50       115 if ($debug){
117 0         0 say STDERR 'component_aliases';
118 0         0 say STDERR Dumper $self->{component_aliases};
119 0         0 say STDERR 'ordered_components';
120 0         0 say STDERR Dumper $self->{ordered_components};
121 0         0 say STDERR 'component2type';
122 0         0 say STDERR Dumper $self->{component2type};
123             }
124             } catch {
125 0     0   0 warn "error parsing component configuration: $_";
126 16         123 };
127              
128             # get the county and state codes and country2lang conf
129 16         359 my @conf_files = qw(county_codes state_codes country2lang);
130 16         32 foreach my $cfile (@conf_files) {
131 48         91431 $self->{$cfile} = {};
132 48         134 my $yfile = $path . '/' . $cfile . '.yaml';
133 48 100       931 if (-e $yfile) {
134             try {
135 42     42   1011 $self->{$cfile} = LoadFile($yfile);
136             } catch {
137 0     0   0 warn "error parsing $cfile configuration: $_";
138 42         491 };
139             }
140             }
141              
142             # get the abbreviations
143 16         7075 my @abbrv_filenames = File::Find::Rule->file()->name('*.yaml')->in($path . '/abbreviations');
144              
145             # read the config files
146 16         23731 foreach my $abbrv_file (@abbrv_filenames) {
147             try {
148 236 50   236   5788 if ($abbrv_file =~ m/\/(\w\w)\.yaml$/) {
149 236         559 my $lang = $1; # two letter lang code like 'en'
150 236         466 my $rh_c = LoadFile($abbrv_file);
151 236         32360 $self->{abbreviations}->{$lang} = $rh_c;
152             }
153             } catch {
154 0     0   0 warn "error parsing abbrv configuration in $abbrv_file: $_";
155 236         4020 };
156             }
157             #say Dumper $self->{abbreviations};
158             #say Dumper $self->{country2lang};
159 16         252 return 1;
160             }
161              
162              
163             sub final_components {
164 6     6 1 1422 my $self = shift;
165 6 100       18 if (defined($self->{final_components})) {
166 5         17 return $self->{final_components};
167             }
168 1 50       14 warn 'final_components not yet set' if ($show_warnings);
169 1         66 return;
170             }
171              
172              
173             sub format_address {
174 449     449 1 428752 my $self = shift;
175 449   50     8809 my $rh_components = clone(shift) || return;
176 449   100     1992 my $rh_options = shift || {};
177              
178             # 1. make sure empty at the beginning
179 449         1718 $self->{final_components} = undef;
180              
181 449 50       890 if ($debug){
182 0         0 say STDERR "*** in format_address ***";
183 0         0 say STDERR Dumper $rh_options;
184 0         0 say STDERR Dumper $rh_components;
185             }
186              
187             # 2. deal with the options
188              
189             # 2a. which country format will we use?
190             # might have been specified in options
191             # otherwise look at components
192             my $cc = $rh_options->{country}
193 449   100     1383 || $self->_determine_country_code($rh_components)
194             || '';
195              
196 449 100       796 if ($cc) {
197 445         742 $rh_components->{country_code} = $cc;
198 445         949 $self->_set_district_alias($cc);
199             }
200              
201             # 2b. should we abbreviate?
202 449   100     1432 my $abbrv = $rh_options->{abbreviate} // 0;
203              
204             # 2c. was only_address set at the formatting level
205 449         588 my $oa = $only_address;
206 449 100       721 if (defined($rh_options->{only_address})){
207 2         5 $oa = $rh_options->{only_address};
208             }
209              
210 449 50       713 if ($debug){
211 0         0 say STDERR "component_aliases";
212 0         0 say STDERR Dumper $self->{component_aliases};
213             }
214              
215             # done with the options
216              
217             # 3. set the aliases, unless this would overwrite something
218             # need to do this in the right order (as defined in the components file)
219             # For example:
220             # both 'city_district' and 'suburb' are aliases of 'neighbourhood'
221             # so which one should we use if both are present?
222             # We should use the one defined first in the list
223              
224 449         439 my $rhh_p2a;
225 449         1558 foreach my $c (keys %$rh_components){
226              
227             # might not need an alias as it is a primary type
228 3532 100       5889 next if (defined($self->{component_aliases}{$c}));
229              
230             # it is not a primary type
231             # is there an alias?
232 727 100       1465 if (defined($self->{component2type}{$c})){
233 494         758 my $ptype = $self->{component2type}{$c};
234             # but is it already set?
235 494 100       1065 if (! defined($rh_components->{$ptype}) ){
236             # no, we will set it later
237 395         995 $rhh_p2a->{$ptype}{$c} = 1;
238              
239             }
240             }
241             }
242              
243             # now we know which primary types have aliases
244 449         1130 foreach my $ptype (keys %$rhh_p2a){
245             # is there more than one?
246 356         439 my @aliases = keys %{$rhh_p2a->{$ptype}};
  356         855  
247 356 100       777 if (scalar @aliases == 1){
248 318         638 $rh_components->{$ptype} = $rh_components->{$aliases[0]};
249 318         546 next; # we are done with this ptype
250             }
251              
252             # if there is more than one we need to go through the list
253             # so we do them in the right order
254 38         65 foreach my $c (@{$self->{component_aliases}->{$ptype}}){
  38         129  
255 44 100       113 if (defined($rh_components->{$c})){
256 38         84 $rh_components->{$ptype} = $rh_components->{$c};
257 38         83 last; # we are done with this ptype
258             }
259             }
260             }
261              
262 449 50       766 if ($debug){
263 0         0 say STDERR "after component_aliases applied";
264 0         0 say STDERR Dumper $rh_components;
265             }
266              
267             # 4. deal wtih terrible inputs
268 449         1349 $self->_sanity_cleaning($rh_components);
269 449 50       710 if ($debug){
270 0         0 say STDERR "after sanity_cleaning applied";
271 0         0 say STDERR Dumper $rh_components;
272             }
273              
274             # 5. determine the template
275 449         485 my $template_text;
276 449   66     1131 my $rh_config = $self->{templates}{uc($cc)} || $self->{templates}{default};
277            
278 449 100       798 if (defined($rh_options->{address_template})) {
279 2         5 $template_text = $rh_options->{address_template};
280             }
281             else {
282              
283 447 100       763 if (defined($rh_config->{address_template})) {
    50          
284 445         766 $template_text = $rh_config->{address_template};
285             } elsif (defined($self->{templates}{default}{address_template})) {
286 2         4 $template_text = $self->{templates}{default}{address_template};
287             }
288            
289             # do we have the minimal components for an address?
290             # or should we instead use the fallback template?
291 447 100       842 if (!$self->_minimal_components($rh_components)) {
292 48 50       130 say STDERR "using fallback" if ($debug);
293 48 100       151 if (defined($rh_config->{fallback_template})) {
    100          
294 33         63 $template_text = $rh_config->{fallback_template};
295             } elsif (defined($self->{templates}{default}{fallback_template})) {
296 14         35 $template_text = $self->{templates}{default}{fallback_template};
297             }
298             # no fallback
299             }
300              
301             }
302              
303 449 50       758 say STDERR 'template text: ' . $template_text if ($debug);
304              
305             # 6. clean up the components, possibly add codes
306 449         936 $self->_fix_country($rh_components);
307 449 50       711 if ($debug){
308 0         0 say STDERR "after fix_country";
309 0         0 say STDERR Dumper $rh_components;
310             }
311              
312 449         1467 $self->_apply_replacements($rh_components, $rh_config->{replace});
313 449 50       930 if ($debug){
314 0         0 say STDERR "after applying_replacements applied";
315 0         0 say STDERR Dumper $rh_components;
316             }
317 449         986 $self->_add_state_code($rh_components);
318 449         951 $self->_add_county_code($rh_components);
319 449 50       703 if ($debug){
320 0         0 say STDERR "after adding codes";
321 0         0 say STDERR Dumper $rh_components;
322             }
323              
324             # 7. add the attention, if needed
325 449 50       755 if ($debug){
326 0         0 say STDERR "object level only_address: $only_address";
327 0         0 say STDERR "formatting level only_address: $oa";
328             }
329              
330 449 100       613 if ($oa){
331 3 50       9 if ($debug){
332 0         0 say STDERR "not looking for unknown_components";
333 0         0 say STDERR "only_address was specified";
334             }
335             }
336             else {
337 446         763 my $ra_unknown = $self->_find_unknown_components($rh_components);
338 446 50       864 if ($debug){
339 0         0 say STDERR "unknown_components:";
340 0         0 say STDERR Dumper $ra_unknown;
341             }
342 446 100       1007 if (scalar(@$ra_unknown)){
343             $rh_components->{attention} =
344 223         425 join(', ', map { $rh_components->{$_} } @$ra_unknown);
  229         822  
345 223 50       562 if ($debug){
346 0         0 say STDERR "putting unknown_components in 'attention'";
347             }
348             }
349             }
350              
351             # 8. abbreviate, if needed
352 449 100       861 if ($abbrv) {
353 5         19 $rh_components = $self->_abbreviate($rh_components);
354             }
355              
356             # 9. prepare the template
357 449         980 $template_text = $self->_replace_template_lambdas($template_text);
358              
359             # 10. compiled the template
360 449         1971 my $compiled_template =
361             $THC->compile($template_text, {'numeric_string_as_string' => 1});
362              
363 449 50       679087 if ($debug){
364 0         0 say STDERR "before _render_template";
365 0         0 say STDERR Dumper $rh_components;
366 0         0 say STDERR "template: ";
367 0         0 say STDERR Dumper $compiled_template;
368             }
369              
370             # 11. render the template
371 449         1031 my $text = $self->_render_template($compiled_template, $rh_components);
372 449 50       848 if ($debug){
373 0         0 say STDERR "text after _render_template $text";
374             }
375              
376             # 11. postformatting
377 449         1527 $text = $self->_postformat($text, $rh_config->{postformat_replace});
378              
379             # 12. clean again
380 449         1078 $text = $self->_clean($text);
381              
382             # 13. set final components
383 449         827 $self->{final_components} = $rh_components;
384              
385             # all done
386 449         2120 return $text;
387             }
388              
389             # remove duplicates ("Berlin, Berlin"), do replacements and similar
390             sub _postformat {
391 451     451   1447 my $self = shift;
392 451         494 my $text = shift;
393 451         759 my $raa_rules = shift;
394              
395 451 50       686 if ($debug){
396 0         0 say STDERR "entering _postformat: $text"
397             }
398              
399             # remove duplicates
400 451         1041 my @before_pieces = split(/, /, $text);
401 451         632 my %seen;
402             my @after_pieces;
403 451         660 foreach my $piece (@before_pieces) {
404 557         991 $piece =~ s/^\s+//g;
405 557         1302 $seen{$piece}++;
406 557 100       1354 if (lc($piece) ne 'new york') {
407 553 100       999 next if ($seen{$piece} > 1);
408             }
409 556         923 push(@after_pieces, $piece);
410             }
411 451         849 $text = join(', ', @after_pieces);
412              
413             # do any country specific rules
414 451         742 foreach my $ra_fromto (@$raa_rules) {
415             try {
416 505     505   14326 my $regexp = qr/$ra_fromto->[0]/;
417 505         959 my $replacement = $ra_fromto->[1];
418              
419             # ultra hack to do substitution
420             # limited to $1 and $2, should really be a while loop
421             # doing every substitution
422              
423 505 100       1102 if ($replacement =~ m/\$\d/) {
424 45 100       370 if ($text =~ m/$regexp/) {
425 20         51 my $tmp1 = $1;
426 20         39 my $tmp2 = $2;
427 20         39 my $tmp3 = $3;
428 20         74 $replacement =~ s/\$1/$tmp1/;
429 20         65 $replacement =~ s/\$2/$tmp2/;
430 20         42 $replacement =~ s/\$3/$tmp3/;
431             }
432             }
433 505         2444 $text =~ s/$regexp/$replacement/;
434             } catch {
435 0     0   0 warn "invalid replacement: " . join(', ', @$ra_fromto);
436 505         6387 };
437             }
438 451         2879 return $text;
439             }
440              
441             sub _sanity_cleaning {
442 450     450   1419 my $self = shift;
443 450   50     768 my $rh_components = shift || return;
444              
445             # catch insane postcodes
446 450 100       845 if (defined($rh_components->{'postcode'})) {
447 309 100       1277 if (length($rh_components->{'postcode'}) > 20) {
    100          
    100          
448 1         3 delete $rh_components->{'postcode'};
449             } elsif ($rh_components->{'postcode'} =~ m/\d+;\d+/) {
450             # sometimes OSM has postcode ranges
451 1         3 delete $rh_components->{'postcode'};
452             } elsif ($rh_components->{'postcode'} =~ m/^(\d{5}),\d{5}/) {
453 1         4 $rh_components->{'postcode'} = $1;
454             }
455             }
456              
457             # remove things that might be empty
458 450         1058 foreach my $c (keys %$rh_components) {
459             # catch empty values
460 3888 50       12357 if (!defined($rh_components->{$c})) {
    100          
    100          
461 0         0 delete $rh_components->{$c};
462             }
463             # no chars
464             elsif ($rh_components->{$c} !~ m/\w/) {
465 1         2 delete $rh_components->{$c};
466             }
467             # catch values containing URLs
468             elsif ($rh_components->{$c} =~ m|https?://|) {
469 1         4 delete $rh_components->{$c};
470             }
471             }
472 450         828 return;
473             }
474              
475             sub _minimal_components {
476 447     447   606 my $self = shift;
477 447   50     746 my $rh_components = shift || return;
478 447         789 my @required_components = qw(road postcode); #FIXME - should be in conf
479 447         519 my $missing = 0; # number of required components missing
480              
481 447         565 my $minimal_threshold = 2;
482 447         561 foreach my $c (@required_components) {
483 894 100       1505 $missing++ if (!defined($rh_components->{$c}));
484 894 100       1582 return 0 if ($missing == $minimal_threshold);
485             }
486 399         910 return 1;
487             }
488              
489             my %valid_replacement_components = ('state' => 1,);
490              
491             # determines which country code to use
492             # may also override other configuration if we are dealing with
493             # a dependent territory
494             sub _determine_country_code {
495 450     450   569 my $self = shift;
496 450   50     731 my $rh_components = shift || return;
497              
498             # FIXME - validate it is a valid country
499 450 100       1056 return if (!defined($rh_components->{country_code}));
500              
501 446 50       1134 if (my $cc = lc($rh_components->{country_code})) {
502              
503             # is it two letters long?
504 446 50       1734 return if ($cc !~ m/^[a-z][a-z]$/);
505 446 50       924 return 'GB' if ($cc eq 'uk');
506              
507 446         870 $cc = uc($cc);
508              
509             # check if the configuration tells us to use
510             # the configuration of another country
511             # used in cases of dependent territories like
512             # American Samoa (AS) and Puerto Rico (PR)
513 446 100 100     2224 if ( defined($self->{templates}{$cc})
514             && defined($self->{templates}{$cc}{use_country}))
515             {
516 49         104 my $old_cc = $cc;
517 49         117 $cc = $self->{templates}{$cc}{use_country};
518 49 100       142 if (defined($self->{templates}{$old_cc}{change_country})) {
519              
520 36         78 my $new_country = $self->{templates}{$old_cc}{change_country};
521 36 100       140 if ($new_country =~ m/\$(\w*)/) {
522 2         7 my $component = $1;
523 2 50       7 if (defined($rh_components->{$component})) {
524 2         26 $new_country =~ s/\$$component/$rh_components->{$component}/;
525             } else {
526 0         0 $new_country =~ s/\$$component//;
527             }
528             }
529 36         82 $rh_components->{country} = $new_country;
530             }
531 49 100       141 if (defined($self->{templates}{$old_cc}{add_component})) {
532 12         23 my $tmp = $self->{templates}{$old_cc}{add_component};
533 12         54 my ($k, $v) = split(/=/, $tmp);
534             # check whitelist of valid replacement components
535 12 100       43 if (defined($valid_replacement_components{$k})) {
536 11         34 $rh_components->{$k} = $v;
537             }
538             }
539             }
540              
541 446 100       724 if ($cc eq 'NL') {
542 5 50       14 if (defined($rh_components->{state})) {
543 5 100       29 if ($rh_components->{state} eq 'Curaçao') {
    50          
    100          
544 1         2 $cc = 'CW';
545 1         3 $rh_components->{country} = 'Curaçao';
546             } elsif ($rh_components->{state} =~ m/^sint maarten/i) {
547 0         0 $cc = 'SX';
548 0         0 $rh_components->{country} = 'Sint Maarten';
549             } elsif ($rh_components->{state} =~ m/^Aruba/i) {
550 1         7 $cc = 'AW';
551 1         3 $rh_components->{country} = 'Aruba';
552             }
553             }
554             }
555 446         1600 return $cc;
556             }
557 0         0 return;
558             }
559              
560             # hacks for bad country data
561             sub _fix_country {
562 449     449   537 my $self = shift;
563 449   50     730 my $rh_components = shift || return;
564              
565             # is the country a number?
566             # if so, and there is a state, use state as country
567 449 100       763 if (defined($rh_components->{country})) {
568 446 100       1513 if (looks_like_number($rh_components->{country})) {
569 1 50       9 if (defined($rh_components->{state})) {
570 1         6 $rh_components->{country} = $rh_components->{state};
571 1         3 delete $rh_components->{state};
572             }
573             }
574             }
575 449         610 return;
576             }
577              
578             # sets and returns a state code
579             # note may also set other values in some odd edge cases
580             sub _add_state_code {
581 454     454   1063 my $self = shift;
582 454         559 my $rh_components = shift;
583 454         846 return $self->_add_code('state', $rh_components);
584             }
585              
586             sub _add_county_code {
587 450     450   997 my $self = shift;
588 450         443 my $rh_components = shift;
589 450         666 return $self->_add_code('county', $rh_components);
590             }
591              
592             sub _add_code {
593 904     904   910 my $self = shift;
594 904   50     1574 my $keyname = shift // return;
595 904         970 my $rh_components = shift;
596 904 100       1391 return if !$rh_components->{country_code}; # do we know country?
597 895 100       1650 return if !$rh_components->{$keyname}; # do we know state/county?
598              
599 559         965 my $code = $keyname . '_code';
600              
601 559 100       1129 if (defined($rh_components->{$code})) { # do we already have code?
602             # but could have situation
603             # where code and long name are
604             # the same which we want to correct
605 22 100       75 if ($rh_components->{$code} ne $rh_components->{$keyname}) {
606 21         42 return;
607             }
608             }
609              
610             # ensure country_code is uppercase as we use it as conf key
611 538         873 $rh_components->{country_code} = uc($rh_components->{country_code});
612 538         691 my $cc = $rh_components->{country_code};
613              
614 538 100       1638 if (my $mapping = $self->{$code . 's'}{$cc}) {
615              
616 282         471 my $name = $rh_components->{$keyname};
617 282         468 my $uc_name = uc($name);
618              
619 282         9174 LOCCODE: foreach my $abbrv (keys %$mapping) {
620              
621 6576         6261 my @confnames; # can have multiple names for the place
622             # for example in different languages
623              
624 6576 100       8459 if (is_hashref($mapping->{$abbrv})) {
625 766         768 push(@confnames, values %{$mapping->{$abbrv}});
  766         1924  
626             } else {
627 5810         7113 push(@confnames, $mapping->{$abbrv});
628             }
629              
630 6576         6933 foreach my $confname (@confnames) {
631 7358 100       12132 if ($uc_name eq uc($confname)) {
632 180         339 $rh_components->{$code} = $abbrv;
633 180         414 last LOCCODE;
634             }
635             # perhaps instead of passing in a name, we passed in a code
636             # example: state => 'NC'
637             # we want to turn that into
638             # state => 'North Carolina'
639             # state_code => 'NC'
640             #
641 7178 100       11690 if ($uc_name eq $abbrv) {
642 6         12 $rh_components->{$keyname} = $confname;
643 6         11 $rh_components->{$code} = $abbrv;
644 6         14 last LOCCODE;
645             }
646             }
647             }
648             # didn't find a valid code or name
649              
650             # try again for odd variants like "United States Virgin Islands"
651 282 100       885 if ($keyname eq 'state') {
652 253 100       550 if (!defined($rh_components->{state_code})) {
653 80 100       183 if ($cc eq 'US') {
654 2 50       9 if ($rh_components->{state} =~ m/^united states/i) {
655 0         0 my $state = $rh_components->{state};
656 0         0 $state =~ s/^United States/US/i;
657 0         0 foreach my $k (keys %$mapping) {
658 0 0       0 if (uc($state) eq uc($k)) {
659 0         0 $rh_components->{state_code} = $mapping->{$k};
660 0         0 last;
661             }
662             }
663             }
664 2 50       11 if ($rh_components->{state} =~ m/^washington,? d\.?c\.?/i) {
665 2         5 $rh_components->{state_code} = 'DC';
666 2         78 $rh_components->{state} = 'District of Columbia';
667 2         8 $rh_components->{city} = 'Washington';
668             }
669             }
670             }
671             }
672             }
673 538         871 return $rh_components->{$code};
674             }
675              
676             sub _apply_replacements {
677 452     452   1746 my $self = shift;
678 452         464 my $rh_components = shift;
679 452         752 my $raa_rules = shift;
680              
681 452 50       650 if ($debug){
682 0         0 say STDERR "in _apply_replacements";
683 0         0 say STDERR Dumper $raa_rules;
684             }
685              
686 452         1058 foreach my $component (keys %$rh_components) {
687 3887         15096 foreach my $ra_fromto (@$raa_rules) {
688              
689 8121         71499 my $regexp;
690             # do key specific replacement
691 8121 100       27914 if ($ra_fromto->[0] =~ m/^$component=/){
692 82         160 my $from = $ra_fromto->[0];
693 82         418 $from =~ s/^$component=//;
694 82 100       215 if ($rh_components->{$component} eq $from){
695 5         14 $rh_components->{$component} = $ra_fromto->[1];
696             } else {
697 77         140 $regexp = $from;
698             }
699             } else {
700 8039         10332 $regexp = $ra_fromto->[0];
701             }
702 8121 100       12504 if (defined($regexp)){
703             try {
704 8116     8116   202839 my $re = qr/$regexp/;
705 8115         32066 $rh_components->{$component} =~ s/$re/$ra_fromto->[1]/;
706             } catch {
707 1     1   21 warn "invalid replacement: " . join(', ', @$ra_fromto);
708 8116         31407 };
709             }
710             }
711             }
712 452         2313 return $rh_components;
713             }
714              
715             sub _abbreviate {
716 7     7   653 my $self = shift;
717 7   50     15 my $rh_comp = shift // return;
718              
719             # do we know the country?
720 7 100       15 if (!defined($rh_comp->{country_code})) {
721 2 100       5 if ($show_warnings){
722 1         3 my $error_msg = 'no country_code, unable to abbreviate';
723 1 50       8 if (defined($rh_comp->{country})) {
724 1         5 $error_msg .= ' - country: ' . $rh_comp->{country};
725             }
726 1         16 warn $error_msg
727             }
728 2         87 return;
729             }
730              
731             # do we have abbreviations for this country?
732 5         9 my $cc = uc($rh_comp->{country_code});
733              
734             # 1. which languages?
735 5 50       13 if (defined($self->{country2lang}{$cc})) {
736              
737 5         15 my @langs = split(/,/, $self->{country2lang}{$cc});
738              
739 5         9 foreach my $lang (@langs) {
740             # do we have abbrv for this lang?
741 9 100       22 if (defined($self->{abbreviations}->{$lang})) {
742              
743 6         10 my $rh_abbr = $self->{abbreviations}->{$lang};
744 6         15 foreach my $comp_name (keys %$rh_abbr) {
745 10 50       17 next if (!defined($rh_comp->{$comp_name}));
746 10         11 foreach my $long (keys %{$rh_abbr->{$comp_name}}) {
  10         29  
747 62         102 my $short = $rh_abbr->{$comp_name}->{$long};
748 62         436 $rh_comp->{$comp_name} =~ s/\b$long\b/$short/;
749             }
750             }
751             } else {
752             #warn "no abbreviations defined for lang $lang";
753             }
754             }
755             }
756              
757 5         11 return $rh_comp;
758             }
759              
760             # " abc,,def , ghi " => 'abc, def, ghi'
761             sub _clean {
762 904     904   2756 my $self = shift;
763 904   100     1624 my $out = shift // return;
764 903 50       1452 if ($debug){
765 0         0 say STDERR "entering _clean \n$out";
766             }
767              
768 903         1550 $out =~ s/\&#39\;/'/g;
769              
770 903         6235 $out =~ s/[\},\s]+$//;
771 903         2145 $out =~ s/^[,\s]+//;
772              
773 903         1303 $out =~ s/^- //; # line starting with dash due to a parameter missing
774              
775 903         1465 $out =~ s/,\s*,/, /g; # multiple commas to one
776 903         1795 $out =~ s/\h+,\h+/, /g; # one horiz whitespace behind comma
777 903         2523 $out =~ s/\h\h+/ /g; # multiple horiz whitespace to one
778 903         2210 $out =~ s/\h\n/\n/g; # horiz whitespace, newline to newline
779 903         1557 $out =~ s/\n,/\n/g; # newline comma to just newline
780 903         1225 $out =~ s/,,+/,/g; # multiple commas to one
781 903         1238 $out =~ s/,\n/\n/g; # comma newline to just newline
782 903         2264 $out =~ s/\n\h+/\n/g; # newline plus space to newline
783 903         2137 $out =~ s/\n\n+/\n/g; # multiple newline to one
784              
785             # final dedupe across and within lines
786 903         2671 my @before_pieces = split(/\n/, $out);
787 903         1409 my %seen_lines;
788             my @after_pieces;
789 903         1385 foreach my $line (@before_pieces) {
790 3493         5375 $line =~ s/^\h+//g;
791 3493         5163 $line =~ s/\h+$//g;
792 3493         6884 $seen_lines{$line}++;
793 3493 100       6060 next if ($seen_lines{$line} > 1);
794             # now dedupe within the line
795 3475         5854 my @before_words = split(/,/, $line);
796 3475         4012 my %seen_words;
797             my @after_words;
798 3475         4018 foreach my $w (@before_words) {
799 3675         5442 $w =~ s/^\h+//g;
800 3675         5342 $w =~ s/\h+$//g;
801 1 100   1   9 if (lc($w) ne 'new york') {
  1         2  
  1         12  
  3675         7460  
802 3665         30877 $seen_words{$w}++;
803             }
804 3675 100 100     10629 next if ((defined($seen_words{$w})) && ($seen_words{$w} > 1));
805 3674         6112 push(@after_words, $w);
806             }
807 3475         5207 $line = join(', ', @after_words);
808 3475         6755 push(@after_pieces, $line);
809             }
810 903         1547 $out = join("\n", @after_pieces);
811              
812 903         1632 $out =~ s/^\s+//; # remove leading whitespace
813 903         2447 $out =~ s/\s+$//; # remove end whitespace
814              
815 903         1233 $out .= "\n"; # add final newline
816 903         2610 return $out; # we are done
817             }
818              
819             sub _render_template {
820 450     450   2406 my $self = shift;
821 450         499 my $thtemplate = shift;
822 450         483 my $components = shift;
823              
824             # Mustache calls it context
825 450         7759 my $context = clone($components);
826 450 50       1194 say STDERR 'context: ' . Dumper $context if ($debug);
827 450         1108 my $output = $thtemplate->render($context);
828              
829 450         437848 $output = $self->_evaluate_template_lamdas($output);
830              
831 450 50       941 say STDERR "in _render before _clean: $output" if ($debug);
832 450         1669 $output = $self->_clean($output);
833              
834             # is it empty?
835             # if yes and there is only one component then just use that one
836 450 100       1370 if ($output !~ m/\w/) {
837 2         8 my @comps = sort keys %$components;
838 2 50       7 if (scalar(@comps) == 1) {
839 0         0 foreach my $k (@comps) {
840 0         0 $output = $components->{$k};
841             }
842             } # FIXME what if more than one?
843             }
844 450         1663 return $output;
845             }
846              
847             # Text::Hogan apparently caches lambdas when rendering templates. In the past
848             # we needed our lambda 'first', example
849             # {{#first}} {{{city}}} || {{{town}}} {{/first}}
850             # to evaluate the componentes. Whenever the lambda was called with different
851             # component values it consumed memory. Now replace with a simpler implementation
852             #
853             sub _replace_template_lambdas {
854 450     450   1211 my $self = shift;
855 450         682 my $template_text = shift;
856 450         7051 $template_text =~ s!\Q{{#first}}\E(.+?)\Q{{/first}}\E!FIRSTSTART${1}FIRSTEND!g;
857 450         1415 return $template_text;
858             }
859              
860             # We only use a lambda named 'first'
861             sub _evaluate_template_lamdas {
862 450     450   667 my $self = shift;
863 450         663 my $text = shift;
864 450         4411 $text =~ s!FIRSTSTART\s*(.+?)\s*FIRSTEND!_select_first($1)!seg;
  766         1387  
865 450         1126 return $text;
866             }
867              
868             # '|| val1 || || val3' => 'val1'
869             sub _select_first {
870 766     766   1320 my $text = shift;
871 766         4531 my @a_parts = grep { length($_) } split(/\s*\|\|\s*/, $text);
  2553         4097  
872 766 100       4611 return scalar(@a_parts) ? $a_parts[0] : '';
873             }
874              
875             my %small_district = (
876             'br' => 1,
877             'cr' => 1,
878             'es' => 1,
879             'ni' => 1,
880             'py' => 1,
881             'ro' => 1,
882             'tg' => 1,
883             'tm' => 1,
884             'xk' => 1,
885             );
886              
887             # correct the alias for "district"
888             # in OSM some countries use district to mean "city_district"
889             # others to mean "state_district"
890             sub _set_district_alias {
891 445     445   521 my $self = shift;
892 445         485 my $cc = shift;
893              
894             # this may get called repeatedly
895             # no need to do the work again
896 445 50       700 if (defined($cc)){
897 445         687 my $ucc = uc($cc);
898 445 100       1131 return if (defined($self->{set_district_alias}{$ucc}));
899 224         527 $self->{set_district_alias}{$ucc} = 1;
900             }
901              
902 224         276 my $oldalias;
903 224 50       474 if (defined($small_district{$cc})){
904 0         0 $self->{component2type}{district} = 'neighbourhood';
905 0         0 $oldalias = 'state_district';
906              
907             # add to the neighbourhood alias list
908             # though of course we are just sticking it at the end
909 0         0 push(@{$self->{component_aliases}{'neighbourhood'}}, 'district');
  0         0  
910              
911             } else {
912             # set 'district' to be type 'state_district'
913 224         364 $self->{component2type}{district} = 'state_district';
914 224         312 $oldalias = 'neighbourhood';
915              
916             # add to the state_district alias list
917 224         287 push(@{$self->{component_aliases}{'state_district'}}, 'district');
  224         597  
918             }
919              
920             # remove from the old alias list
921 224         308 my @temp = grep { $_ ne 'district' } @{$self->{component_aliases}{$oldalias}};
  2862         4272  
  224         502  
922 224         625 $self->{component_aliases}{$oldalias} = \@temp;
923 224         446 return;
924             }
925              
926              
927             # returns []
928             sub _find_unknown_components {
929 447     447   1016 my $self = shift;
930 447         430 my $rh_components = shift;
931              
932 447         455 my %h_known = map { $_ => 1 } @{$self->{ordered_components}};
  27774         38593  
  447         928  
933 447         3785 my @a_unknown = grep { !exists($h_known{$_}) } sort keys %$rh_components;
  4036         5954  
934 447         2465 return \@a_unknown;
935             }
936              
937             1;
938              
939             __END__
940              
941             =pod
942              
943             =encoding UTF-8
944              
945             =head1 NAME
946              
947             Geo::Address::Formatter - take structured address data and format it according to the various global/country rules
948              
949             =head1 VERSION
950              
951             version 1.997
952              
953             =head1 SYNOPSIS
954              
955             #
956             # get the templates (or use your own)
957             # git clone git@github.com:OpenCageData/address-formatting.git
958             #
959             my $GAF = Geo::Address::Formatter->new( conf_path => '/path/to/templates' );
960              
961             my $components = { ... }
962             my $text = $GAF->format_address($components, { country => 'FR' } );
963             my $rh_final_components = $GAF->final_components();
964             #
965             # or if we want shorter output
966             #
967             my $short_text = $GAF->format_address($components, { country => 'FR', abbreviate => 1, });
968              
969             =head2 new
970              
971             my $GAF = Geo::Address::Formatter->new( conf_path => '/path/to/templates' );
972              
973             Returns one instance. The I<conf_path> is required.
974              
975             Optional parameters are:
976              
977             I<debug>: prints tons of debugging info for use in development.
978              
979             I<no_warnings>: turns off a few warnings if configuration is not optimal.
980              
981             I<only_address>: formatted will only contain known components (will not include POI names like). Note, can be overridden with optional param to format_address method.
982              
983             =head2 final_components
984              
985             my $rh_components = $GAF->final_components();
986              
987             returns a reference to a hash of the final components that are set at the
988             completion of B<format_address>. Warns if called before they have been set
989             (unless I<no_warnings> was set at object creation).
990              
991             =head2 format_address
992              
993             my $text = $GAF->format_address(\%components, \%options );
994              
995             Given a structures address (hashref) and options (hashref) returns a
996             formatted address.
997              
998             Possible options are:
999              
1000             'abbreviate', if supplied common abbreviations are applied
1001             to the resulting output.
1002              
1003             'address_template', a mustache format template to be used instead of the template
1004             defined in the configuration
1005              
1006             'country', which should be an uppercase ISO 3166-1:alpha-2 code
1007             e.g. 'GB' for Great Britain, 'DE' for Germany, etc.
1008             If ommited we try to find the country in the address components.
1009              
1010             'only_address', same as only_address global option but set at formatting level
1011              
1012             =head1 DESCRIPTION
1013              
1014             You have a structured postal address (hash) and need to convert it into a
1015             readable address based on the format of the address country.
1016              
1017             For example, you have:
1018              
1019             {
1020             house_number => 12,
1021             street => 'Avenue Road',
1022             postcode => 45678,
1023             city => 'Deville'
1024             }
1025              
1026             you need:
1027              
1028             Great Britain: 12 Avenue Road, Deville 45678
1029             France: 12 Avenue Road, 45678 Deville
1030             Germany: Avenue Road 12, 45678 Deville
1031             Latvia: Avenue Road 12, Deville, 45678
1032              
1033             It gets more complicated with 200+ countries and territories and dozens more
1034             address components to consider.
1035              
1036             This module comes with a minimal configuration to run tests. Instead of
1037             developing your own configuration please use (and contribute to)
1038             those in https://github.com/OpenCageData/address-formatting
1039             which includes test cases.
1040              
1041             Together we can address the world!
1042              
1043             =head1 AUTHOR
1044              
1045             Ed Freyfogle
1046              
1047             =head1 COPYRIGHT AND LICENSE
1048              
1049             This software is copyright (c) 2023 by Opencage GmbH.
1050              
1051             This is free software; you can redistribute it and/or modify it under
1052             the same terms as the Perl 5 programming language system itself.
1053              
1054             =cut