File Coverage

lib/Geo/Address/Formatter.pm
Criterion Covered Total %
statement 448 508 88.1
branch 180 228 78.9
condition 32 46 69.5
subroutine 42 47 89.3
pod 3 3 100.0
total 705 832 84.7


line stmt bran cond sub pod time code
1             package Geo::Address::Formatter;
2             $Geo::Address::Formatter::VERSION = '1.996';
3             # ABSTRACT: take structured address data and format it according to the various global/country rules
4              
5 6     6   32131 use strict;
  6         11  
  6         173  
6 6     6   29 use warnings;
  6         8  
  6         155  
7 6     6   31 use feature qw(say);
  6         15  
  6         546  
8 6     6   2164 use Clone qw(clone);
  6         12116  
  6         342  
9 6     6   1284 use Data::Dumper;
  6         13602  
  6         354  
10             $Data::Dumper::Sortkeys = 1;
11 6     6   36 use File::Basename qw(dirname);
  6         10  
  6         305  
12 6     6   2026 use File::Find::Rule;
  6         32956  
  6         44  
13 6     6   2785 use Ref::Util qw(is_hashref);
  6         8046  
  6         458  
14 6     6   43 use Scalar::Util qw(looks_like_number);
  6         14  
  6         285  
15 6     6   2630 use Text::Hogan::Compiler;
  6         40252  
  6         207  
16 6     6   2362 use Try::Catch;
  6         4130  
  6         346  
17 6     6   1695 use YAML::XS qw(LoadFile);
  6         11072  
  6         345  
18 6     6   39 use utf8;
  6         54  
  6         34  
19              
20             my $THC = Text::Hogan::Compiler->new;
21              
22             # optional params
23             my $show_warnings = 1;
24             my $debug = 0;
25             my $only_address = 0;
26              
27              
28             sub new {
29 17     17 1 14759 my ($class, %params) = @_;
30              
31 17         36 my $self = {};
32 17   50     65 my $conf_path = $params{conf_path} || die "no conf_path set";
33              
34             # optional params
35 17 50 66     57 if ( defined($params{no_warnings}) && ($params{no_warnings})){
36 1         3 $show_warnings = 0;
37             }
38 17   66     64 $only_address = (defined($params{only_address}) && $params{only_address}) // 0;
      50        
39 17   66     68 $debug = (defined($params{debug}) && $params{debug}) // 0;
      50        
40              
41 17         44 $self->{final_components} = undef;
42 17         26 $self->{set_district_alias} = {};
43              
44 17         29 bless($self, $class);
45              
46 17 50       47 say STDERR "************* in Geo::Address::Formatter::new ***" if ($debug);
47            
48 17 100       43 if ($self->_read_configuration($conf_path)){
49 16         73 return $self;
50             }
51 1         16 die 'unable to read configuration';
52             }
53              
54             sub _read_configuration {
55 17     17   26 my $self = shift;
56 17         26 my $path = shift;
57              
58 17 50       353 return if (! -e $path);
59              
60 17         598 my @a_filenames = File::Find::Rule->file()->name('*.yaml')->in($path . '/countries');
61              
62 17         17161 $self->{templates} = {};
63 17         37 $self->{component_aliases} = {};
64 17         29 $self->{component2type} = {};
65 17         32 $self->{ordered_components} = [];
66              
67             # read the config file(s)
68 17         34 my $loaded = 0;
69 17         56 foreach my $filename (sort @a_filenames) {
70             try {
71 16     16   388 my $rh_templates = LoadFile($filename);
72              
73             # if file 00-default.yaml defines 'DE' (Germany) and
74             # file 01-germany.yaml does as well, then the second
75             # occurance of the key overwrites the first.
76 16         32160 foreach (keys %$rh_templates) {
77 2910         3858 $self->{templates}{$_} = $rh_templates->{$_};
78             }
79 16         303 $loaded = 1;
80             } catch {
81 0     0   0 warn "error parsing country configuration in $filename: $_";
82 16         114 };
83             }
84 17 100       326 return if ($loaded == 0);
85              
86             # see if we can load the components
87             try {
88 16 50   16   405 say STDERR "loading components" if ($debug);
89 16         61 my @c = LoadFile($path . '/components.yaml');
90              
91 16 50       3605 if ($debug){
92 0         0 say STDERR Dumper \@c;
93             }
94              
95 16         43 foreach my $rh_c (@c) {
96 234 50       345 if (defined($rh_c->{name})){
97 234 100       310 if (defined($rh_c->{aliases})){
98 128         267 $self->{component_aliases}{$rh_c->{name}} = $rh_c->{aliases};
99             } else {
100 106         236 $self->{component_aliases}{$rh_c->{name}} = [];
101             }
102             }
103             }
104              
105 16         26 foreach my $rh_c (@c) {
106 234         230 push(@{$self->{ordered_components}}, $rh_c->{name});
  234         386  
107 234         368 $self->{component2type}->{$rh_c->{name}} = $rh_c->{name};
108              
109 234 100       356 if (defined($rh_c->{aliases})) {
110 128         130 foreach my $alias (@{$rh_c->{aliases}}) {
  128         174  
111 480         446 push(@{$self->{ordered_components}}, $alias);
  480         599  
112 480         909 $self->{component2type}->{$alias} = $rh_c->{name};
113             }
114             }
115             }
116 16 50       113 if ($debug){
117 0         0 say STDERR 'component_aliases';
118 0         0 say STDERR Dumper $self->{component_aliases};
119 0         0 say STDERR 'ordered_components';
120 0         0 say STDERR Dumper $self->{ordered_components};
121 0         0 say STDERR 'component2type';
122 0         0 say STDERR Dumper $self->{component2type};
123             }
124             } catch {
125 0     0   0 warn "error parsing component configuration: $_";
126 16         113 };
127              
128             # get the county and state codes and country2lang conf
129 16         336 my @conf_files = qw(county_codes state_codes country2lang);
130 16         30 foreach my $cfile (@conf_files) {
131 48         90198 $self->{$cfile} = {};
132 48         158 my $yfile = $path . '/' . $cfile . '.yaml';
133 48 100       956 if (-e $yfile) {
134             try {
135 42     42   997 $self->{$cfile} = LoadFile($yfile);
136             } catch {
137 0     0   0 warn "error parsing $cfile configuration: $_";
138 42         480 };
139             }
140             }
141              
142             # get the abbreviations
143 16         7139 my @abbrv_filenames = File::Find::Rule->file()->name('*.yaml')->in($path . '/abbreviations');
144              
145             # read the config files
146 16         23756 foreach my $abbrv_file (@abbrv_filenames) {
147             try {
148 236 50   236   5899 if ($abbrv_file =~ m/\/(\w\w)\.yaml$/) {
149 236         571 my $lang = $1; # two letter lang code like 'en'
150 236         472 my $rh_c = LoadFile($abbrv_file);
151 236         33731 $self->{abbreviations}->{$lang} = $rh_c;
152             }
153             } catch {
154 0     0   0 warn "error parsing abbrv configuration in $abbrv_file: $_";
155 236         4126 };
156             }
157             #say Dumper $self->{abbreviations};
158             #say Dumper $self->{country2lang};
159 16         252 return 1;
160             }
161              
162              
163             sub final_components {
164 6     6 1 1351 my $self = shift;
165 6 100       21 if (defined($self->{final_components})) {
166 5         14 return $self->{final_components};
167             }
168 1 50       13 warn 'final_components not yet set' if ($show_warnings);
169 1         84 return;
170             }
171              
172              
173             sub format_address {
174 449     449 1 430459 my $self = shift;
175 449   50     8765 my $rh_components = clone(shift) || return;
176 449   100     2349 my $rh_options = shift || {};
177              
178             # 1. make sure empty at the beginning
179 449         1613 $self->{final_components} = undef;
180              
181 449 50       948 if ($debug){
182 0         0 say STDERR "*** in format_address ***";
183 0         0 say STDERR Dumper $rh_options;
184 0         0 say STDERR Dumper $rh_components;
185             }
186              
187             # 2. deal with the options
188              
189             # 2a. which country format will we use?
190             # might have been specified in options
191             # otherwise look at components
192             my $cc = $rh_options->{country}
193 449   100     1404 || $self->_determine_country_code($rh_components)
194             || '';
195              
196 449 100       888 if ($cc) {
197 445         590 $rh_components->{country_code} = $cc;
198 445         814 $self->_set_district_alias($cc);
199             }
200              
201             # 2b. should we abbreviate?
202 449   100     1428 my $abbrv = $rh_options->{abbreviate} // 0;
203              
204             # 2c. was only_address set at the formatting level
205 449         644 my $oa = $only_address;
206 449 100       727 if (defined($rh_options->{only_address})){
207 2         4 $oa = $rh_options->{only_address};
208             }
209              
210 449 50       667 if ($debug){
211 0         0 say STDERR "component_aliases";
212 0         0 say STDERR Dumper $self->{component_aliases};
213             }
214              
215             # done with the options
216              
217             # 3. set the aliases, unless this would overwrite something
218             # need to do this in the right order (as defined in the components file)
219             # For example:
220             # both 'city_district' and 'suburb' are aliases of 'neighbourhood'
221             # so which one should we use if both are present?
222             # We should use the one defined first in the list
223              
224 449         480 my $rhh_p2a;
225 449         1573 foreach my $c (keys %$rh_components){
226              
227             # might not need an alias as it is a primary type
228 3532 100       5870 next if (defined($self->{component_aliases}{$c}));
229              
230             # it is not a primary type
231             # is there an alias?
232 727 100       1356 if (defined($self->{component2type}{$c})){
233 494         688 my $ptype = $self->{component2type}{$c};
234             # but is it already set?
235 494 100       942 if (! defined($rh_components->{$ptype}) ){
236             # no, we will set it later
237 395         934 $rhh_p2a->{$ptype}{$c} = 1;
238              
239             }
240             }
241             }
242              
243             # now we know which primary types have aliases
244 449         1094 foreach my $ptype (keys %$rhh_p2a){
245             # is there more than one?
246 356         404 my @aliases = keys %{$rhh_p2a->{$ptype}};
  356         823  
247 356 100       800 if (scalar @aliases == 1){
248 318         699 $rh_components->{$ptype} = $rh_components->{$aliases[0]};
249 318         582 next; # we are done with this ptype
250             }
251              
252             # if there is more than one we need to go through the list
253             # so we do them in the right order
254 38         77 foreach my $c (@{$self->{component_aliases}->{$ptype}}){
  38         128  
255 44 100       104 if (defined($rh_components->{$c})){
256 38         77 $rh_components->{$ptype} = $rh_components->{$c};
257 38         75 last; # we are done with this ptype
258             }
259             }
260             }
261              
262 449 50       801 if ($debug){
263 0         0 say STDERR "after component_aliases applied";
264 0         0 say STDERR Dumper $rh_components;
265             }
266              
267             # 4. deal wtih terrible inputs
268 449         1062 $self->_sanity_cleaning($rh_components);
269 449 50       673 if ($debug){
270 0         0 say STDERR "after sanity_cleaning applied";
271 0         0 say STDERR Dumper $rh_components;
272             }
273              
274             # 5. determine the template
275 449         500 my $template_text;
276 449   66     1143 my $rh_config = $self->{templates}{uc($cc)} || $self->{templates}{default};
277            
278 449 100       759 if (defined($rh_options->{address_template})) {
279 2         5 $template_text = $rh_options->{address_template};
280             }
281             else {
282              
283 447 100       798 if (defined($rh_config->{address_template})) {
    50          
284 445         902 $template_text = $rh_config->{address_template};
285             } elsif (defined($self->{templates}{default}{address_template})) {
286 2         4 $template_text = $self->{templates}{default}{address_template};
287             }
288            
289             # do we have the minimal components for an address?
290             # or should we instead use the fallback template?
291 447 100       859 if (!$self->_minimal_components($rh_components)) {
292 48 50       132 say STDERR "using fallback" if ($debug);
293 48 100       189 if (defined($rh_config->{fallback_template})) {
    100          
294 33         87 $template_text = $rh_config->{fallback_template};
295             } elsif (defined($self->{templates}{default}{fallback_template})) {
296 14         34 $template_text = $self->{templates}{default}{fallback_template};
297             }
298             # no fallback
299             }
300              
301             }
302              
303 449 50       748 say STDERR 'template text: ' . $template_text if ($debug);
304              
305             # 6. clean up the components, possibly add codes
306 449         970 $self->_fix_country($rh_components);
307 449 50       652 if ($debug){
308 0         0 say STDERR "after fix_country";
309 0         0 say STDERR Dumper $rh_components;
310             }
311              
312 449         1465 $self->_apply_replacements($rh_components, $rh_config->{replace});
313 449 50       967 if ($debug){
314 0         0 say STDERR "after applying_replacements applied";
315 0         0 say STDERR Dumper $rh_components;
316             }
317 449         1000 $self->_add_state_code($rh_components);
318 449         925 $self->_add_county_code($rh_components);
319 449 50       740 if ($debug){
320 0         0 say STDERR "after adding codes";
321 0         0 say STDERR Dumper $rh_components;
322             }
323              
324             # 7. add the attention, if needed
325 449 50       773 if ($debug){
326 0         0 say STDERR "object level only_address: $only_address";
327 0         0 say STDERR "formatting level only_address: $oa";
328             }
329              
330 449 100       630 if ($oa){
331 3 50       8 if ($debug){
332 0         0 say STDERR "not looking for unknown_components";
333 0         0 say STDERR "only_address was specified";
334             }
335             }
336             else {
337 446         832 my $ra_unknown = $self->_find_unknown_components($rh_components);
338 446 50       901 if ($debug){
339 0         0 say STDERR "unknown_components:";
340 0         0 say STDERR Dumper $ra_unknown;
341             }
342 446 100       905 if (scalar(@$ra_unknown)){
343             $rh_components->{attention} =
344 223         366 join(', ', map { $rh_components->{$_} } @$ra_unknown);
  229         765  
345 223 50       566 if ($debug){
346 0         0 say STDERR "putting unknown_components in 'attention'";
347             }
348             }
349             }
350              
351             # 8. abbreviate, if needed
352 449 100       871 if ($abbrv) {
353 5         10 $rh_components = $self->_abbreviate($rh_components);
354             }
355              
356             # 9. prepare the template
357 449         946 $template_text = $self->_replace_template_lambdas($template_text);
358              
359             # 10. compiled the template
360 449         1860 my $compiled_template =
361             $THC->compile($template_text, {'numeric_string_as_string' => 1});
362              
363 449 50       682249 if ($debug){
364 0         0 say STDERR "before _render_template";
365 0         0 say STDERR Dumper $rh_components;
366 0         0 say STDERR "template: ";
367 0         0 say STDERR Dumper $compiled_template;
368             }
369              
370             # 11. render the template
371 449         1023 my $text = $self->_render_template($compiled_template, $rh_components);
372 449 50       944 if ($debug){
373 0         0 say STDERR "text after _render_template $text";
374             }
375              
376             # 11. postformatting
377 449         1406 $text = $self->_postformat($text, $rh_config->{postformat_replace});
378              
379             # 12. clean again
380 449         1009 $text = $self->_clean($text);
381              
382             # 13. set final components
383 449         776 $self->{final_components} = $rh_components;
384              
385             # all done
386 449         1863 return $text;
387             }
388              
389             # remove duplicates ("Berlin, Berlin"), do replacements and similar
390             sub _postformat {
391 451     451   1193 my $self = shift;
392 451         502 my $text = shift;
393 451         725 my $raa_rules = shift;
394              
395 451 50       698 if ($debug){
396 0         0 say STDERR "entering _postformat: $text"
397             }
398              
399             # remove duplicates
400 451         1173 my @before_pieces = split(/, /, $text);
401 451         594 my %seen;
402             my @after_pieces;
403 451         614 foreach my $piece (@before_pieces) {
404 557         916 $piece =~ s/^\s+//g;
405 557         1446 $seen{$piece}++;
406 557 100       1388 if (lc($piece) ne 'new york') {
407 553 100       995 next if ($seen{$piece} > 1);
408             }
409 556         936 push(@after_pieces, $piece);
410             }
411 451         838 $text = join(', ', @after_pieces);
412              
413             # do any country specific rules
414 451         745 foreach my $ra_fromto (@$raa_rules) {
415             try {
416 505     505   14274 my $regexp = qr/$ra_fromto->[0]/;
417 505         915 my $replacement = $ra_fromto->[1];
418              
419             # ultra hack to do substitution
420             # limited to $1 and $2, should really be a while loop
421             # doing every substitution
422              
423 505 100       1216 if ($replacement =~ m/\$\d/) {
424 45 100       376 if ($text =~ m/$regexp/) {
425 20         48 my $tmp1 = $1;
426 20         45 my $tmp2 = $2;
427 20         37 my $tmp3 = $3;
428 20         71 $replacement =~ s/\$1/$tmp1/;
429 20         64 $replacement =~ s/\$2/$tmp2/;
430 20         46 $replacement =~ s/\$3/$tmp3/;
431             }
432             }
433 505         2426 $text =~ s/$regexp/$replacement/;
434             } catch {
435 0     0   0 warn "invalid replacement: " . join(', ', @$ra_fromto);
436 505         6245 };
437             }
438 451         2820 return $text;
439             }
440              
441             sub _sanity_cleaning {
442 450     450   1352 my $self = shift;
443 450   50     867 my $rh_components = shift || return;
444              
445             # catch insane postcodes
446 450 100       826 if (defined($rh_components->{'postcode'})) {
447 309 100       1343 if (length($rh_components->{'postcode'}) > 20) {
    100          
    100          
448 1         3 delete $rh_components->{'postcode'};
449             } elsif ($rh_components->{'postcode'} =~ m/\d+;\d+/) {
450             # sometimes OSM has postcode ranges
451 1         2 delete $rh_components->{'postcode'};
452             } elsif ($rh_components->{'postcode'} =~ m/^(\d{5}),\d{5}/) {
453 1         4 $rh_components->{'postcode'} = $1;
454             }
455             }
456              
457             # remove things that might be empty
458 450         1107 foreach my $c (keys %$rh_components) {
459             # catch empty values
460 3888 50       12708 if (!defined($rh_components->{$c})) {
    100          
    100          
461 0         0 delete $rh_components->{$c};
462             }
463             # no chars
464             elsif ($rh_components->{$c} !~ m/\w/) {
465 1         3 delete $rh_components->{$c};
466             }
467             # catch values containing URLs
468             elsif ($rh_components->{$c} =~ m|https?://|) {
469 1         5 delete $rh_components->{$c};
470             }
471             }
472 450         875 return;
473             }
474              
475             sub _minimal_components {
476 447     447   573 my $self = shift;
477 447   50     809 my $rh_components = shift || return;
478 447         865 my @required_components = qw(road postcode); #FIXME - should be in conf
479 447         513 my $missing = 0; # number of required components missing
480              
481 447         500 my $minimal_threshold = 2;
482 447         587 foreach my $c (@required_components) {
483 894 100       1489 $missing++ if (!defined($rh_components->{$c}));
484 894 100       1539 return 0 if ($missing == $minimal_threshold);
485             }
486 399         910 return 1;
487             }
488              
489             my %valid_replacement_components = ('state' => 1,);
490              
491             # determines which country code to use
492             # may also override other configuration if we are dealing with
493             # a dependent territory
494             sub _determine_country_code {
495 450     450   659 my $self = shift;
496 450   50     756 my $rh_components = shift || return;
497              
498             # FIXME - validate it is a valid country
499 450 100       900 return if (!defined($rh_components->{country_code}));
500              
501 446 50       1045 if (my $cc = lc($rh_components->{country_code})) {
502              
503             # is it two letters long?
504 446 50       1955 return if ($cc !~ m/^[a-z][a-z]$/);
505 446 50       879 return 'GB' if ($cc eq 'uk');
506              
507 446         778 $cc = uc($cc);
508              
509             # check if the configuration tells us to use
510             # the configuration of another country
511             # used in cases of dependent territories like
512             # American Samoa (AS) and Puerto Rico (PR)
513 446 100 100     2588 if ( defined($self->{templates}{$cc})
514             && defined($self->{templates}{$cc}{use_country}))
515             {
516 49         116 my $old_cc = $cc;
517 49         93 $cc = $self->{templates}{$cc}{use_country};
518 49 100       150 if (defined($self->{templates}{$old_cc}{change_country})) {
519              
520 36         78 my $new_country = $self->{templates}{$old_cc}{change_country};
521 36 100       121 if ($new_country =~ m/\$(\w*)/) {
522 2         5 my $component = $1;
523 2 50       6 if (defined($rh_components->{$component})) {
524 2         29 $new_country =~ s/\$$component/$rh_components->{$component}/;
525             } else {
526 0         0 $new_country =~ s/\$$component//;
527             }
528             }
529 36         80 $rh_components->{country} = $new_country;
530             }
531 49 100       129 if (defined($self->{templates}{$old_cc}{add_component})) {
532 12         29 my $tmp = $self->{templates}{$old_cc}{add_component};
533 12         53 my ($k, $v) = split(/=/, $tmp);
534             # check whitelist of valid replacement components
535 12 100       43 if (defined($valid_replacement_components{$k})) {
536 11         25 $rh_components->{$k} = $v;
537             }
538             }
539             }
540              
541 446 100       795 if ($cc eq 'NL') {
542 5 50       24 if (defined($rh_components->{state})) {
543 5 100       32 if ($rh_components->{state} eq 'Curaçao') {
    50          
    100          
544 1         2 $cc = 'CW';
545 1         3 $rh_components->{country} = 'Curaçao';
546             } elsif ($rh_components->{state} =~ m/^sint maarten/i) {
547 0         0 $cc = 'SX';
548 0         0 $rh_components->{country} = 'Sint Maarten';
549             } elsif ($rh_components->{state} =~ m/^Aruba/i) {
550 1         2 $cc = 'AW';
551 1         3 $rh_components->{country} = 'Aruba';
552             }
553             }
554             }
555 446         1603 return $cc;
556             }
557 0         0 return;
558             }
559              
560             # hacks for bad country data
561             sub _fix_country {
562 449     449   543 my $self = shift;
563 449   50     788 my $rh_components = shift || return;
564              
565             # is the country a number?
566             # if so, and there is a state, use state as country
567 449 100       801 if (defined($rh_components->{country})) {
568 446 100       1611 if (looks_like_number($rh_components->{country})) {
569 1 50       4 if (defined($rh_components->{state})) {
570 1         2 $rh_components->{country} = $rh_components->{state};
571 1         3 delete $rh_components->{state};
572             }
573             }
574             }
575 449         572 return;
576             }
577              
578             # sets and returns a state code
579             # note may also set other values in some odd edge cases
580             sub _add_state_code {
581 454     454   1050 my $self = shift;
582 454         531 my $rh_components = shift;
583 454         818 return $self->_add_code('state', $rh_components);
584             }
585              
586             sub _add_county_code {
587 450     450   1065 my $self = shift;
588 450         467 my $rh_components = shift;
589 450         708 return $self->_add_code('county', $rh_components);
590             }
591              
592             sub _add_code {
593 904     904   890 my $self = shift;
594 904   50     1611 my $keyname = shift // return;
595 904         904 my $rh_components = shift;
596 904 100       1524 return if !$rh_components->{country_code}; # do we know country?
597 895 100       1589 return if !$rh_components->{$keyname}; # do we know state/county?
598              
599 559         915 my $code = $keyname . '_code';
600              
601 559 100       987 if (defined($rh_components->{$code})) { # do we already have code?
602             # but could have situation
603             # where code and long name are
604             # the same which we want to correct
605 22 100       76 if ($rh_components->{$code} ne $rh_components->{$keyname}) {
606 21         38 return;
607             }
608             }
609              
610             # ensure country_code is uppercase as we use it as conf key
611 538         818 $rh_components->{country_code} = uc($rh_components->{country_code});
612 538         649 my $cc = $rh_components->{country_code};
613              
614 538 100       1691 if (my $mapping = $self->{$code . 's'}{$cc}) {
615              
616 282         421 my $name = $rh_components->{$keyname};
617 282         538 my $uc_name = uc($name);
618              
619 282         9155 LOCCODE: foreach my $abbrv (keys %$mapping) {
620              
621 6670         6507 my @confnames; # can have multiple names for the place
622             # for example in different languages
623              
624 6670 100       8801 if (is_hashref($mapping->{$abbrv})) {
625 883         879 push(@confnames, values %{$mapping->{$abbrv}});
  883         2043  
626             } else {
627 5787         7167 push(@confnames, $mapping->{$abbrv});
628             }
629              
630 6670         7193 foreach my $confname (@confnames) {
631 7635 100       12798 if ($uc_name eq uc($confname)) {
632 180         328 $rh_components->{$code} = $abbrv;
633 180         438 last LOCCODE;
634             }
635             # perhaps instead of passing in a name, we passed in a code
636             # example: state => 'NC'
637             # we want to turn that into
638             # state => 'North Carolina'
639             # state_code => 'NC'
640             #
641 7455 100       12050 if ($uc_name eq $abbrv) {
642 6         14 $rh_components->{$keyname} = $confname;
643 6         14 $rh_components->{$code} = $abbrv;
644 6         13 last LOCCODE;
645             }
646             }
647             }
648             # didn't find a valid code or name
649              
650             # try again for odd variants like "United States Virgin Islands"
651 282 100       885 if ($keyname eq 'state') {
652 253 100       598 if (!defined($rh_components->{state_code})) {
653 80 100       206 if ($cc eq 'US') {
654 2 50       17 if ($rh_components->{state} =~ m/^united states/i) {
655 0         0 my $state = $rh_components->{state};
656 0         0 $state =~ s/^United States/US/i;
657 0         0 foreach my $k (keys %$mapping) {
658 0 0       0 if (uc($state) eq uc($k)) {
659 0         0 $rh_components->{state_code} = $mapping->{$k};
660 0         0 last;
661             }
662             }
663             }
664 2 50       13 if ($rh_components->{state} =~ m/^washington,? d\.?c\.?/i) {
665 2         5 $rh_components->{state_code} = 'DC';
666 2         41 $rh_components->{state} = 'District of Columbia';
667 2         4 $rh_components->{city} = 'Washington';
668             }
669             }
670             }
671             }
672             }
673 538         895 return $rh_components->{$code};
674             }
675              
676             sub _apply_replacements {
677 452     452   1634 my $self = shift;
678 452         465 my $rh_components = shift;
679 452         720 my $raa_rules = shift;
680              
681 452 50       705 if ($debug){
682 0         0 say STDERR "in _apply_replacements";
683 0         0 say STDERR Dumper $raa_rules;
684             }
685              
686 452         1040 foreach my $component (keys %$rh_components) {
687 3887         15085 foreach my $ra_fromto (@$raa_rules) {
688              
689 8121         72731 my $regexp;
690             # do key specific replacement
691 8121 100       28288 if ($ra_fromto->[0] =~ m/^$component=/){
692 82         149 my $from = $ra_fromto->[0];
693 82         355 $from =~ s/^$component=//;
694 82 100       207 if ($rh_components->{$component} eq $from){
695 5         15 $rh_components->{$component} = $ra_fromto->[1];
696             } else {
697 77         133 $regexp = $from;
698             }
699             } else {
700 8039         10529 $regexp = $ra_fromto->[0];
701             }
702 8121 100       12160 if (defined($regexp)){
703             try {
704 8116     8116   203943 my $re = qr/$regexp/;
705 8115         31220 $rh_components->{$component} =~ s/$re/$ra_fromto->[1]/;
706             } catch {
707 1     1   21 warn "invalid replacement: " . join(', ', @$ra_fromto);
708 8116         30423 };
709             }
710             }
711             }
712 452         2246 return $rh_components;
713             }
714              
715             sub _abbreviate {
716 7     7   658 my $self = shift;
717 7   50     13 my $rh_comp = shift // return;
718              
719             # do we know the country?
720 7 100       17 if (!defined($rh_comp->{country_code})) {
721 2 100       13 if ($show_warnings){
722 1         2 my $error_msg = 'no country_code, unable to abbreviate';
723 1 50       3 if (defined($rh_comp->{country})) {
724 1         6 $error_msg .= ' - country: ' . $rh_comp->{country};
725             }
726 1         14 warn $error_msg
727             }
728 2         90 return;
729             }
730              
731             # do we have abbreviations for this country?
732 5         8 my $cc = uc($rh_comp->{country_code});
733              
734             # 1. which languages?
735 5 50       10 if (defined($self->{country2lang}{$cc})) {
736              
737 5         15 my @langs = split(/,/, $self->{country2lang}{$cc});
738              
739 5         7 foreach my $lang (@langs) {
740             # do we have abbrv for this lang?
741 9 100       21 if (defined($self->{abbreviations}->{$lang})) {
742              
743 6         7 my $rh_abbr = $self->{abbreviations}->{$lang};
744 6         14 foreach my $comp_name (keys %$rh_abbr) {
745 10 50       20 next if (!defined($rh_comp->{$comp_name}));
746 10         11 foreach my $long (keys %{$rh_abbr->{$comp_name}}) {
  10         26  
747 62         96 my $short = $rh_abbr->{$comp_name}->{$long};
748 62         468 $rh_comp->{$comp_name} =~ s/\b$long\b/$short/;
749             }
750             }
751             } else {
752             #warn "no abbreviations defined for lang $lang";
753             }
754             }
755             }
756              
757 5         12 return $rh_comp;
758             }
759              
760             # " abc,,def , ghi " => 'abc, def, ghi'
761             sub _clean {
762 904     904   2711 my $self = shift;
763 904   100     1652 my $out = shift // return;
764 903 50       1443 if ($debug){
765 0         0 say STDERR "entering _clean \n$out";
766             }
767              
768 903         1604 $out =~ s/\&#39\;/'/g;
769              
770 903         5728 $out =~ s/[\},\s]+$//;
771 903         2094 $out =~ s/^[,\s]+//;
772              
773 903         1297 $out =~ s/^- //; # line starting with dash due to a parameter missing
774              
775 903         1425 $out =~ s/,\s*,/, /g; # multiple commas to one
776 903         1708 $out =~ s/\h+,\h+/, /g; # one horiz whitespace behind comma
777 903         2571 $out =~ s/\h\h+/ /g; # multiple horiz whitespace to one
778 903         2255 $out =~ s/\h\n/\n/g; # horiz whitespace, newline to newline
779 903         1393 $out =~ s/\n,/\n/g; # newline comma to just newline
780 903         1295 $out =~ s/,,+/,/g; # multiple commas to one
781 903         1245 $out =~ s/,\n/\n/g; # comma newline to just newline
782 903         2303 $out =~ s/\n\h+/\n/g; # newline plus space to newline
783 903         2034 $out =~ s/\n\n+/\n/g; # multiple newline to one
784              
785             # final dedupe across and within lines
786 903         2718 my @before_pieces = split(/\n/, $out);
787 903         1298 my %seen_lines;
788             my @after_pieces;
789 903         1266 foreach my $line (@before_pieces) {
790 3493         5420 $line =~ s/^\h+//g;
791 3493         5126 $line =~ s/\h+$//g;
792 3493         6973 $seen_lines{$line}++;
793 3493 100       6017 next if ($seen_lines{$line} > 1);
794             # now dedupe within the line
795 3475         5776 my @before_words = split(/,/, $line);
796 3475         4087 my %seen_words;
797             my @after_words;
798 3475         4063 foreach my $w (@before_words) {
799 3675         5354 $w =~ s/^\h+//g;
800 3675         5300 $w =~ s/\h+$//g;
801 1 100   1   9 if (lc($w) ne 'new york') {
  1         2  
  1         15  
  3675         7570  
802 3665         32561 $seen_words{$w}++;
803             }
804 3675 100 100     10589 next if ((defined($seen_words{$w})) && ($seen_words{$w} > 1));
805 3674         6027 push(@after_words, $w);
806             }
807 3475         5179 $line = join(', ', @after_words);
808 3475         6867 push(@after_pieces, $line);
809             }
810 903         1628 $out = join("\n", @after_pieces);
811              
812 903         1607 $out =~ s/^\s+//; # remove leading whitespace
813 903         2413 $out =~ s/\s+$//; # remove end whitespace
814              
815 903         1299 $out .= "\n"; # add final newline
816 903         2658 return $out; # we are done
817             }
818              
819             sub _render_template {
820 450     450   2556 my $self = shift;
821 450         534 my $thtemplate = shift;
822 450         482 my $components = shift;
823              
824             # Mustache calls it context
825 450         7810 my $context = clone($components);
826 450 50       1140 say STDERR 'context: ' . Dumper $context if ($debug);
827 450         1115 my $output = $thtemplate->render($context);
828              
829 450         433476 $output = $self->_evaluate_template_lamdas($output);
830              
831 450 50       1720 say STDERR "in _render before _clean: $output" if ($debug);
832 450         849 $output = $self->_clean($output);
833              
834             # is it empty?
835             # if yes and there is only one component then just use that one
836 450 100       1396 if ($output !~ m/\w/) {
837 2         6 my @comps = sort keys %$components;
838 2 50       6 if (scalar(@comps) == 1) {
839 0         0 foreach my $k (@comps) {
840 0         0 $output = $components->{$k};
841             }
842             } # FIXME what if more than one?
843             }
844 450         1666 return $output;
845             }
846              
847             # Text::Hogan apparently caches lambdas when rendering templates. In the past
848             # we needed our lambda 'first', example
849             # {{#first}} {{{city}}} || {{{town}}} {{/first}}
850             # to evaluate the componentes. Whenever the lambda was called with different
851             # component values it consumed memory. Now replace with a simpler implementation
852             #
853             sub _replace_template_lambdas {
854 450     450   1060 my $self = shift;
855 450         740 my $template_text = shift;
856 450         7222 $template_text =~ s!\Q{{#first}}\E(.+?)\Q{{/first}}\E!FIRSTSTART${1}FIRSTEND!g;
857 450         1442 return $template_text;
858             }
859              
860             # We only use a lambda named 'first'
861             sub _evaluate_template_lamdas {
862 450     450   698 my $self = shift;
863 450         629 my $text = shift;
864 450         4740 $text =~ s!FIRSTSTART\s*(.+?)\s*FIRSTEND!_select_first($1)!seg;
  766         1385  
865 450         1070 return $text;
866             }
867              
868             # '|| val1 || || val3' => 'val1'
869             sub _select_first {
870 766     766   1292 my $text = shift;
871 766         4512 my @a_parts = grep { length($_) } split(/\s*\|\|\s*/, $text);
  2553         4061  
872 766 100       4377 return scalar(@a_parts) ? $a_parts[0] : '';
873             }
874              
875             my %small_district = (
876             'br' => 1,
877             'cr' => 1,
878             'es' => 1,
879             'ni' => 1,
880             'py' => 1,
881             'ro' => 1,
882             'tg' => 1,
883             'tm' => 1,
884             'xk' => 1,
885             );
886              
887             # correct the alias for "district"
888             # in OSM some countries use district to mean "city_district"
889             # others to mean "state_district"
890             sub _set_district_alias {
891 445     445   555 my $self = shift;
892 445         568 my $cc = shift;
893              
894             # this may get called repeatedly
895             # no need to do the work again
896 445 50       731 if (defined($cc)){
897 445         533 my $ucc = uc($cc);
898 445 100       1131 return if (defined($self->{set_district_alias}{$ucc}));
899 224         568 $self->{set_district_alias}{$ucc} = 1;
900             }
901              
902 224         243 my $oldalias;
903 224 50       481 if (defined($small_district{$cc})){
904 0         0 $self->{component2type}{district} = 'neighbourhood';
905 0         0 $oldalias = 'state_district';
906              
907             # add to the neighbourhood alias list
908             # though of course we are just sticking it at the end
909 0         0 push(@{$self->{component_aliases}{'neighbourhood'}}, 'district');
  0         0  
910              
911             } else {
912             # set 'district' to be type 'state_district'
913 224         415 $self->{component2type}{district} = 'state_district';
914 224         266 $oldalias = 'neighbourhood';
915              
916             # add to the state_district alias list
917 224         252 push(@{$self->{component_aliases}{'state_district'}}, 'district');
  224         520  
918             }
919              
920             # remove from the old alias list
921 224         280 my @temp = grep { $_ ne 'district' } @{$self->{component_aliases}{$oldalias}};
  2862         4461  
  224         554  
922 224         638 $self->{component_aliases}{$oldalias} = \@temp;
923 224         418 return;
924             }
925              
926              
927             # returns []
928             sub _find_unknown_components {
929 447     447   1057 my $self = shift;
930 447         552 my $rh_components = shift;
931              
932 447         468 my %h_known = map { $_ => 1 } @{$self->{ordered_components}};
  27774         38675  
  447         925  
933 447         2422 my @a_unknown = grep { !exists($h_known{$_}) } keys %$rh_components;
  4036         6298  
934 447         2401 return \@a_unknown;
935             }
936              
937             1;
938              
939             __END__
940              
941             =pod
942              
943             =encoding UTF-8
944              
945             =head1 NAME
946              
947             Geo::Address::Formatter - take structured address data and format it according to the various global/country rules
948              
949             =head1 VERSION
950              
951             version 1.996
952              
953             =head1 SYNOPSIS
954              
955             #
956             # get the templates (or use your own)
957             # git clone git@github.com:OpenCageData/address-formatting.git
958             #
959             my $GAF = Geo::Address::Formatter->new( conf_path => '/path/to/templates' );
960              
961             my $components = { ... }
962             my $text = $GAF->format_address($components, { country => 'FR' } );
963             my $rh_final_components = $GAF->final_components();
964             #
965             # or if we want shorter output
966             #
967             my $short_text = $GAF->format_address($components, { country => 'FR', abbreviate => 1, });
968              
969             =head2 new
970              
971             my $GAF = Geo::Address::Formatter->new( conf_path => '/path/to/templates' );
972              
973             Returns one instance. The I<conf_path> is required.
974              
975             Optional parameters are:
976              
977             I<debug>: prints tons of debugging info for use in development.
978              
979             I<no_warnings>: turns off a few warnings if configuration is not optimal.
980              
981             I<only_address>: formatted will only contain known components (will not include POI names like). Note, can be overridden with optional param to format_address method.
982              
983             =head2 final_components
984              
985             my $rh_components = $GAF->final_components();
986              
987             returns a reference to a hash of the final components that are set at the
988             completion of B<format_address>. Warns if called before they have been set
989             (unless I<no_warnings> was set at object creation).
990              
991             =head2 format_address
992              
993             my $text = $GAF->format_address(\%components, \%options );
994              
995             Given a structures address (hashref) and options (hashref) returns a
996             formatted address.
997              
998             Possible options are:
999              
1000             'abbreviate', if supplied common abbreviations are applied
1001             to the resulting output.
1002              
1003             'address_template', a mustache format template to be used instead of the template
1004             defined in the configuration
1005              
1006             'country', which should be an uppercase ISO 3166-1:alpha-2 code
1007             e.g. 'GB' for Great Britain, 'DE' for Germany, etc.
1008             If ommited we try to find the country in the address components.
1009              
1010             'only_address', same as only_address global option but set at formatting level
1011              
1012             =head1 DESCRIPTION
1013              
1014             You have a structured postal address (hash) and need to convert it into a
1015             readable address based on the format of the address country.
1016              
1017             For example, you have:
1018              
1019             {
1020             house_number => 12,
1021             street => 'Avenue Road',
1022             postcode => 45678,
1023             city => 'Deville'
1024             }
1025              
1026             you need:
1027              
1028             Great Britain: 12 Avenue Road, Deville 45678
1029             France: 12 Avenue Road, 45678 Deville
1030             Germany: Avenue Road 12, 45678 Deville
1031             Latvia: Avenue Road 12, Deville, 45678
1032              
1033             It gets more complicated with 200+ countries and territories and dozens more
1034             address components to consider.
1035              
1036             This module comes with a minimal configuration to run tests. Instead of
1037             developing your own configuration please use (and contribute to)
1038             those in https://github.com/OpenCageData/address-formatting
1039             which includes test cases.
1040              
1041             Together we can address the world!
1042              
1043             =head1 AUTHOR
1044              
1045             Ed Freyfogle
1046              
1047             =head1 COPYRIGHT AND LICENSE
1048              
1049             This software is copyright (c) 2023 by Opencage GmbH.
1050              
1051             This is free software; you can redistribute it and/or modify it under
1052             the same terms as the Perl 5 programming language system itself.
1053              
1054             =cut