File Coverage

lib/Geo/Address/Formatter.pm
Criterion Covered Total %
statement 451 512 88.0
branch 187 238 78.5
condition 33 46 71.7
subroutine 41 46 89.1
pod 3 3 100.0
total 715 845 84.6


line stmt bran cond sub pod time code
1             package Geo::Address::Formatter;
2             $Geo::Address::Formatter::VERSION = '1.9982';
3             # ABSTRACT: take structured address data and format it according to the various global/country rules
4              
5 6     6   26179 use strict;
  6         11  
  6         162  
6 6     6   23 use warnings;
  6         8  
  6         136  
7 6     6   22 use feature qw(say);
  6         7  
  6         467  
8 6     6   1744 use Clone qw(clone);
  6         9615  
  6         281  
9 6     6   973 use Data::Dumper;
  6         10885  
  6         299  
10             $Data::Dumper::Sortkeys = 1;
11 6     6   33 use File::Basename qw(dirname);
  6         8  
  6         254  
12 6     6   1890 use Ref::Util qw(is_hashref);
  6         6413  
  6         343  
13 6     6   34 use Scalar::Util qw(looks_like_number);
  6         10  
  6         244  
14 6     6   2087 use Text::Hogan::Compiler;
  6         32633  
  6         203  
15 6     6   1887 use Try::Catch;
  6         3455  
  6         310  
16 6     6   1313 use YAML::XS qw(LoadFile);
  6         8993  
  6         243  
17 6     6   30 use utf8;
  6         20  
  6         80  
18              
19             my $THC = Text::Hogan::Compiler->new;
20              
21             # optional params
22             my $show_warnings = 1;
23             my $debug = 0;
24             my $only_address = 0;
25              
26              
27             sub new {
28 17     17 1 16799 my ($class, %params) = @_;
29              
30 17         36 my $self = {};
31 17   50     60 my $conf_path = $params{conf_path} || die "no conf_path set";
32              
33             # optional params
34 17 50 66     51 if ( defined($params{no_warnings}) && ($params{no_warnings})){
35 1         2 $show_warnings = 0;
36             }
37 17   66     67 $only_address = (defined($params{only_address}) && $params{only_address}) // 0;
      50        
38 17   66     64 $debug = (defined($params{debug}) && $params{debug}) // 0;
      50        
39              
40 17         31 $self->{final_components} = undef;
41 17         28 $self->{set_district_alias} = {};
42              
43 17         25 bless($self, $class);
44              
45 17 50       47 say STDERR "************* in Geo::Address::Formatter::new ***" if ($debug);
46            
47 17 100       43 if ($self->_read_configuration($conf_path)){
48 16         83 return $self;
49             }
50 1         18 die 'unable to read configuration';
51             }
52              
53             sub _read_configuration {
54 17     17   25 my $self = shift;
55 17         21 my $path = shift;
56              
57 17 50       349 return if (! -e $path);
58             # components file
59 17         63 my $compyaml = $path . '/components.yaml';
60 17 100       247 return if (! -e $compyaml);
61              
62 16         78 $self->{templates} = {};
63 16         28 $self->{component_aliases} = {};
64 16         34 $self->{component2type} = {};
65 16         31 $self->{ordered_components} = [];
66              
67             # read the config file(s)
68 16         23 my $loaded = 0;
69              
70 16         36 my $wwfile = $path . '/countries/worldwide.yaml';
71 16 50       31 say STDERR "loading templates $wwfile" if ($debug);
72              
73 16 50       197 return if (! -e $wwfile);
74              
75             try {
76 16     16   344 my $rh_templates = LoadFile($wwfile);
77 16         28396 foreach (keys %$rh_templates) {
78 2950         3120 $self->{templates}{$_} = $rh_templates->{$_};
79 2950         2682 $loaded = 1;
80             }
81             } catch {
82 0     0   0 warn "error parsing country configuration in $wwfile";
83 0         0 return;
84 16         173 };
85 16 50       337 return if ($loaded == 0); # no templates
86              
87             # see if we can load the components
88             try {
89 16 50   16   307 say STDERR "loading components" if ($debug);
90 16         44 my @c = LoadFile($compyaml);
91              
92 16 50       3006 if ($debug){
93 0         0 say STDERR Dumper \@c;
94             }
95              
96 16         41 foreach my $rh_c (@c) {
97 244 50       306 if (defined($rh_c->{name})){
98 244 100       262 if (defined($rh_c->{aliases})){
99 148         251 $self->{component_aliases}{$rh_c->{name}} = $rh_c->{aliases};
100             } else {
101 96         209 $self->{component_aliases}{$rh_c->{name}} = [];
102             }
103             }
104             }
105              
106 16         28 foreach my $rh_c (@c) {
107 244         197 push(@{$self->{ordered_components}}, $rh_c->{name});
  244         330  
108 244         340 $self->{component2type}->{$rh_c->{name}} = $rh_c->{name};
109              
110 244 100       354 if (defined($rh_c->{aliases})) {
111 148         124 foreach my $alias (@{$rh_c->{aliases}}) {
  148         166  
112 510         393 push(@{$self->{ordered_components}}, $alias);
  510         528  
113 510         774 $self->{component2type}->{$alias} = $rh_c->{name};
114             }
115             }
116             }
117 16 50       103 if ($debug){
118 0         0 say STDERR 'component_aliases';
119 0         0 say STDERR Dumper $self->{component_aliases};
120 0         0 say STDERR 'ordered_components';
121 0         0 say STDERR Dumper $self->{ordered_components};
122 0         0 say STDERR 'component2type';
123 0         0 say STDERR Dumper $self->{component2type};
124             }
125             } catch {
126 0     0   0 warn "error parsing component configuration: $_";
127 16         109 };
128              
129             # get the county and state codes and country2lang conf
130 16         300 my @conf_files = qw(county_codes state_codes country2lang);
131 16         29 foreach my $cfile (@conf_files) {
132 48         74946 $self->{$cfile} = {};
133 48         122 my $yfile = $path . '/' . $cfile . '.yaml';
134 48 100       757 if (-e $yfile) {
135             try {
136 42     42   866 $self->{$cfile} = LoadFile($yfile);
137             } catch {
138 0     0   0 warn "error parsing $cfile configuration: $_";
139 42         384 };
140             }
141             }
142              
143 16         5555 my $abbrvdir = $path . '/abbreviations';
144              
145 16 100       227 if (-d $abbrvdir){
146 12 50       416 opendir my $dh, $abbrvdir
147             or die "Could not open '$abbrvdir' for read: $!\n";
148              
149 12         329 while (my $file = readdir $dh) {
150             # say STDERR "file: $file";
151 260 100       30296 if ($file =~ m/^(\w\w)\.yaml$/) {
152 236         499 my $lang = $1; # two letter lang code like 'en'
153 236         414 my $abbrvfile = $abbrvdir . '/' . $file;
154             try {
155 236     236   4838 $self->{abbreviations}->{$lang} = LoadFile($abbrvfile);
156             } catch {
157 0     0   0 warn "error parsing abbrv conf in $abbrvfile: $_";
158 236         1035 };
159             }
160             }
161 12         1618 closedir $dh;
162             }
163              
164             #say Dumper $self->{abbreviations};
165             #say Dumper $self->{country2lang};
166 16         97 return 1;
167             }
168              
169              
170             sub final_components {
171 6     6 1 1390 my $self = shift;
172 6 100       15 if (defined($self->{final_components})) {
173 5         13 return $self->{final_components};
174             }
175 1 50       15 warn 'final_components not yet set' if ($show_warnings);
176 1         64 return;
177             }
178              
179              
180             sub format_address {
181 455     455 1 359744 my $self = shift;
182 455   100     7077 my $rh_components = clone(shift) || return;
183 454   100     1674 my $rh_options = shift || {};
184              
185             # 1. make sure empty at the beginning
186 454         1398 $self->{final_components} = undef;
187              
188 454 50       788 if ($debug){
189 0         0 say STDERR "*** in format_address ***";
190 0         0 say STDERR Dumper $rh_options;
191 0         0 say STDERR Dumper $rh_components;
192             }
193              
194             # 2. deal with the options
195              
196             # 2a. which country format will we use?
197             # might have been specified in options
198             # otherwise look at components
199             my $cc = $rh_options->{country}
200 454   100     1227 || $self->_determine_country_code($rh_components)
201             || '';
202              
203 454 100       671 if ($cc) {
204 450         482 $rh_components->{country_code} = $cc;
205 450         798 $self->_set_district_alias($cc);
206             }
207              
208             # 2b. should we abbreviate?
209 454   100     1125 my $abbrv = $rh_options->{abbreviate} // 0;
210              
211             # 2c. was only_address set at the formatting level
212 454         488 my $oa = $only_address;
213 454 100       622 if (defined($rh_options->{only_address})){
214 2         4 $oa = $rh_options->{only_address};
215             }
216              
217 454 50       638 if ($debug){
218 0         0 say STDERR "component_aliases";
219 0         0 say STDERR Dumper $self->{component_aliases};
220             }
221              
222             # done with the options
223              
224             # 3. set the aliases, unless this would overwrite something
225             # need to do this in the right order (as defined in the components file)
226             # For example:
227             # both 'city_district' and 'suburb' are aliases of 'neighbourhood'
228             # so which one should we use if both are present?
229             # We should use the one defined first in the list
230              
231 454         369 my $rhh_p2a;
232 454         1343 foreach my $c (keys %$rh_components){
233              
234             # might not need an alias as it is a primary type
235 3574 100       4862 next if (defined($self->{component_aliases}{$c}));
236              
237             # it is not a primary type
238             # is there an alias?
239 734 100       1244 if (defined($self->{component2type}{$c})){
240 500         568 my $ptype = $self->{component2type}{$c};
241             # but is it already set?
242 500 100       796 if (! defined($rh_components->{$ptype}) ){
243             # no, we will set it later
244 396         808 $rhh_p2a->{$ptype}{$c} = 1;
245              
246             }
247             }
248             }
249              
250             # now we know which primary types have aliases
251 454         1036 foreach my $ptype (keys %$rhh_p2a){
252             # is there more than one?
253 357         365 my @aliases = keys %{$rhh_p2a->{$ptype}};
  357         742  
254 357 100       610 if (scalar @aliases == 1){
255 319         515 $rh_components->{$ptype} = $rh_components->{$aliases[0]};
256 319         486 next; # we are done with this ptype
257             }
258              
259             # if there is more than one we need to go through the list
260             # so we do them in the right order
261 38         56 foreach my $c (@{$self->{component_aliases}->{$ptype}}){
  38         79  
262 44 100       95 if (defined($rh_components->{$c})){
263 38         68 $rh_components->{$ptype} = $rh_components->{$c};
264 38         81 last; # we are done with this ptype
265             }
266             }
267             }
268              
269 454 50       675 if ($debug){
270 0         0 say STDERR "after component_aliases applied";
271 0         0 say STDERR Dumper $rh_components;
272             }
273              
274             # 4. deal wtih terrible inputs
275 454         1002 $self->_sanity_cleaning($rh_components);
276 454 50       574 if ($debug){
277 0         0 say STDERR "after sanity_cleaning applied";
278 0         0 say STDERR Dumper $rh_components;
279             }
280              
281             # 5. determine the template
282 454         475 my $template_text;
283 454   66     1013 my $rh_config = $self->{templates}{uc($cc)} || $self->{templates}{default};
284            
285 454 100       634 if (defined($rh_options->{address_template})) {
286 2         4 $template_text = $rh_options->{address_template};
287             }
288             else {
289              
290 452 100       651 if (defined($rh_config->{address_template})) {
    50          
291 450         629 $template_text = $rh_config->{address_template};
292             } elsif (defined($self->{templates}{default}{address_template})) {
293 2         3 $template_text = $self->{templates}{default}{address_template};
294             }
295            
296             # do we have the minimal components for an address?
297             # or should we instead use the fallback template?
298 452 100       829 if (!$self->_minimal_components($rh_components)) {
299 49 50       124 say STDERR "using fallback" if ($debug);
300 49 100       163 if (defined($rh_config->{fallback_template})) {
    100          
301 34         72 $template_text = $rh_config->{fallback_template};
302             } elsif (defined($self->{templates}{default}{fallback_template})) {
303 14         32 $template_text = $self->{templates}{default}{fallback_template};
304             }
305             # no fallback
306             }
307              
308             }
309              
310 454 50       639 say STDERR 'template text: ' . $template_text if ($debug);
311              
312             # 6. clean up the components, possibly add codes
313 454         813 $self->_fix_country($rh_components);
314 454 50       598 if ($debug){
315 0         0 say STDERR "after fix_country";
316 0         0 say STDERR Dumper $rh_components;
317             }
318              
319 454         1184 $self->_apply_replacements($rh_components, $rh_config->{replace});
320 454 50       764 if ($debug){
321 0         0 say STDERR "after applying_replacements applied";
322 0         0 say STDERR Dumper $rh_components;
323             }
324 454         883 $self->_add_state_code($rh_components);
325 454         834 $self->_add_county_code($rh_components);
326 454 50       643 if ($debug){
327 0         0 say STDERR "after adding codes";
328 0         0 say STDERR Dumper $rh_components;
329             }
330              
331             # 7. add the attention, if needed
332 454 50       667 if ($debug){
333 0         0 say STDERR "object level only_address: $only_address";
334 0         0 say STDERR "formatting level only_address: $oa";
335             }
336              
337 454 100       606 if ($oa){
338 3 50       8 if ($debug){
339 0         0 say STDERR "not looking for unknown_components";
340 0         0 say STDERR "only_address was specified";
341             }
342             }
343             else {
344 451         661 my $ra_unknown = $self->_find_unknown_components($rh_components);
345 451 50       806 if ($debug){
346 0         0 say STDERR "unknown_components:";
347 0         0 say STDERR Dumper $ra_unknown;
348             }
349 451 100       739 if (scalar(@$ra_unknown)){
350             $rh_components->{attention} =
351 224         307 join(', ', map { $rh_components->{$_} } @$ra_unknown);
  230         649  
352 224 50       512 if ($debug){
353 0         0 say STDERR "putting unknown_components in 'attention'";
354             }
355             }
356             }
357              
358             # 8. abbreviate, if needed
359 454 100       690 if ($abbrv) {
360 5         13 $rh_components = $self->_abbreviate($rh_components);
361             }
362              
363             # 9. prepare the template
364 454         842 $template_text = $self->_replace_template_lambdas($template_text);
365              
366             # 10. compiled the template
367 454         1664 my $compiled_template =
368             $THC->compile($template_text, {'numeric_string_as_string' => 1});
369              
370 454 50       579878 if ($debug){
371 0         0 say STDERR "before _render_template";
372 0         0 say STDERR Dumper $rh_components;
373 0         0 say STDERR "template: ";
374 0         0 say STDERR Dumper $compiled_template;
375             }
376              
377             # 11. render the template
378 454         841 my $text = $self->_render_template($compiled_template, $rh_components);
379 454 50       699 if ($debug){
380 0         0 say STDERR "text after _render_template $text";
381             }
382              
383             # 11. postformatting
384 454         1163 $text = $self->_postformat($text, $rh_config->{postformat_replace});
385              
386             # 12. clean again
387 454         848 $text = $self->_clean($text);
388              
389             # 13. set final components
390 454         622 $self->{final_components} = $rh_components;
391              
392             # all done
393 454         1502 return $text;
394             }
395              
396             # remove duplicates ("Berlin, Berlin"), do replacements and similar
397             sub _postformat {
398 456     456   1391 my $self = shift;
399 456         437 my $text = shift;
400 456         603 my $raa_rules = shift;
401              
402 456 50       625 if ($debug){
403 0         0 say STDERR "entering _postformat: $text"
404             }
405              
406             # remove duplicates
407 456         911 my @before_pieces = split(/, /, $text);
408 456         490 my %seen;
409             my @after_pieces;
410 456         591 foreach my $piece (@before_pieces) {
411 565         812 $piece =~ s/^\s+//g;
412 565         996 $seen{$piece}++;
413 565 100       1114 if (lc($piece) ne 'new york') {
414 561 100       869 next if ($seen{$piece} > 1);
415             }
416 564         758 push(@after_pieces, $piece);
417             }
418 456         702 $text = join(', ', @after_pieces);
419              
420             # do any country specific rules
421 456         686 foreach my $ra_fromto (@$raa_rules) {
422             try {
423 512     512   12136 my $regexp = qr/$ra_fromto->[0]/;
424 512         846 my $replacement = $ra_fromto->[1];
425              
426             # ultra hack to do substitution
427             # limited to $1 and $2, should really be a while loop
428             # doing every substitution
429              
430 512 100       991 if ($replacement =~ m/\$\d/) {
431 46 100       289 if ($text =~ m/$regexp/) {
432 19         41 my $tmp1 = $1;
433 19         31 my $tmp2 = $2;
434 19         28 my $tmp3 = $3;
435 19         56 $replacement =~ s/\$1/$tmp1/;
436 19         63 $replacement =~ s/\$2/$tmp2/;
437 19         51 $replacement =~ s/\$3/$tmp3/;
438             }
439             }
440 512         2129 $text =~ s/$regexp/$replacement/;
441             } catch {
442 0     0   0 warn "invalid replacement: " . join(', ', @$ra_fromto);
443 512         5622 };
444             }
445 456         2387 return $text;
446             }
447              
448             sub _sanity_cleaning {
449 455     455   1408 my $self = shift;
450 455   50     643 my $rh_components = shift || return;
451              
452             # catch insane postcodes
453 455 100       718 if (defined($rh_components->{'postcode'})) {
454 313 100       1296 if (length($rh_components->{'postcode'}) > 20) {
    100          
    100          
455 1         3 delete $rh_components->{'postcode'};
456             } elsif ($rh_components->{'postcode'} =~ m/\d+;\d+/) {
457             # sometimes OSM has postcode ranges
458 1         2 delete $rh_components->{'postcode'};
459             } elsif ($rh_components->{'postcode'} =~ m/^(\d{5}),\d{5}/) {
460 1         4 $rh_components->{'postcode'} = $1;
461             }
462             }
463              
464             # remove things that might be empty
465 455         920 foreach my $c (keys %$rh_components) {
466             # catch empty values
467 3931 50       10327 if (!defined($rh_components->{$c})) {
    100          
    100          
468 0         0 delete $rh_components->{$c};
469             }
470             # no chars
471             elsif ($rh_components->{$c} !~ m/\w/) {
472 1         3 delete $rh_components->{$c};
473             }
474             # catch values containing URLs
475             elsif ($rh_components->{$c} =~ m|https?://|) {
476 1         2 delete $rh_components->{$c};
477             }
478             }
479 455         671 return;
480             }
481              
482             sub _minimal_components {
483 452     452   505 my $self = shift;
484 452   50     650 my $rh_components = shift || return;
485 452         705 my @required_components = qw(road postcode); #FIXME - should be in conf
486 452         425 my $missing = 0; # number of required components missing
487              
488 452         408 my $minimal_threshold = 2;
489 452         516 foreach my $c (@required_components) {
490 904 100       1413 $missing++ if (!defined($rh_components->{$c}));
491 904 100       1381 return 0 if ($missing == $minimal_threshold);
492             }
493 403         790 return 1;
494             }
495              
496             my %valid_replacement_components = ('state' => 1,);
497              
498             # determines which country code to use
499             # may also override other configuration if we are dealing with
500             # a dependent territory
501             sub _determine_country_code {
502 455     455   511 my $self = shift;
503 455   50     694 my $rh_components = shift || return;
504              
505             # FIXME - validate it is a valid country
506 455 100       769 return if (!defined($rh_components->{country_code}));
507              
508 451 50       990 if (my $cc = lc($rh_components->{country_code})) {
509              
510             # is it two letters long?
511 451 50       1651 return if ($cc !~ m/^[a-z][a-z]$/);
512 451 50       827 return 'GB' if ($cc eq 'uk');
513              
514 451         728 $cc = uc($cc);
515              
516             # check if the configuration tells us to use
517             # the configuration of another country
518             # used in cases of dependent territories like
519             # American Samoa (AS) and Puerto Rico (PR)
520 451 100 100     1868 if ( defined($self->{templates}{$cc})
521             && defined($self->{templates}{$cc}{use_country}))
522             {
523 49         73 my $old_cc = $cc;
524 49         75 $cc = $self->{templates}{$cc}{use_country};
525 49 100       137 if (defined($self->{templates}{$old_cc}{change_country})) {
526              
527 36         80 my $new_country = $self->{templates}{$old_cc}{change_country};
528 36 100       108 if ($new_country =~ m/\$(\w*)/) {
529 2         5 my $component = $1;
530 2 50       7 if (defined($rh_components->{$component})) {
531 2         24 $new_country =~ s/\$$component/$rh_components->{$component}/;
532             } else {
533 0         0 $new_country =~ s/\$$component//;
534             }
535             }
536 36         83 $rh_components->{country} = $new_country;
537             }
538 49 100       130 if (defined($self->{templates}{$old_cc}{add_component})) {
539 12         29 my $tmp = $self->{templates}{$old_cc}{add_component};
540 12         44 my ($k, $v) = split(/=/, $tmp);
541             # check whitelist of valid replacement components
542 12 100       39 if (defined($valid_replacement_components{$k})) {
543 11         26 $rh_components->{$k} = $v;
544             }
545             }
546             }
547              
548 451 100       741 if ($cc eq 'NL') {
549 5 50       12 if (defined($rh_components->{state})) {
550 5 100       27 if ($rh_components->{state} eq 'Curaçao') {
    50          
    100          
551 1         2 $cc = 'CW';
552 1         4 $rh_components->{country} = 'Curaçao';
553             } elsif ($rh_components->{state} =~ m/^sint maarten/i) {
554 0         0 $cc = 'SX';
555 0         0 $rh_components->{country} = 'Sint Maarten';
556             } elsif ($rh_components->{state} =~ m/^Aruba/i) {
557 1         2 $cc = 'AW';
558 1         3 $rh_components->{country} = 'Aruba';
559             }
560             }
561             }
562 451         1443 return $cc;
563             }
564 0         0 return;
565             }
566              
567             # hacks for bad country data
568             sub _fix_country {
569 454     454   631 my $self = shift;
570 454   50     735 my $rh_components = shift || return;
571              
572             # is the country a number?
573             # if so, and there is a state, use state as country
574 454 100       692 if (defined($rh_components->{country})) {
575 451 100       1422 if (looks_like_number($rh_components->{country})) {
576 1 50       3 if (defined($rh_components->{state})) {
577 1         3 $rh_components->{country} = $rh_components->{state};
578 1         2 delete $rh_components->{state};
579             }
580             }
581             }
582 454         500 return;
583             }
584              
585             # sets and returns a state code
586             # note may also set other values in some odd edge cases
587             sub _add_state_code {
588 459     459   1064 my $self = shift;
589 459         416 my $rh_components = shift;
590 459         777 return $self->_add_code('state', $rh_components);
591             }
592              
593             sub _add_county_code {
594 455     455   1017 my $self = shift;
595 455         427 my $rh_components = shift;
596 455         599 return $self->_add_code('county', $rh_components);
597             }
598              
599             sub _add_code {
600 914     914   798 my $self = shift;
601 914   50     1277 my $keyname = shift // return;
602 914         772 my $rh_components = shift;
603 914 100       1293 return if !$rh_components->{country_code}; # do we know country?
604 905 100       1382 return if !$rh_components->{$keyname}; # do we know state/county?
605              
606 566         815 my $code = $keyname . '_code';
607              
608 566 100       930 if (defined($rh_components->{$code})) { # do we already have code?
609             # but could have situation
610             # where code and long name are
611             # the same which we want to correct
612 26 100       68 if ($rh_components->{$code} ne $rh_components->{$keyname}) {
613 25         39 return;
614             }
615             }
616              
617             # ensure country_code is uppercase as we use it as conf key
618 541         690 $rh_components->{country_code} = uc($rh_components->{country_code});
619 541         532 my $cc = $rh_components->{country_code};
620              
621 541 100       1354 if (my $mapping = $self->{$code . 's'}{$cc}) {
622              
623 283         354 my $name = $rh_components->{$keyname};
624 283         404 my $uc_name = uc($name);
625              
626 283         8359 LOCCODE: foreach my $abbrv (keys %$mapping) {
627              
628 6671         4976 my @confnames; # can have multiple names for the place
629             # for example in different languages
630              
631 6671 100       6941 if (is_hashref($mapping->{$abbrv})) {
632 839         683 push(@confnames, values %{$mapping->{$abbrv}});
  839         1493  
633             } else {
634 5832         6005 push(@confnames, $mapping->{$abbrv});
635             }
636              
637 6671         6012 foreach my $confname (@confnames) {
638 7587 100       10084 if ($uc_name eq uc($confname)) {
639 180         289 $rh_components->{$code} = $abbrv;
640 180         342 last LOCCODE;
641             }
642             # perhaps instead of passing in a name, we passed in a code
643             # example: state => 'NC'
644             # we want to turn that into
645             # state => 'North Carolina'
646             # state_code => 'NC'
647             #
648 7407 100       9859 if ($uc_name eq $abbrv) {
649 6         9 $rh_components->{$keyname} = $confname;
650 6         10 $rh_components->{$code} = $abbrv;
651 6         12 last LOCCODE;
652             }
653             }
654             }
655             # didn't find a valid code or name
656              
657             # try again for odd variants like "United States Virgin Islands"
658 283 100       768 if ($keyname eq 'state') {
659 254 100       492 if (!defined($rh_components->{state_code})) {
660 81 100       144 if ($cc eq 'US') {
661 2 50       45 if ($rh_components->{state} =~ m/^united states/i) {
662 0         0 my $state = $rh_components->{state};
663 0         0 $state =~ s/^United States/US/i;
664 0         0 foreach my $k (keys %$mapping) {
665 0 0       0 if (uc($state) eq uc($k)) {
666 0         0 $rh_components->{state_code} = $mapping->{$k};
667 0         0 last;
668             }
669             }
670             }
671 2 50       11 if ($rh_components->{state} =~ m/^washington,? d\.?c\.?/i) {
672 2         3 $rh_components->{state_code} = 'DC';
673 2         3 $rh_components->{state} = 'District of Columbia';
674 2         24 $rh_components->{city} = 'Washington';
675             }
676             }
677             }
678             }
679             }
680 541         760 return $rh_components->{$code};
681             }
682              
683             sub _apply_replacements {
684 457     457   1601 my $self = shift;
685 457         388 my $rh_components = shift;
686 457         575 my $raa_rules = shift;
687              
688 457 50       672 if ($debug){
689 0         0 say STDERR "in _apply_replacements";
690 0         0 say STDERR Dumper $raa_rules;
691             }
692              
693 457         964 foreach my $component (keys %$rh_components) {
694 3930         12871 foreach my $ra_fromto (@$raa_rules) {
695              
696 8010         58255 my $regexp;
697             # do key specific replacement
698 8010 100       23021 if ($ra_fromto->[0] =~ m/^$component=/){
699 82         131 my $from = $ra_fromto->[0];
700 82         316 $from =~ s/^$component=//;
701 82 100       164 if ($rh_components->{$component} eq $from){
702 5         12 $rh_components->{$component} = $ra_fromto->[1];
703             } else {
704 77         111 $regexp = $from;
705             }
706             } else {
707 7928         8399 $regexp = $ra_fromto->[0];
708             }
709 8010 100       10365 if (defined($regexp)){
710             try {
711 8005     8005   165819 my $re = qr/$regexp/;
712 8004         26689 $rh_components->{$component} =~ s/$re/$ra_fromto->[1]/;
713             } catch {
714 1     1   22 warn "invalid replacement: " . join(', ', @$ra_fromto);
715 8005         25980 };
716             }
717             }
718             }
719 457         1939 return $rh_components;
720             }
721              
722             sub _abbreviate {
723 7     7   566 my $self = shift;
724 7   50     11 my $rh_comp = shift // return;
725              
726             # do we know the country?
727 7 100       15 if (!defined($rh_comp->{country_code})) {
728 2 100       5 if ($show_warnings){
729 1         2 my $error_msg = 'no country_code, unable to abbreviate';
730 1 50       3 if (defined($rh_comp->{country})) {
731 1         3 $error_msg .= ' - country: ' . $rh_comp->{country};
732             }
733 1         13 warn $error_msg
734             }
735 2         66 return;
736             }
737              
738             # do we have abbreviations for this country?
739 5         6 my $cc = uc($rh_comp->{country_code});
740              
741             # 1. which languages?
742 5 50       11 if (defined($self->{country2lang}{$cc})) {
743              
744 5         12 my @langs = split(/,/, $self->{country2lang}{$cc});
745              
746 5         7 foreach my $lang (@langs) {
747             # do we have abbrv for this lang?
748 9 100       21 if (defined($self->{abbreviations}->{$lang})) {
749              
750 6         6 my $rh_abbr = $self->{abbreviations}->{$lang};
751 6         11 foreach my $comp_name (keys %$rh_abbr) {
752 10 50       18 next if (!defined($rh_comp->{$comp_name}));
753 10         8 foreach my $long (keys %{$rh_abbr->{$comp_name}}) {
  10         20  
754 62         78 my $short = $rh_abbr->{$comp_name}->{$long};
755 62         357 $rh_comp->{$comp_name} =~ s/\b$long\b/$short/;
756             }
757             }
758             } else {
759             #warn "no abbreviations defined for lang $lang";
760             }
761             }
762             }
763              
764 5         9 return $rh_comp;
765             }
766              
767             # " abc,,def , ghi " => 'abc, def, ghi'
768             sub _clean {
769 914     914   2768 my $self = shift;
770 914   100     1357 my $out = shift // return;
771 913 50       1118 if ($debug){
772 0         0 say STDERR "entering _clean \n$out";
773             }
774              
775 913         1366 $out =~ s/\&#39\;/'/g;
776              
777 913         4829 $out =~ s/[\},\s]+$//;
778 913         1908 $out =~ s/^[,\s]+//;
779              
780 913         1069 $out =~ s/^- //; # line starting with dash due to a parameter missing
781              
782 913         1113 $out =~ s/,\s*,/, /g; # multiple commas to one
783 913         1338 $out =~ s/\h+,\h+/, /g; # one horiz whitespace behind comma
784 913         1998 $out =~ s/\h\h+/ /g; # multiple horiz whitespace to one
785 913         1841 $out =~ s/\h\n/\n/g; # horiz whitespace, newline to newline
786 913         1129 $out =~ s/\n,/\n/g; # newline comma to just newline
787 913         1014 $out =~ s/,,+/,/g; # multiple commas to one
788 913         1103 $out =~ s/,\n/\n/g; # comma newline to just newline
789 913         1971 $out =~ s/\n\h+/\n/g; # newline plus space to newline
790 913         2031 $out =~ s/\n\n+/\n/g; # multiple newline to one
791              
792             # final dedupe across and within lines
793 913         2288 my @before_pieces = split(/\n/, $out);
794 913         1072 my %seen_lines;
795             my @after_pieces;
796 913         1134 foreach my $line (@before_pieces) {
797 3529         4290 $line =~ s/^\h+//g;
798 3529         4261 $line =~ s/\h+$//g;
799 3529         5301 $seen_lines{$line}++;
800 3529 100       4874 next if ($seen_lines{$line} > 1);
801             # now dedupe within the line
802 3511         4994 my @before_words = split(/,/, $line);
803 3511         3141 my %seen_words;
804             my @after_words;
805 3511         3389 foreach my $w (@before_words) {
806 3718         4699 $w =~ s/^\h+//g;
807 3718         4152 $w =~ s/\h+$//g;
808 1 100   1   7 if (lc($w) ne 'new york') {
  1         1  
  1         11  
  3718         5974  
809 3708         25187 $seen_words{$w}++;
810             }
811 3718 100 100     8902 next if ((defined($seen_words{$w})) && ($seen_words{$w} > 1));
812 3716         5053 push(@after_words, $w);
813             }
814 3511         4590 $line = join(', ', @after_words);
815 3511         5733 push(@after_pieces, $line);
816             }
817 913         1355 $out = join("\n", @after_pieces);
818              
819 913         1373 $out =~ s/^\s+//; # remove leading whitespace
820 913         1969 $out =~ s/\s+$//; # remove end whitespace
821              
822 913         1130 $out .= "\n"; # add final newline
823 913         2237 return $out; # we are done
824             }
825              
826             sub _render_template {
827 455     455   2017 my $self = shift;
828 455         484 my $thtemplate = shift;
829 455         409 my $components = shift;
830              
831             # Mustache calls it context
832 455         6884 my $context = clone($components);
833 455 50       893 say STDERR 'context: ' . Dumper $context if ($debug);
834 455         1051 my $output = $thtemplate->render($context);
835              
836 455         370415 $output = $self->_evaluate_template_lamdas($output);
837              
838 455 50       839 say STDERR "in _render before _clean: $output" if ($debug);
839 455         743 $output = $self->_clean($output);
840              
841             # is it empty?
842             # if yes and there is only one component then just use that one
843 455 100       1195 if ($output !~ m/\w/) {
844 2         7 my @comps = sort keys %$components;
845 2 50       5 if (scalar(@comps) == 1) {
846 0         0 foreach my $k (@comps) {
847 0         0 $output = $components->{$k};
848             }
849             } # FIXME what if more than one?
850             }
851 455         1436 return $output;
852             }
853              
854             # Text::Hogan apparently caches lambdas when rendering templates. In the past
855             # we needed our lambda 'first', example
856             # {{#first}} {{{city}}} || {{{town}}} {{/first}}
857             # to evaluate the componentes. Whenever the lambda was called with different
858             # component values it consumed memory. Now replace with a simpler implementation
859             #
860             sub _replace_template_lambdas {
861 455     455   991 my $self = shift;
862 455         592 my $template_text = shift;
863 455         6076 $template_text =~ s!\Q{{#first}}\E(.+?)\Q{{/first}}\E!FIRSTSTART${1}FIRSTEND!g;
864 455         1085 return $template_text;
865             }
866              
867             # We only use a lambda named 'first'
868             sub _evaluate_template_lamdas {
869 455     455   540 my $self = shift;
870 455         486 my $text = shift;
871 455         3643 $text =~ s!FIRSTSTART\s*(.+?)\s*FIRSTEND!_select_first($1)!seg;
  783         1227  
872 455         955 return $text;
873             }
874              
875             # '|| val1 || || val3' => 'val1'
876             sub _select_first {
877 783     783   1096 my $text = shift;
878 783         3762 my @a_parts = grep { length($_) } split(/\s*\|\|\s*/, $text);
  2591         3377  
879 783 100       3795 return scalar(@a_parts) ? $a_parts[0] : '';
880             }
881              
882             my %small_district = (
883             'br' => 1,
884             'cr' => 1,
885             'es' => 1,
886             'ni' => 1,
887             'py' => 1,
888             'ro' => 1,
889             'tg' => 1,
890             'tm' => 1,
891             'xk' => 1,
892             );
893              
894             # correct the alias for "district"
895             # in OSM some countries use district to mean "city_district"
896             # others to mean "state_district"
897             sub _set_district_alias {
898 450     450   456 my $self = shift;
899 450         431 my $cc = shift;
900              
901             # this may get called repeatedly
902             # no need to do the work again
903 450 50       764 if (defined($cc)){
904 450         493 my $ucc = uc($cc);
905 450 100       981 return if (defined($self->{set_district_alias}{$ucc}));
906 224         416 $self->{set_district_alias}{$ucc} = 1;
907             }
908              
909 224         217 my $oldalias;
910 224 50       425 if (defined($small_district{$cc})){
911 0         0 $self->{component2type}{district} = 'neighbourhood';
912 0         0 $oldalias = 'state_district';
913              
914             # add to the neighbourhood alias list
915             # though of course we are just sticking it at the end
916 0         0 push(@{$self->{component_aliases}{'neighbourhood'}}, 'district');
  0         0  
917              
918             } else {
919             # set 'district' to be type 'state_district'
920 224         362 $self->{component2type}{district} = 'state_district';
921 224         232 $oldalias = 'neighbourhood';
922              
923             # add to the state_district alias list
924 224         236 push(@{$self->{component_aliases}{'state_district'}}, 'district');
  224         534  
925             }
926              
927             # remove from the old alias list
928 224         255 my @temp = grep { $_ ne 'district' } @{$self->{component_aliases}{$oldalias}};
  2862         3507  
  224         476  
929 224         545 $self->{component_aliases}{$oldalias} = \@temp;
930 224         371 return;
931             }
932              
933              
934             # returns []
935             sub _find_unknown_components {
936 452     452   949 my $self = shift;
937 452         366 my $rh_components = shift;
938              
939 452         368 my %h_known = map { $_ => 1 } @{$self->{ordered_components}};
  29861         34624  
  452         790  
940 452         3336 my @a_unknown = grep { !exists($h_known{$_}) } sort keys %$rh_components;
  4079         5089  
941 452         2241 return \@a_unknown;
942             }
943              
944             1;
945              
946             __END__
947              
948             =pod
949              
950             =encoding UTF-8
951              
952             =head1 NAME
953              
954             Geo::Address::Formatter - take structured address data and format it according to the various global/country rules
955              
956             =head1 VERSION
957              
958             version 1.9982
959              
960             =head1 SYNOPSIS
961              
962             #
963             # get the templates (or use your own)
964             # git clone git@github.com:OpenCageData/address-formatting.git
965             #
966             my $GAF = Geo::Address::Formatter->new( conf_path => '/path/to/templates' );
967              
968             my $components = { ... }
969             my $text = $GAF->format_address($components, { country => 'FR' } );
970             my $rh_final_components = $GAF->final_components();
971             #
972             # or if we want shorter output
973             #
974             my $short_text = $GAF->format_address($components, { country => 'FR', abbreviate => 1, });
975              
976             =head2 new
977              
978             my $GAF = Geo::Address::Formatter->new( conf_path => '/path/to/templates' );
979              
980             Returns one instance. The I<conf_path> is required.
981              
982             Optional parameters are:
983              
984             I<debug>: prints tons of debugging info for use in development.
985              
986             I<no_warnings>: turns off a few warnings if configuration is not optimal.
987              
988             I<only_address>: formatted will only contain known components (will not include POI names). Note, can be overridden with optional param to format_address method.
989              
990             =head2 final_components
991              
992             my $rh_components = $GAF->final_components();
993              
994             returns a reference to a hash of the final components that are set at the
995             completion of B<format_address>. Warns if called before they have been set
996             (unless I<no_warnings> was set at object creation).
997              
998             =head2 format_address
999              
1000             my $text = $GAF->format_address(\%components, \%options );
1001              
1002             Given a structures address (hashref) and options (hashref) returns a
1003             formatted address.
1004              
1005             Possible options are:
1006              
1007             'abbreviate', if supplied common abbreviations are applied
1008             to the resulting output.
1009              
1010             'address_template', a mustache format template to be used instead of the template
1011             defined in the configuration
1012              
1013             'country', which should be an uppercase ISO 3166-1:alpha-2 code
1014             e.g. 'GB' for Great Britain, 'DE' for Germany, etc.
1015             If ommited we try to find the country in the address components.
1016              
1017             'only_address', same as only_address global option but set at formatting level
1018              
1019             =head1 DESCRIPTION
1020              
1021             You have a structured postal address (hash) and need to convert it into a
1022             readable address based on the format of the address country.
1023              
1024             For example, you have:
1025              
1026             {
1027             house_number => 12,
1028             street => 'Avenue Road',
1029             postcode => 45678,
1030             city => 'Deville'
1031             }
1032              
1033             you need:
1034              
1035             Great Britain: 12 Avenue Road, Deville 45678
1036             France: 12 Avenue Road, 45678 Deville
1037             Germany: Avenue Road 12, 45678 Deville
1038             Latvia: Avenue Road 12, Deville, 45678
1039              
1040             It gets more complicated with 200+ countries and territories and dozens more
1041             address components to consider.
1042              
1043             This module comes with a minimal configuration to run tests. Instead of
1044             developing your own configuration please use (and contribute to)
1045             those in https://github.com/OpenCageData/address-formatting
1046             which includes test cases.
1047              
1048             Together we can address the world!
1049              
1050             =head1 AUTHOR
1051              
1052             Ed Freyfogle
1053              
1054             =head1 COPYRIGHT AND LICENSE
1055              
1056             This software is copyright (c) 2023 by Opencage GmbH.
1057              
1058             This is free software; you can redistribute it and/or modify it under
1059             the same terms as the Perl 5 programming language system itself.
1060              
1061             =cut