File Coverage

blib/lib/Business/CompanyDesignator.pm
Criterion Covered Total %
statement 192 195 98.4
branch 92 102 90.2
condition 59 70 84.2
subroutine 26 26 100.0
pod 7 7 100.0
total 376 400 94.0


line stmt bran cond sub pod time code
1             package Business::CompanyDesignator;
2              
3             # Require perl 5.010 because the 'track' functionality of Regexp::Assemble
4             # is unsafe for earlier versions.
5 9     9   657930 use 5.010001;
  9         111  
6 9     9   4651 use Moose;
  9         3675611  
  9         52  
7 9     9   58348 use utf8;
  9         55  
  9         57  
8 9     9   284 use warnings qw(FATAL utf8);
  9         19  
  9         377  
9 9     9   2531 use FindBin qw($Bin);
  9         4935  
  9         1064  
10 9     9   2054 use YAML;
  9         31124  
  9         478  
11 9     9   5112 use File::ShareDir qw(dist_file);
  9         167941  
  9         570  
12 9     9   67 use List::MoreUtils qw(uniq);
  9         23  
  9         37  
13 9     9   11508 use Regexp::Assemble;
  9         157207  
  9         316  
14 9     9   5066 use Unicode::Normalize;
  9         17381  
  9         632  
15 9     9   69 use Carp;
  9         18  
  9         513  
16              
17 9     9   4207 use Business::CompanyDesignator::Record;
  9         37  
  9         449  
18 9     9   4899 use Business::CompanyDesignator::SplitResult;
  9         28  
  9         9867  
19              
20             our $VERSION = '0.16';
21              
22             # Hardcode the set of languages that we treat as 'continuous' i.e. their
23             # designators don't require a word break before/after.
24             our %LANG_CONTINUA = map { $_ => 1 } qw(
25             zh
26             ja
27             ko
28             );
29              
30             has 'datafile' => ( is => 'ro', default => sub {
31             # Development/test version
32             my $local_datafile = "$Bin/../share/company_designator_dev.yml";
33             return $local_datafile if -f $local_datafile;
34             $local_datafile = "$Bin/../share/company_designator.yml";
35             return $local_datafile if -f $local_datafile;
36             # Installed version
37             return dist_file('Business-CompanyDesignator', 'company_designator.yml');
38             });
39              
40             # data is the raw dataset as loaded from datafile, keyed by long designator
41             has data => ( is => 'ro', lazy_build => 1 );
42              
43             # regex_cache is a cache of regexes by language and type, since they're expensive to build
44             has 'regex_cache' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
45              
46             # abbr_long_map is a hash mapping abbreviations (strings) back to an arrayref of
47             # long designators (since abbreviations are not necessarily unique)
48             has 'abbr_long_map' => ( is => 'ro', isa => 'HashRef', lazy_build => 1 );
49              
50             # pattern_string_map is a hash mapping patterns back to their source string,
51             # since we do things like add additional patterns without diacritics
52             has 'pattern_string_map' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
53             # pattern_string_map_lang is a hash of hashes, mapping language codes to hashes
54             # of patterns back to their source string
55             has 'pattern_string_map_lang' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
56              
57             sub _build_data {
58 8     8   57 my $self = shift;
59 8         256 YAML::LoadFile($self->datafile);
60             }
61              
62             sub _build_abbr_long_map {
63 5     5   31 my $self = shift;
64 5         14 my $map = {};
65 5         15 while (my ($long, $entry) = each %{ $self->data }) {
  895         22117  
66 890 100       1718 if (my $abbr = $entry->{abbr_std}) {
67 25   100     114 $map->{$abbr} ||= [];
68 25         42 push @{ $map->{$abbr} }, $long;
  25         70  
69             }
70 890 100       1701 my $abbr_list = $entry->{abbr} or next;
71 860 100       1592 $abbr_list = [ $abbr_list ] if ! ref $abbr_list;
72 860         1296 for my $abbr (@$abbr_list) {
73 1365   100     5791 $map->{$abbr} ||= [];
74 1365         1687 push @{ $map->{$abbr} }, $long;
  1365         3495  
75             }
76             }
77 5         149 return $map;
78             }
79              
80             sub long_designators {
81 4     4 1 1178 my $self = shift;
82 4         9 sort keys %{ $self->data };
  4         132  
83             }
84              
85             sub abbreviations {
86 3     3 1 1055 my $self = shift;
87 3         7 sort keys %{ $self->abbr_long_map };
  3         107  
88             }
89              
90             sub designators {
91 1     1 1 3 my $self = shift;
92 1         5 sort $self->long_designators, $self->abbreviations;
93             }
94              
95             # Return the B::CD::Record for $long designator
96             sub record {
97 1590     1590 1 46348 my ($self, $long) = @_;
98 1590 100       40880 my $entry = $self->data->{$long}
99             or croak "Invalid long designator '$long'";
100 1589         46147 return Business::CompanyDesignator::Record->new( long => $long, record => $entry );
101             }
102              
103             # Return a list of B::CD::Records for $designator
104             sub records {
105 1243     1243 1 486211 my ($self, $designator) = @_;
106 1243 50       2878 croak "Missing designator" if ! $designator;
107 1243 100       38104 if (exists $self->data->{$designator}) {
    100          
108 366         934 return ( $self->record($designator) );
109             }
110             elsif (my $long_set = $self->abbr_long_map->{$designator}) {
111 876         2168 return map { $self->record($_) } @$long_set
  1045         2433  
112             }
113             else {
114 1         21 croak "Invalid designator '$designator'";
115             }
116             }
117              
118             # Add $string to regex assembler
119             sub _add_to_assembler {
120 1896     1896   3362 my ($self, $assembler, $lang, $string, $reference_string) = @_;
121 1896   66     4092 $reference_string ||= $string;
122             # printf "+ add_to_assembler (%s): '%s' => '%s'\n", join(',', @{ $lang || []}), $string, $reference_string;
123              
124             # FIXME: RA->add() doesn't work here because of known quantifier-escaping bugs:
125             # https://rt.cpan.org/Public/Bug/Display.html?id=50228
126             # https://rt.cpan.org/Public/Bug/Display.html?id=74449
127             # $assembler->add($string)
128             # Workaround by lexing and using insert()
129 1896         2361 my $optional1 = '\\.?,?\\s*';
130             my @pattern = map {
131             # Periods are treated as optional literals, with optional trailing commas and/or whitespace
132 1896 100       6023 /\./ ? $optional1 :
  26451 100       62650  
    100          
133             # Embedded spaces can be multiple, and include leading commas
134             / / ? ',?\s+' :
135             # Escape other regex metacharacters
136             /[()]/ ? "\\$_" : $_
137             } split //, $string;
138 1896         6768 $assembler->insert(@pattern);
139              
140             # Also add pattern => $string mapping to pattern_string_map and pattern_string_map_lang
141 1896         205850 my $pattern_string = join '', @pattern;
142              
143             # Special case - optional match characters can cause clashes between
144             # distinct pattern_strings e.g. /A\.?,?\s*S\.?,?\s*/ clashes with /AS/
145             # We need to handle such cases as ambiguous with extra checks
146 1896         3108 my $optional1e = "\Q$optional1\E";
147 1896         2345 my $alt_pattern_string1;
148 1896 100       9834 if ($pattern_string =~ /^(\w)(\w)$/) {
    100          
149 84         283 $alt_pattern_string1 = "$1$optional1$2$optional1";
150             } elsif ($pattern_string =~ /^(\w)$optional1e(\w)$optional1e$/) {
151 107         314 $alt_pattern_string1 = "$1$2";
152             }
153              
154             # If $pattern_string already exists in pattern_string_map then the pattern is ambiguous
155             # across entries, and we can't unambiguously map back to a standard designator
156 1896 100 66     56001 if (exists $self->pattern_string_map->{ $pattern_string }) {
    100          
157 429         11785 my $current = $self->pattern_string_map->{ $pattern_string };
158 429 100 100     1468 if ($current && $current ne $reference_string) {
159             # Reset to undef to mark ambiguity
160 3         84 $self->pattern_string_map->{ $pattern_string } = undef;
161             }
162             }
163             # Also check for the existence of $alt_pattern_string1, since this is also an ambiguity
164             elsif ($alt_pattern_string1 && exists $self->pattern_string_map->{ $alt_pattern_string1 }) {
165 7         178 my $current = $self->pattern_string_map->{ $alt_pattern_string1 };
166 7 50 33     55 if ($current && $current ne $reference_string) {
167             # Reset both pairs to undef to mark ambiguity
168 7         203 $self->pattern_string_map->{ $pattern_string } = undef;
169 7         198 $self->pattern_string_map->{ $alt_pattern_string1 } = undef;
170             }
171             }
172             else {
173 1460         36752 $self->pattern_string_map->{ $pattern_string } = $reference_string;
174             }
175 1896 100       3517 if ($lang) {
176 638         1029 for my $l (@$lang) {
177 766 100       20150 if (exists $self->pattern_string_map_lang->{$l}->{ $pattern_string }) {
178 202         5331 my $current = $self->pattern_string_map_lang->{$l}->{ $pattern_string };
179 202 50 33     705 if ($current && $current ne $reference_string) {
180             # Reset to undef to mark ambiguity
181 0         0 $self->pattern_string_map_lang->{$l}->{ $pattern_string } = undef;
182             }
183             }
184             else {
185 564         14773 $self->pattern_string_map_lang->{$l}->{ $pattern_string } = $reference_string;
186             }
187             }
188             }
189              
190             # If $string contains unicode diacritics, also add a version without them for misspellings
191 9 100   9   73 if ($string =~ m/\pM/) {
  9         19  
  9         135  
  1896         8060  
192 217         381 my $stripped = $string;
193 217         1170 $stripped =~ s/\pM//g;
194 217         724 $self->_add_to_assembler($assembler, $lang, $stripped, $reference_string);
195             }
196             }
197              
198             # Assemble designator regexes
199             sub _build_regex {
200 42     42   80 my $self = shift;
201 42         98 my ($type, $lang) = @_;
202              
203 42         99 state $types = { map { $_ => 1 } qw(end end_cont begin) };
  12         56  
204 42 50       147 if (! $types->{$type}) {
205 0         0 croak "invalid regex type '$type'";
206             }
207              
208             # RA constructor - case insensitive, with match tracking
209 42         258 my $assembler = Regexp::Assemble->new->flags('i')->track(1);
210              
211             # Construct language regex if $lang is set
212 42         3153 my $lang_re;
213 42 100       115 if ($lang) {
214 36 100       150 $lang = [ $lang ] if ! ref $lang;
215 36         154 my $lang_str = join '|', sort @$lang;
216 36         446 $lang_re = qr/^($lang_str)$/;
217             }
218              
219 42         88 my $count = 0;
220 42         70 while (my ($long, $entry) = each %{ $self->data }) {
  7518         176560  
221             # If $lang is set, restrict to entries that include $lang
222 7476 100 100     29452 next if $lang_re && $entry->{lang} !~ $lang_re;
223             # If $type is 'begin', restrict to 'lead' entries
224 1448 100 100     3479 next if $type eq 'begin' && ! $entry->{lead};
225             # if $type is 'end_cont', restrict to languages in %LANG_CONTINUA
226 964 100 100     2537 next if $type eq 'end_cont' && ! $LANG_CONTINUA{$entry->{lang}};
227              
228 648         810 $count++;
229 648         3393 my $long_nfd = NFD($long);
230 648         1690 $self->_add_to_assembler($assembler, $lang, $long_nfd);
231              
232             # Add all abbreviations
233 648 100       2215 if (my $abbr_list = $entry->{abbr}) {
234 616 100       1444 $abbr_list = [ $abbr_list ] if ! ref $abbr_list;
235 616         1104 for my $abbr (@$abbr_list) {
236 1031         3723 my $abbr_nfd = NFD($abbr);
237 1031   66     4336 my $abbr_std = NFD($entry->{abbr_std} || $abbr);
238 1031         2207 $self->_add_to_assembler($assembler, $lang, $abbr_nfd, $abbr_std);
239             }
240             }
241             }
242              
243             # If no entries found (a strange/bogus language?), return undef
244 42 100       266 return if $count == 0;
245              
246 30 50       190 return wantarray ? ( $assembler->re, $assembler ) : $assembler->re;
247             }
248              
249             # Regex accessor, returning regexes by type (begin/end) and language (en, es, etc.)
250             # $type defaults to 'end', $lang defaults to undef (for all)
251             sub regex {
252 2753     2753 1 3745 my $self = shift;
253 2753         4785 my ($type, $lang) = @_;
254 2753   50     5042 $type ||= 'end';
255              
256             # $lang might be an arrayref containing multiple language codes
257 2753         3313 my $lang_key;
258 2753 100       4821 if ($lang) {
259 968         1288 $lang_key = $lang;
260 968 50 66     2011 if (ref $lang && ref $lang eq 'ARRAY' && @$lang) {
      66        
261 8 50       18 if (@$lang == 1) {
262 0         0 $lang_key = $lang->[0];
263             }
264             else {
265 8         21 $lang_key = join '_', sort map { lc $_ } @$lang;
  16         64  
266             }
267             }
268             }
269              
270 2753         3625 my $cache_key = $type;
271 2753 100       4934 $cache_key .= "_$lang_key" if $lang_key;
272              
273 2753 100       73585 if (my $entry = $self->regex_cache->{ $cache_key }) {
274 2711 50       11172 return wantarray ? @$entry : $entry->[0];
275             }
276              
277 42         154 my ($re, $assembler) = $self->_build_regex($type, $lang);
278 42         218295 $self->regex_cache->{ $cache_key } = [ $re, $assembler ];
279 42 50       261 return wantarray ? ( $re, $assembler ) : $re;
280             }
281              
282             # Helper to return split_designator results
283             sub _split_designator_result {
284 1079     1079   9895 my $self = shift;
285 1079         4167 my ($lang, $before, $des, $after, $matched_pattern) = @_;
286              
287             # $before can end in whitespace (that we don't want to consume in the RE
288             # for technical reasons around handling punctuation like '& Co' in designators)
289             # So trim here to handle that case.
290 1079 100       4967 $before =~ s/\s+$// if $before;
291              
292 1079         1803 my $des_std;
293 1079 100       2132 if ($matched_pattern) {
294 819 100       8696 $des_std = $self->pattern_string_map_lang->{$lang}->{$matched_pattern} if $lang;
295 819   100     20126 $des_std ||= $self->pattern_string_map->{$matched_pattern};
296 819 100       1738 if ($des_std) {
297             # Always coalesce spaces and delete commas from $des_std
298 799         1629 $des_std =~ s/,+/ /g;
299 799         2071 $des_std =~ s/\s\s+/ /g;
300             }
301             }
302              
303             # Legacy interface - return a simple before / des / after tuple, plus $des_std
304 1079 100 66     2698 return map { defined $_ && ! ref $_ ? NFC($_) : '' } ($before, $des, $after, $des_std)
  1452 100       11041  
305             if wantarray;
306              
307             # New scalar-context interface - return SplitResult object
308 716 100 100     18662 Business::CompanyDesignator::SplitResult->new(
      100        
      100        
      100        
309             before => NFC($before // ''),
310             designator => NFC($des // ''),
311             designator_std => NFC($des_std // ''),
312             after => NFC($after // ''),
313             records => [ $des_std ? $self->records(NFC $des_std) : () ],
314             );
315             }
316              
317             # Split $company_name on (the first) company designator, returning a triplet of strings:
318             # ($before, $designator, $after), plus the normalised form of the designator. If no
319             # designator is found, just returns ($company_name).
320             # e.g. matching "ABC Pty Ltd" would return "Pty Ltd" for $designator, but "Pty. Ltd." for
321             # the normalised form, and "Accessoires XYZ Ltee" would return "Ltee" for $designator,
322             # but "Ltée" for the normalised form
323             sub split_designator {
324 1079     1079 1 238445 my $self = shift;
325 1079         3203 my ($company_name, %arg) = @_;
326 1079         2191 my $lang = $arg{lang};
327 1079         1886 my $allow_embedded = $arg{allow_embedded};
328 1079   100     4394 $allow_embedded //= 1; # backwards-compatibility, unfortunately
329 1079         6859 my $company_name_match = NFD($company_name);
330              
331             # Handle older perls without XPosixPunct
332 1079 50       2012 state $punct_class = eval { '.' =~ m/\p{XPosixPunct}/ } ?
  4         36  
333             '[\s\p{XPosixPunct}]' :
334             '[\s[:punct:]]';
335              
336             # Strip all brackets for continuous language matching
337 1079         4429 (my $company_name_match_cont_stripped = $company_name_match) =~ s/[()\x{ff08}\x{ff09}]//g;
338              
339 1079         2008 my ($end_re, $end_asr, $end_cont_re, $end_cont_asr, $begin_re, $begin_asr);
340 1079 100       2303 if ($lang) {
341 484 100       1120 if ($LANG_CONTINUA{$lang}) {
342 36         81 ($end_cont_re, $end_cont_asr) = $self->regex('end_cont', $lang);
343             } else {
344 448         1251 ($end_re, $end_asr) = $self->regex('end', $lang);
345             }
346 484         1151 ($begin_re, $begin_asr) = $self->regex('begin', $lang);
347             } else {
348 595         1916 ($end_re, $end_asr) = $self->regex('end');
349 595         1524 ($end_cont_re, $end_cont_asr) = $self->regex('end_cont');
350 595         1381 ($begin_re, $begin_asr) = $self->regex('begin');
351             }
352              
353             # Designators are usually final, so try $end_re first
354 1079 100 100     287582 if ($end_re &&
355             $company_name_match =~ m/^\s*(.*?)${punct_class}\s*\(?($end_re)\)?\s*$/) {
356 572         1018642 return $self->_split_designator_result($lang, $1, $2, undef, $end_asr->source($^R));
357             }
358              
359             # No final designator - retry without a word break for the subset of languages
360             # that use continuous scripts (see %LANG_CONTINUA above)
361 507 100 100     506032 if ($end_cont_re &&
362             $company_name_match_cont_stripped =~ m/^\s*(.*?)\(?($end_cont_re)\)?\s*$/) {
363 114         1501 return $self->_split_designator_result($lang, $1, $2, undef, $end_cont_asr->source($^R));
364             }
365              
366             # No final designator - check for a lead designator instead (e.g. RU, NL, etc.)
367 393 100 100     31529 if ($begin_re &&
368             $company_name_match =~ m/^\s*\(?($begin_re)\)?${punct_class}\s*(.*?)\s*$/) {
369 91         29011 return $self->_split_designator_result($lang, undef, $1, $2, $begin_asr->source($^R));
370             }
371              
372             # No final or initial - check for an embedded designator with trailing content
373 302 100 100     33225 if ($end_re && $allow_embedded &&
      100        
374             $company_name_match =~ m/(.*?)${punct_class}\s*\(?($end_re)\)?(?:\s+(.*?))?$/) {
375 42         70358 return $self->_split_designator_result($lang, $1, $2, $3, $end_asr->source($^R));
376             }
377              
378             # No match - return $company_name unchanged
379 260         15879 return $self->_split_designator_result($lang, $company_name);
380             }
381              
382             1;
383              
384             __END__
385              
386             =encoding utf-8
387              
388             =head1 NAME
389              
390             Business::CompanyDesignator - module for matching and stripping/manipulating the
391             company designators appended to company names
392              
393             =head1 VERSION
394              
395             Version: 0.16.
396              
397             This module is considered a B<BETA> release. Interfaces may change and/or break
398             without notice until the module reaches version 1.0.
399              
400             =head1 SYNOPSIS
401              
402             Business::CompanyDesignator is a perl module for matching and stripping/manipulating
403             the typical company designators appended (or sometimes, prepended) to company names.
404             It supports both long forms (e.g. Corporation, Incorporated, Limited etc.) and
405             abbreviations (e.g. Corp., Inc., Ltd., GmbH etc).
406              
407             use Business::CompanyDesignator;
408              
409             # Constructor
410             $bcd = Business::CompanyDesignator->new;
411             # Optionally, you can provide your own company_designator.yml file, instead of the bundled one
412             $bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml');
413              
414             # Get lists of designators, which may be long (e.g. Limited) or abbreviations (e.g. Ltd.)
415             @des = $bcd->designators;
416             @long = $bcd->long_designators;
417             @abbrev = $bcd->abbreviations;
418              
419             # Lookup individual designator records (returns B::CD::Record objects)
420             # Lookup record by long designator (unique)
421             $record = $bcd->record($long_designator);
422             # Lookup records by abbreviation or long designator (may not be unique)
423             @records = $bcd->records($designator);
424              
425             # Get a regex for matching designators by type ('end'/'begin') and lang
426             # By default, returns 'end' regexes for all languages
427             $re = $bcd->regex;
428             $company_name =~ $re and say 'designator found!';
429             $company_name =~ /$re\s*$/ and say 'final designator found!';
430             my $re_begin_en = $bcd->regex('begin', 'en');
431              
432             # Split $company_name on designator, returning a ($before, $designator, $after) triplet,
433             # plus the normalised form of the designator matched (can pass to records(), for example)
434             ($before, $des, $after, $normalised_des) = $bcd->split_designator($company_name);
435              
436             # Or in scalar context, return a L<Business::CompanyDesignator::SplitResult> object
437             $res = $bcd->split_designator($company_name, lang => 'en');
438             print join ' / ', $res->designator_std, $res->short_name, $res->extra;
439              
440              
441             =head1 DATASET
442              
443             Business::CompanyDesignator uses the company designator dataset from here:
444              
445             L<https://github.com/ProfoundNetworks/company_designator>
446              
447             which is bundled with the module. You can use your own (updated or custom)
448             version, if you prefer, by passing a 'datafile' parameter to the constructor.
449              
450             The dataset defines multiple long form designators (like "Company", "Limited",
451             or "Incorporée"), each of which have zero or more abbreviations (e.g. 'Co.',
452             'Ltd.', 'Inc.' etc.), and one or more language codes. The 'Company' entry,
453             for instance, looks like this:
454              
455             Company:
456             abbr:
457             - Co.
458             - '& Co.'
459             - and Co.
460             - and Company
461             lang: en
462              
463             Long designators are unique across the dataset, but abbreviations are not
464             e.g. 'Inc.' is used for both "Incorporated" and French "Incorporée".
465              
466             =head1 METHODS
467              
468             =head2 new()
469              
470             Creates a Business::CompanyDesignator object.
471              
472             $bcd = Business::CompanyDesignator->new;
473              
474             By default this uses the bundled company_designator dataset. You may
475             provide your own (updated or custom) version by passing via a 'datafile'
476             parameter to the constructor.
477              
478             $bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml');
479              
480             =head2 designators()
481              
482             Returns the full list of company designator strings from the dataset
483             (both long form and abbreviations).
484              
485             @designators = $bcd->designators;
486              
487             =head2 long_designators()
488              
489             Returns the full list of long form designators from the dataset.
490              
491             @long = $bcd->long_designators;
492              
493             =head2 abbreviations()
494              
495             Returns the full list of abbreviation designators from the dataset.
496              
497             @abbrev = $bcd->abbreviations;
498              
499             =head2 record($long_designator)
500              
501             Returns the Business::CompanyDesignator::Record object for the given
502             long designator (and dies if not found).
503              
504             =head2 records($designator)
505              
506             Returns a list of Business::CompanyDesignator::Record objects for the
507             given abbreviation or long designator (for long designators there will
508             only be a single record returned, but abbreviations may map to multiple
509             records).
510              
511             Use this method for abbreviations, or if you're aren't sure of a
512             designator's type.
513              
514             =head2 regex([$type], [$lang])
515              
516             Returns a regex for all matching designators for $type ('begin'/'end') and
517             $lang (iso 639-1 language code e.g. 'en', 'es', de', etc.) from the dataset.
518             $lang may be either a single language code scalar, or an arrayref of language
519             codes, for multiple alternative languages. The returned regex is case-insensitive
520             and non-anchored.
521              
522             $type defaults to 'end', so without parameters regex() returns a regex
523             matching all designators for all languages.
524              
525             =head2 split_designator($company_name, [lang => $lang], [allow_embedded => $bool])
526              
527             Attempts to split $company_name on (the first) company designator found.
528              
529             In array context split_designator returns a list of four items - a triplet of
530             strings from $company_name ( $before, $designator, $after ), plus the
531             standardised version of the designator as a fourth element.
532              
533             ($short_name, $des, $after_text, $des_std) = $bcd->split_designator($company_name);
534              
535             In scalar context split_designator returns a L<Business::CompanyDesignator::SplitResult>
536             object.
537              
538             $res = $bcd->split_designator($company_name, lang => $lang);
539              
540             The $des designator in array context, and the SplitResult $res->designator
541             is the designator text as it matched in $company_name, while the array context
542             $des_std, and the SplitResult $res->designator_std is the standardised version
543             as found in the dataset.
544              
545             For instance, "ABC Pty Ltd" would return "Pty Ltd" as the $designator, but
546             "Pty. Ltd." as the stardardised form, and the latter would be what you
547             would find in designators() or would lookup with records(). Similarly,
548             "Accessoires XYZ Ltee" (without the french acute) would match, returning
549             "Ltee" (as found) for the $designator, but "Ltée" (with the acute) as the
550             standardised form.
551              
552             split_designator accepts the following optional (named) parameters:
553              
554             =over 4
555              
556             =item lang => $lang
557              
558             $lang can be a scalar ISO 639-1 language code ('en', 'fr', 'cn', etc.), or an
559             arrayref containing multiple language codes. If $lang is defined, split_designator
560             will only match designators for the specified set of languages, which can improve
561             the accuracy of the split by reducing false positive matches.
562              
563             =item allow_embedded => $boolean
564              
565             allow_embedded is a boolean indicating whether or not designators can occur in
566             the middle of strings, instead of only at the beginning or end. Defaults to true,
567             for backwards compatibility, which yields more matches, but also more false
568             positives. Setting to false is safer, but yields fewer matches (and embedded
569             designators do occur surprisingly often in the wild.)
570              
571             For more discussion, see L<AMBIGUITIES> below.
572              
573             =back
574              
575             =head2 AMBIGUITIES
576              
577             Note that split_designator does not always get the split right. It checks for
578             final designators first, then leading ones, and then finally looks for embedded
579             designators (if allow_embedded is set to true).
580              
581             Leading and trailing designators are usually reasonably accurate, but embedded
582             designators are problematic. For instance, embedded designators allow names like
583             these to split correctly:
584              
585             Amerihealth Insurance Company of NJ
586             Trenkwalder Personal AG Schweiz
587             Vicente Campano S L (COMERCIAL VICAM)
588             Gvozdika, gostinitsa OOO ""Eko-Treyd""
589              
590             but it will also wrongly split names like the following:
591              
592             XYZ PC Repairs ('PC' is a designator meaning 'Professional Corporation')
593             Dr S L Ledingham ('S L' is a Spanish designator for 'Sociedad Limitada')
594              
595             If you do want to allow splitting on embedded designators, you might want to pass
596             a 'lang' parameter to split_designator if you know the language(s) used for your
597             company names, as this will reduce the number of false positives by restricting the
598             set of designators matched against. It won't eliminate the issue altogether though,
599             so some post-processing might be required. (And I'd love to hear of ideas on how
600             to improve this.)
601              
602             =head1 SEE ALSO
603              
604             Finance::CompanyNames
605              
606             =head1 AUTHOR
607              
608             Gavin Carr <gavin@profound.net>
609              
610             =head1 COPYRIGHT AND LICENCE
611              
612             Copyright (C) 2013-2021 Gavin Carr
613              
614             This library is free software; you can redistribute it and/or modify it
615             under the same terms as Perl itself.
616              
617             =cut