File Coverage

blib/lib/Business/CompanyDesignator.pm
Criterion Covered Total %
statement 193 196 98.4
branch 94 104 90.3
condition 62 73 84.9
subroutine 26 26 100.0
pod 7 7 100.0
total 382 406 94.0


line stmt bran cond sub pod time code
1             package Business::CompanyDesignator;
2              
3             # Require perl 5.010 because the 'track' functionality of Regexp::Assemble
4             # is unsafe for earlier versions.
5 9     9   660497 use 5.010001;
  9         106  
6 9     9   4566 use Moose;
  9         3704066  
  9         70  
7 9     9   61055 use utf8;
  9         60  
  9         57  
8 9     9   314 use warnings qw(FATAL utf8);
  9         16  
  9         413  
9 9     9   2618 use FindBin qw($Bin);
  9         5346  
  9         1148  
10 9     9   2319 use YAML;
  9         32361  
  9         505  
11 9     9   5562 use File::ShareDir qw(dist_file);
  9         174307  
  9         567  
12 9     9   68 use List::MoreUtils qw(uniq);
  9         27  
  9         42  
13 9     9   11876 use Regexp::Assemble;
  9         158772  
  9         342  
14 9     9   5041 use Unicode::Normalize;
  9         17531  
  9         664  
15 9     9   70 use Carp;
  9         18  
  9         434  
16              
17 9     9   4151 use Business::CompanyDesignator::Record;
  9         39  
  9         543  
18 9     9   5097 use Business::CompanyDesignator::SplitResult;
  9         29  
  9         10042  
19              
20             our $VERSION = '0.17';
21              
22             # Hardcode the set of languages that we treat as 'continuous'
23             # i.e. their non-ascii designators don't require a word break
24             # before/after.
25             our %LANG_CONTINUA = map { $_ => 1 } qw(
26             zh
27             ja
28             ko
29             );
30              
31             has 'datafile' => ( is => 'ro', default => sub {
32             # Development/test version
33             my $local_datafile = "$Bin/../share/company_designator_dev.yml";
34             return $local_datafile if -f $local_datafile;
35             $local_datafile = "$Bin/../share/company_designator.yml";
36             return $local_datafile if -f $local_datafile;
37             # Installed version
38             return dist_file('Business-CompanyDesignator', 'company_designator.yml');
39             });
40              
41             # data is the raw dataset as loaded from datafile, keyed by long designator
42             has data => ( is => 'ro', lazy_build => 1 );
43              
44             # regex_cache is a cache of regexes by language and type, since they're expensive to build
45             has 'regex_cache' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
46              
47             # abbr_long_map is a hash mapping abbreviations (strings) back to an arrayref of
48             # long designators (since abbreviations are not necessarily unique)
49             has 'abbr_long_map' => ( is => 'ro', isa => 'HashRef', lazy_build => 1 );
50              
51             # pattern_string_map is a hash mapping patterns back to their source string,
52             # since we do things like add additional patterns without diacritics
53             has 'pattern_string_map' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
54             # pattern_string_map_lang is a hash of hashes, mapping language codes to hashes
55             # of patterns back to their source string
56             has 'pattern_string_map_lang' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
57              
58             sub _build_data {
59 8     8   53 my $self = shift;
60 8         256 YAML::LoadFile($self->datafile);
61             }
62              
63             sub _build_abbr_long_map {
64 5     5   28 my $self = shift;
65 5         16 my $map = {};
66 5         13 while (my ($long, $entry) = each %{ $self->data }) {
  935         22972  
67 930 100       1731 if (my $abbr = $entry->{abbr_std}) {
68 25   100     112 $map->{$abbr} ||= [];
69 25         39 push @{ $map->{$abbr} }, $long;
  25         75  
70             }
71 930 100       1735 my $abbr_list = $entry->{abbr} or next;
72 900 100       1728 $abbr_list = [ $abbr_list ] if ! ref $abbr_list;
73 900         1386 for my $abbr (@$abbr_list) {
74 1440   100     6043 $map->{$abbr} ||= [];
75 1440         1695 push @{ $map->{$abbr} }, $long;
  1440         3596  
76             }
77             }
78 5         138 return $map;
79             }
80              
81             sub long_designators {
82 4     4 1 1197 my $self = shift;
83 4         9 sort keys %{ $self->data };
  4         141  
84             }
85              
86             sub abbreviations {
87 3     3 1 976 my $self = shift;
88 3         6 sort keys %{ $self->abbr_long_map };
  3         114  
89             }
90              
91             sub designators {
92 1     1 1 2 my $self = shift;
93 1         5 sort $self->long_designators, $self->abbreviations;
94             }
95              
96             # Return the B::CD::Record for $long designator
97             sub record {
98 1782     1782 1 48890 my ($self, $long) = @_;
99 1782 100       45238 my $entry = $self->data->{$long}
100             or croak "Invalid long designator '$long'";
101 1781         51476 return Business::CompanyDesignator::Record->new( long => $long, record => $entry );
102             }
103              
104             # Return a list of B::CD::Records for $designator
105             sub records {
106 1427     1427 1 562913 my ($self, $designator) = @_;
107 1427 50       3383 croak "Missing designator" if ! $designator;
108 1427 100       43146 if (exists $self->data->{$designator}) {
    100          
109 388         1127 return ( $self->record($designator) );
110             }
111             elsif (my $long_set = $self->abbr_long_map->{$designator}) {
112 1038         2413 return map { $self->record($_) } @$long_set
  1207         2818  
113             }
114             else {
115 1         19 croak "Invalid designator '$designator'";
116             }
117             }
118              
119             # Add $string to regex assembler
120             sub _add_to_assembler {
121 1973     1973   3716 my ($self, $assembler, $lang, $string, $reference_string) = @_;
122 1973   66     4481 $reference_string ||= $string;
123             # printf "+ add_to_assembler (%s): '%s' => '%s'\n", join(',', @{ $lang || []}), $string, $reference_string;
124              
125             # FIXME: RA->add() doesn't work here because of known quantifier-escaping bugs:
126             # https://rt.cpan.org/Public/Bug/Display.html?id=50228
127             # https://rt.cpan.org/Public/Bug/Display.html?id=74449
128             # $assembler->add($string)
129             # Workaround by lexing and using insert()
130 1973         2572 my $optional1 = '\\.?,?\\s*';
131             my @pattern = map {
132             # Periods are treated as optional literals, with optional trailing commas and/or whitespace
133 1973 100       6542 /\./ ? $optional1 :
  27614 100       66046  
    100          
134             # Embedded spaces can be multiple, and include leading commas
135             / / ? ',?\s+' :
136             # Escape other regex metacharacters
137             /[()]/ ? "\\$_" : $_
138             } split //, $string;
139 1973         7194 $assembler->insert(@pattern);
140              
141             # Also add pattern => $string mapping to pattern_string_map and pattern_string_map_lang
142 1973         218229 my $pattern_string = join '', @pattern;
143              
144             # Special case - optional match characters can cause clashes between
145             # distinct pattern_strings e.g. /A\.?,?\s*S\.?,?\s*/ clashes with /AS/
146             # We need to handle such cases as ambiguous with extra checks
147 1973         3110 my $optional1e = "\Q$optional1\E";
148 1973         2362 my $alt_pattern_string1;
149 1973 100       10447 if ($pattern_string =~ /^(\w)(\w)$/) {
    100          
150 96         366 $alt_pattern_string1 = "$1$optional1$2$optional1";
151             } elsif ($pattern_string =~ /^(\w)$optional1e(\w)$optional1e$/) {
152 109         335 $alt_pattern_string1 = "$1$2";
153             }
154              
155             # If $pattern_string already exists in pattern_string_map then the pattern is ambiguous
156             # across entries, and we can't unambiguously map back to a standard designator
157 1973 100 66     58918 if (exists $self->pattern_string_map->{ $pattern_string }) {
    100          
158 432         10812 my $current = $self->pattern_string_map->{ $pattern_string };
159 432 100 100     1518 if ($current && $current ne $reference_string) {
160             # Reset to undef to mark ambiguity
161 3         84 $self->pattern_string_map->{ $pattern_string } = undef;
162             }
163             }
164             # Also check for the existence of $alt_pattern_string1, since this is also an ambiguity
165             elsif ($alt_pattern_string1 && exists $self->pattern_string_map->{ $alt_pattern_string1 }) {
166 7         209 my $current = $self->pattern_string_map->{ $alt_pattern_string1 };
167 7 50 33     42 if ($current && $current ne $reference_string) {
168             # Reset both pairs to undef to mark ambiguity
169 7         195 $self->pattern_string_map->{ $pattern_string } = undef;
170 7         181 $self->pattern_string_map->{ $alt_pattern_string1 } = undef;
171             }
172             }
173             else {
174 1534         38994 $self->pattern_string_map->{ $pattern_string } = $reference_string;
175             }
176 1973 100       3605 if ($lang) {
177 681         1145 for my $l (@$lang) {
178 821 100       21608 if (exists $self->pattern_string_map_lang->{$l}->{ $pattern_string }) {
179 230         6226 my $current = $self->pattern_string_map_lang->{$l}->{ $pattern_string };
180 230 50 33     780 if ($current && $current ne $reference_string) {
181             # Reset to undef to mark ambiguity
182 0         0 $self->pattern_string_map_lang->{$l}->{ $pattern_string } = undef;
183             }
184             }
185             else {
186 591         15165 $self->pattern_string_map_lang->{$l}->{ $pattern_string } = $reference_string;
187             }
188             }
189             }
190              
191             # If $string contains unicode diacritics, also add a version without them for misspellings
192 9 100   9   75 if ($string =~ m/\pM/) {
  9         22  
  9         138  
  1973         8511  
193 231         404 my $stripped = $string;
194 231         1347 $stripped =~ s/\pM//g;
195 231         750 $self->_add_to_assembler($assembler, $lang, $stripped, $reference_string);
196             }
197             }
198              
199             # Assemble designator regexes
200             sub _build_regex {
201 50     50   104 my $self = shift;
202 50         148 my ($type, $lang) = @_;
203              
204 50         105 state $types = { map { $_ => 1 } qw(end end_cont begin) };
  12         41  
205 50 50       193 if (! $types->{$type}) {
206 0         0 croak "invalid regex type '$type'";
207             }
208              
209             # RA constructor - case insensitive, with match tracking
210 50         344 my $assembler = Regexp::Assemble->new->flags('i')->track(1);
211              
212             # Construct language regex if $lang is set
213 50         4053 my $lang_re;
214 50 100       144 if ($lang) {
215 44 100       204 $lang = [ $lang ] if ! ref $lang;
216 44         200 my $lang_str = join '|', sort @$lang;
217 44         698 $lang_re = qr/^($lang_str)$/;
218             }
219              
220 50         113 my $count = 0;
221 50         97 while (my ($long, $entry) = each %{ $self->data }) {
  9350         222252  
222             # If $lang is set, restrict to entries that include $lang
223 9300 100 100     38214 next if $lang_re && $entry->{lang} !~ $lang_re;
224             # If $type is 'begin', restrict to 'lead' entries
225 1528 100 100     3676 next if $type eq 'begin' && ! $entry->{lead};
226             # if $type is 'end_cont', restrict to languages in %LANG_CONTINUA
227 1023 100 100     2742 next if $type eq 'end_cont' && ! $LANG_CONTINUA{$entry->{lang}};
228              
229 691         924 $count++;
230 691         3535 my $long_nfd = NFD($long);
231 691         1923 $self->_add_to_assembler($assembler, $lang, $long_nfd);
232              
233             # Add all abbreviations
234 691 100       2076 if (my $abbr_list = $entry->{abbr}) {
235 659 100       1523 $abbr_list = [ $abbr_list ] if ! ref $abbr_list;
236 659         1298 for my $abbr (@$abbr_list) {
237             # Only treat non-ascii abbreviations as continuous
238 1114 100 100     2440 next if $type eq 'end_cont' && $abbr =~ /^\p{ASCII}+$/;
239 1051         3900 my $abbr_nfd = NFD($abbr);
240 1051   66     4248 my $abbr_std = NFD($entry->{abbr_std} || $abbr);
241 1051         2266 $self->_add_to_assembler($assembler, $lang, $abbr_nfd, $abbr_std);
242             }
243             }
244             }
245              
246             # If no entries found (a strange/bogus language?), return undef
247 50 100       405 return if $count == 0;
248              
249 35 50       208 return wantarray ? ( $assembler->re, $assembler ) : $assembler->re;
250             }
251              
252             # Regex accessor, returning regexes by type (begin/end) and language (en, es, etc.)
253             # $type defaults to 'end', $lang defaults to undef (for all)
254             sub regex {
255 3282     3282 1 4604 my $self = shift;
256 3282         5823 my ($type, $lang) = @_;
257 3282   50     6010 $type ||= 'end';
258              
259             # $lang might be an arrayref containing multiple language codes
260 3282         4049 my $lang_key;
261 3282 100       5677 if ($lang) {
262 1152         1525 $lang_key = $lang;
263 1152 50 66     2657 if (ref $lang && ref $lang eq 'ARRAY' && @$lang) {
      66        
264 8 50       22 if (@$lang == 1) {
265 0         0 $lang_key = $lang->[0];
266             }
267             else {
268 8         22 $lang_key = join '_', sort map { lc $_ } @$lang;
  16         78  
269             }
270             }
271             }
272              
273 3282         4359 my $cache_key = $type;
274 3282 100       6168 $cache_key .= "_$lang_key" if $lang_key;
275              
276 3282 100       88646 if (my $entry = $self->regex_cache->{ $cache_key }) {
277 3232 50       12012 return wantarray ? @$entry : $entry->[0];
278             }
279              
280 50         227 my ($re, $assembler) = $self->_build_regex($type, $lang);
281 50         230461 $self->regex_cache->{ $cache_key } = [ $re, $assembler ];
282 50 50       354 return wantarray ? ( $re, $assembler ) : $re;
283             }
284              
285             # Helper to return split_designator results
286             sub _split_designator_result {
287 1286     1286   12125 my $self = shift;
288 1286         5290 my ($lang, $before, $des, $after, $matched_pattern) = @_;
289              
290             # $before can end in whitespace (that we don't want to consume in the RE
291             # for technical reasons around handling punctuation like '& Co' in designators)
292             # So trim here to handle that case.
293 1286 100       5959 $before =~ s/\s+$// if $before;
294              
295 1286         2214 my $des_std;
296 1286 100       2782 if ($matched_pattern) {
297 980 100       10705 $des_std = $self->pattern_string_map_lang->{$lang}->{$matched_pattern} if $lang;
298 980   100     24003 $des_std ||= $self->pattern_string_map->{$matched_pattern};
299 980 100       2470 if ($des_std) {
300             # Always coalesce spaces and delete commas from $des_std
301 960         2274 $des_std =~ s/,+/ /g;
302 960         2973 $des_std =~ s/\s\s+/ /g;
303             }
304             }
305              
306             # Legacy interface - return a simple before / des / after tuple, plus $des_std
307 1286 100 66     3390 return map { defined $_ && ! ref $_ ? NFC($_) : '' } ($before, $des, $after, $des_std)
  1728 100       13422  
308             if wantarray;
309              
310             # New scalar-context interface - return SplitResult object
311 854 100 100     24006 Business::CompanyDesignator::SplitResult->new(
      100        
      100        
      100        
312             before => NFC($before // ''),
313             designator => NFC($des // ''),
314             designator_std => NFC($des_std // ''),
315             after => NFC($after // ''),
316             records => [ $des_std ? $self->records(NFC $des_std) : () ],
317             );
318             }
319              
320             # Split $company_name on (the first) company designator, returning a triplet of strings:
321             # ($before, $designator, $after), plus the normalised form of the designator. If no
322             # designator is found, just returns ($company_name).
323             # e.g. matching "ABC Pty Ltd" would return "Pty Ltd" for $designator, but "Pty. Ltd." for
324             # the normalised form, and "Accessoires XYZ Ltee" would return "Ltee" for $designator,
325             # but "Ltée" for the normalised form
326             sub split_designator {
327 1286     1286 1 285419 my $self = shift;
328 1286         4113 my ($company_name, %arg) = @_;
329 1286         2544 my $lang = $arg{lang};
330 1286         3041 my $allow_embedded = $arg{allow_embedded};
331 1286   100     5448 $allow_embedded //= 1; # backwards-compatibility, unfortunately
332 1286         8759 my $company_name_match = NFD($company_name);
333              
334             # Handle older perls without XPosixPunct
335 1286 50       2326 state $punct_class = eval { '.' =~ m/\p{XPosixPunct}/ } ?
  4         35  
336             '[\s\p{XPosixPunct}]' :
337             '[\s[:punct:]]';
338              
339             # Strip all brackets for continuous language matching
340 1286         5330 (my $company_name_match_cont_stripped = $company_name_match) =~ s/[()\x{ff08}\x{ff09}]//g;
341              
342 1286         2444 my ($end_re, $end_asr, $end_cont_re, $end_cont_asr, $begin_re, $begin_asr);
343 1286 100       2925 if ($lang) {
344 576 100       1903 if ($LANG_CONTINUA{$lang}) {
345 36         195 ($end_cont_re, $end_cont_asr) = $self->regex('end_cont', $lang);
346             } else {
347 540         2023 ($end_re, $end_asr) = $self->regex('end', $lang);
348             }
349 576         1584 ($begin_re, $begin_asr) = $self->regex('begin', $lang);
350             } else {
351 710         2100 ($end_re, $end_asr) = $self->regex('end');
352 710         1790 ($end_cont_re, $end_cont_asr) = $self->regex('end_cont');
353 710         1574 ($begin_re, $begin_asr) = $self->regex('begin');
354             }
355              
356             # Designators are usually final, so try $end_re first
357 1286 100 100     360951 if ($end_re &&
358             $company_name_match =~ m/^\s*(.*?)${punct_class}\s*\(?($end_re)\)?\s*$/) {
359 677         1266193 return $self->_split_designator_result($lang, $1, $2, undef, $end_asr->source($^R));
360             }
361              
362             # No final designator - retry without a word break for the subset of languages
363             # that use continuous scripts (see %LANG_CONTINUA above)
364 609 100 100     632773 if ($end_cont_re &&
365             $company_name_match_cont_stripped =~ m/^\s*(.*?)\(?($end_cont_re)\)?\s*$/) {
366 114         1902 return $self->_split_designator_result($lang, $1, $2, undef, $end_cont_asr->source($^R));
367             }
368              
369             # No final designator - check for a lead designator instead (e.g. RU, NL, etc.)
370 495 100 100     45453 if ($begin_re &&
371             $company_name_match =~ m/^\s*\(?($begin_re)\)?${punct_class}\s*(.*?)\s*$/) {
372 147         53550 return $self->_split_designator_result($lang, undef, $1, $2, $begin_asr->source($^R));
373             }
374              
375             # No final or initial - check for an embedded designator with trailing content
376 348 100 100     34810 if ($end_re && $allow_embedded &&
      100        
377             $company_name_match =~ m/(.*?)${punct_class}\s*\(?($end_re)\)?(?:\s+(.*?))?$/) {
378 42         70350 return $self->_split_designator_result($lang, $1, $2, $3, $end_asr->source($^R));
379             }
380              
381             # No match - return $company_name unchanged
382 306         16268 return $self->_split_designator_result($lang, $company_name);
383             }
384              
385             1;
386              
387             __END__
388              
389             =encoding utf-8
390              
391             =head1 NAME
392              
393             Business::CompanyDesignator - module for matching and stripping/manipulating the
394             company designators appended to company names
395              
396             =head1 VERSION
397              
398             Version: 0.17.
399              
400             This module is considered a B<BETA> release. Interfaces may change and/or break
401             without notice until the module reaches version 1.0.
402              
403             =head1 SYNOPSIS
404              
405             Business::CompanyDesignator is a perl module for matching and stripping/manipulating
406             the typical company designators appended (or sometimes, prepended) to company names.
407             It supports both long forms (e.g. Corporation, Incorporated, Limited etc.) and
408             abbreviations (e.g. Corp., Inc., Ltd., GmbH etc).
409              
410             use Business::CompanyDesignator;
411              
412             # Constructor
413             $bcd = Business::CompanyDesignator->new;
414             # Optionally, you can provide your own company_designator.yml file, instead of the bundled one
415             $bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml');
416              
417             # Get lists of designators, which may be long (e.g. Limited) or abbreviations (e.g. Ltd.)
418             @des = $bcd->designators;
419             @long = $bcd->long_designators;
420             @abbrev = $bcd->abbreviations;
421              
422             # Lookup individual designator records (returns B::CD::Record objects)
423             # Lookup record by long designator (unique)
424             $record = $bcd->record($long_designator);
425             # Lookup records by abbreviation or long designator (may not be unique)
426             @records = $bcd->records($designator);
427              
428             # Get a regex for matching designators by type ('end'/'begin') and lang
429             # By default, returns 'end' regexes for all languages
430             $re = $bcd->regex;
431             $company_name =~ $re and say 'designator found!';
432             $company_name =~ /$re\s*$/ and say 'final designator found!';
433             my $re_begin_en = $bcd->regex('begin', 'en');
434              
435             # Split $company_name on designator, returning a ($before, $designator, $after) triplet,
436             # plus the normalised form of the designator matched (can pass to records(), for example)
437             ($before, $des, $after, $normalised_des) = $bcd->split_designator($company_name);
438              
439             # Or in scalar context, return a L<Business::CompanyDesignator::SplitResult> object
440             $res = $bcd->split_designator($company_name, lang => 'en');
441             print join ' / ', $res->designator_std, $res->short_name, $res->extra;
442              
443              
444             =head1 DATASET
445              
446             Business::CompanyDesignator uses the company designator dataset from here:
447              
448             L<https://github.com/ProfoundNetworks/company_designator>
449              
450             which is bundled with the module. You can use your own (updated or custom)
451             version, if you prefer, by passing a 'datafile' parameter to the constructor.
452              
453             The dataset defines multiple long form designators (like "Company", "Limited",
454             or "Incorporée"), each of which have zero or more abbreviations (e.g. 'Co.',
455             'Ltd.', 'Inc.' etc.), and one or more language codes. The 'Company' entry,
456             for instance, looks like this:
457              
458             Company:
459             abbr:
460             - Co.
461             - '& Co.'
462             - and Co.
463             - and Company
464             lang: en
465              
466             Long designators are unique across the dataset, but abbreviations are not
467             e.g. 'Inc.' is used for both "Incorporated" and French "Incorporée".
468              
469             =head1 METHODS
470              
471             =head2 new()
472              
473             Creates a Business::CompanyDesignator object.
474              
475             $bcd = Business::CompanyDesignator->new;
476              
477             By default this uses the bundled company_designator dataset. You may
478             provide your own (updated or custom) version by passing via a 'datafile'
479             parameter to the constructor.
480              
481             $bcd = Business::CompanyDesignator->new(datafile => '/path/to/company_designator.yml');
482              
483             =head2 designators()
484              
485             Returns the full list of company designator strings from the dataset
486             (both long form and abbreviations).
487              
488             @designators = $bcd->designators;
489              
490             =head2 long_designators()
491              
492             Returns the full list of long form designators from the dataset.
493              
494             @long = $bcd->long_designators;
495              
496             =head2 abbreviations()
497              
498             Returns the full list of abbreviation designators from the dataset.
499              
500             @abbrev = $bcd->abbreviations;
501              
502             =head2 record($long_designator)
503              
504             Returns the Business::CompanyDesignator::Record object for the given
505             long designator (and dies if not found).
506              
507             =head2 records($designator)
508              
509             Returns a list of Business::CompanyDesignator::Record objects for the
510             given abbreviation or long designator (for long designators there will
511             only be a single record returned, but abbreviations may map to multiple
512             records).
513              
514             Use this method for abbreviations, or if you're aren't sure of a
515             designator's type.
516              
517             =head2 regex([$type], [$lang])
518              
519             Returns a regex for all matching designators for $type ('begin'/'end') and
520             $lang (iso 639-1 language code e.g. 'en', 'es', de', etc.) from the dataset.
521             $lang may be either a single language code scalar, or an arrayref of language
522             codes, for multiple alternative languages. The returned regex is case-insensitive
523             and non-anchored.
524              
525             $type defaults to 'end', so without parameters regex() returns a regex
526             matching all designators for all languages.
527              
528             =head2 split_designator($company_name, [lang => $lang], [allow_embedded => $bool])
529              
530             Attempts to split $company_name on (the first) company designator found.
531              
532             In array context split_designator returns a list of four items - a triplet of
533             strings from $company_name ( $before, $designator, $after ), plus the
534             standardised version of the designator as a fourth element.
535              
536             ($short_name, $des, $after_text, $des_std) = $bcd->split_designator($company_name);
537              
538             In scalar context split_designator returns a L<Business::CompanyDesignator::SplitResult>
539             object.
540              
541             $res = $bcd->split_designator($company_name, lang => $lang);
542              
543             The $des designator in array context, and the SplitResult $res->designator
544             is the designator text as it matched in $company_name, while the array context
545             $des_std, and the SplitResult $res->designator_std is the standardised version
546             as found in the dataset.
547              
548             For instance, "ABC Pty Ltd" would return "Pty Ltd" as the $designator, but
549             "Pty. Ltd." as the stardardised form, and the latter would be what you
550             would find in designators() or would lookup with records(). Similarly,
551             "Accessoires XYZ Ltee" (without the french acute) would match, returning
552             "Ltee" (as found) for the $designator, but "Ltée" (with the acute) as the
553             standardised form.
554              
555             split_designator accepts the following optional (named) parameters:
556              
557             =over 4
558              
559             =item lang => $lang
560              
561             $lang can be a scalar ISO 639-1 language code ('en', 'fr', 'cn', etc.), or an
562             arrayref containing multiple language codes. If $lang is defined, split_designator
563             will only match designators for the specified set of languages, which can improve
564             the accuracy of the split by reducing false positive matches.
565              
566             =item allow_embedded => $boolean
567              
568             allow_embedded is a boolean indicating whether or not designators can occur in
569             the middle of strings, instead of only at the beginning or end. Defaults to true,
570             for backwards compatibility, which yields more matches, but also more false
571             positives. Setting to false is safer, but yields fewer matches (and embedded
572             designators do occur surprisingly often in the wild.)
573              
574             For more discussion, see L<AMBIGUITIES> below.
575              
576             =back
577              
578             =head2 AMBIGUITIES
579              
580             Note that split_designator does not always get the split right. It checks for
581             final designators first, then leading ones, and then finally looks for embedded
582             designators (if allow_embedded is set to true).
583              
584             Leading and trailing designators are usually reasonably accurate, but embedded
585             designators are problematic. For instance, embedded designators allow names like
586             these to split correctly:
587              
588             Amerihealth Insurance Company of NJ
589             Trenkwalder Personal AG Schweiz
590             Vicente Campano S L (COMERCIAL VICAM)
591             Gvozdika, gostinitsa OOO ""Eko-Treyd""
592              
593             but it will also wrongly split names like the following:
594              
595             XYZ PC Repairs ('PC' is a designator meaning 'Professional Corporation')
596             Dr S L Ledingham ('S L' is a Spanish designator for 'Sociedad Limitada')
597              
598             If you do want to allow splitting on embedded designators, you might want to pass
599             a 'lang' parameter to split_designator if you know the language(s) used for your
600             company names, as this will reduce the number of false positives by restricting the
601             set of designators matched against. It won't eliminate the issue altogether though,
602             so some post-processing might be required. (And I'd love to hear of ideas on how
603             to improve this.)
604              
605             =head1 SEE ALSO
606              
607             Finance::CompanyNames
608              
609             =head1 AUTHOR
610              
611             Gavin Carr <gavin@profound.net>
612              
613             =head1 COPYRIGHT AND LICENCE
614              
615             Copyright (C) 2013-2021 Gavin Carr
616              
617             This library is free software; you can redistribute it and/or modify it
618             under the same terms as Perl itself.
619              
620             =cut