File Coverage

blib/lib/String/Normal/Type/Business.pm
Criterion Covered Total %
statement 65 68 95.5
branch 22 32 68.7
condition 7 11 63.6
subroutine 11 11 100.0
pod 2 2 100.0
total 107 124 86.2


line stmt bran cond sub pod time code
1             package String::Normal::Type::Business;
2 9     9   32 use strict;
  9         8  
  9         200  
3 9     9   26 use warnings;
  9         7  
  9         142  
4 9     9   26 use String::Normal::Type;
  9         9  
  9         158  
5 9     9   2931 use String::Normal::Config;
  9         10  
  9         155  
6              
7 9     9   3820 use Lingua::Stem;
  9         36370  
  9         5507  
8             our $STEM;
9             our $biz_stop;
10             our $biz_compress;
11              
12             sub transform {
13 13     13 1 19 my ($self,$value) = @_;
14              
15             # tokenize and stem
16 13         11 my (@digits,@words);
17 13         28 _tokenize_value( $value, \@digits, \@words );
18 13         38 $STEM->stem_in_place( @words );
19              
20             # Remove "special" beginning and/or ending stopwords, if such words are present
21             # and enough tokens are in place to remove them safely.
22 13 50       6170 if (@words) {
23             # make a copy of @words and whittle it down
24 13         20 my @copy = @words;
25 13         12 my $count;
26 13 50       36 if ($count = $biz_stop->{first}{$copy[0]}) {
27 0 0       0 shift @copy if @copy >= $count;
28             }
29 13 50 33     74 if (@copy and $count = $biz_stop->{last}{$copy[-1]}) {
30 0 0       0 pop @copy if @copy >= $count;
31             }
32              
33             # reverting back if overnormalization occurs
34 13 50       38 @words = @copy if @copy;
35             }
36              
37             # Remove all middle stop words that are safe to remove, based on the number of
38             # tokens, of course.
39             my @filtered = map {
40 13   100     18 my $count = $biz_stop->{middle}{$_} || '';
  26         80  
41 26 100 66     103 (length $count and @words >= $count) ? () : $_;
42             } @words;
43              
44             # If we filtered all words out, "revert" to the full array of stemmed tokens.
45 13 100       24 @filtered = @words unless @filtered;
46              
47             # The canon name is the sorted filtered stemmed words plus the original digits.
48 13         95 return join ' ', sort @digits, @filtered;
49             }
50              
51             sub new {
52 3     3 1 8 my $self = shift;
53 3         20 $STEM = Lingua::Stem->new;
54 3         118 $STEM->add_exceptions( String::Normal::Config::BusinessStem::_data( @_ ) );
55 3         3000 $biz_stop = String::Normal::Config::BusinessStop::_data( @_ );
56 3         15 $biz_compress = String::Normal::Config::BusinessCompress::_data( @_ );
57 3         25 return bless {@_}, $self;
58             }
59              
60             sub _tokenize_value {
61 13     13   18 my ($value,$digits,$words) = @_;
62              
63 13         30 $value = String::Normal::Type::_scrub_value( $value );
64              
65             # split tokens on more than just whitespace:
66             # split digits from words but keep things like 3D and 1st combined,
67             # also split things like abcd#efgh but keep pound signs for #2 and # 1 and #
68             # prevent the empty string from finding its way into the token list as well
69 13 50       49 my @tokens = map { map length $_ ? $_ : (), split /##+|\s+|#+\b|\b#+/, $_ } $value =~ /(?:\d+\w{1,2}\b|\d+|\D+)/g;
  13         127  
70              
71             # walk each token thru the tree and create markers
72 13         30 my @pairs = _mark_pairs( \@tokens );
73 13 100       54 _compress_list( \@tokens, \@pairs ) if @pairs;
74              
75             # separate out tokens that contain digits (snowball stemmer will scrub all digits)
76 13         20 for (@tokens) {
77 26 50       42 if (/\d/) {
78 0         0 push @$digits, $_;
79             } else {
80 26         51 push @$words, $_;
81             }
82             }
83             }
84              
85             sub _mark_pairs {
86 13     13   13 my $tokens = shift;
87 13         10 my @pairs = ();
88 13         38 for my $i (0 .. $#$tokens) {
89 30         27 my $token = $tokens->[$i];
90 30 100       74 next unless exists $biz_compress->{$token};
91 5 100       20 next if $i + 1 > $#$tokens;
92 4         13 my $end = _walk_tree( $i + 1, $tokens, $biz_compress->{$token} );
93 4 100       11 if ($end) {
94 3         8 push @pairs, [$i,$end];
95 3         6 $i = $end;
96             }
97             }
98 13         21 return @pairs;
99             }
100              
101              
102             sub _walk_tree {
103 5     5   7 my ($i, $list, $tree) = @_;
104              
105 5 100       18 if (my $t = $tree->{$list->[$i]}) {
106 4 100 66     26 if (ref $t eq 'HASH' and !%$t) {
107 3         9 return $i;
108             } else {
109 1         3 _walk_tree( $i + 1, $list, $t );
110             }
111             }
112             }
113              
114             sub _compress_list {
115 3     3   4 my ($list,$pairs) = @_;
116 3         7 for my $pair (reverse @$pairs) {
117 3         8 my ($s,$e) = @$pair;
118 3         17 splice @$list, $s, $e - $s + 1, join '', @$list[$s .. $e];
119             }
120             }
121              
122             1;
123              
124             __END__