File Coverage

blib/lib/String/Normal/Type/Business.pm
Criterion Covered Total %
statement 65 68 95.5
branch 22 32 68.7
condition 7 11 63.6
subroutine 11 11 100.0
pod 2 2 100.0
total 107 124 86.2


line stmt bran cond sub pod time code
1             package String::Normal::Type::Business;
2 9     9   30 use strict;
  9         9  
  9         183  
3 9     9   26 use warnings;
  9         7  
  9         141  
4 9     9   26 use String::Normal::Type;
  9         9  
  9         161  
5 9     9   2858 use String::Normal::Config;
  9         14  
  9         167  
6              
7 9     9   3823 use Lingua::Stem;
  9         38328  
  9         5508  
8             our $STEM;
9             our $biz_stop;
10             our $biz_compress;
11              
12             sub transform {
13 13     13 1 15 my ($self,$value) = @_;
14              
15             # tokenize and stem
16 13         9 my (@digits,@words);
17 13         24 _tokenize_value( $value, \@digits, \@words );
18 13         37 $STEM->stem_in_place( @words );
19              
20             # Remove "special" beginning and/or ending stopwords, if such words are present
21             # and enough tokens are in place to remove them safely.
22 13 50       5904 if (@words) {
23             # make a copy of @words and whittle it down
24 13         26 my @copy = @words;
25 13         9 my $count;
26 13 50       36 if ($count = $biz_stop->{first}{$copy[0]}) {
27 0 0       0 shift @copy if @copy >= $count;
28             }
29 13 50 33     59 if (@copy and $count = $biz_stop->{last}{$copy[-1]}) {
30 0 0       0 pop @copy if @copy >= $count;
31             }
32              
33             # reverting back if overnormalization occurs
34 13 50       40 @words = @copy if @copy;
35             }
36              
37             # Remove all middle stop words that are safe to remove, based on the number of
38             # tokens, of course.
39             my @filtered = map {
40 13   100     18 my $count = $biz_stop->{middle}{$_} || '';
  26         78  
41 26 100 66     89 (length $count and @words >= $count) ? () : $_;
42             } @words;
43              
44             # If we filtered all words out, "revert" to the full array of stemmed tokens.
45 13 100       23 @filtered = @words unless @filtered;
46              
47             # The canon name is the sorted filtered stemmed words plus the original digits.
48 13         93 return join ' ', sort @digits, @filtered;
49             }
50              
51             sub new {
52 3     3 1 6 my $self = shift;
53 3         16 $STEM = Lingua::Stem->new;
54 3         110 $STEM->add_exceptions( String::Normal::Config::BusinessStem::_data( @_ ) );
55 3         2326 $biz_stop = String::Normal::Config::BusinessStop::_data( @_ );
56 3         13 $biz_compress = String::Normal::Config::BusinessCompress::_data( @_ );
57 3         20 return bless {@_}, $self;
58             }
59              
60             sub _tokenize_value {
61 13     13   16 my ($value,$digits,$words) = @_;
62              
63 13         26 $value = String::Normal::Type::_scrub_value( $value );
64              
65             # split tokens on more than just whitespace:
66             # split digits from words but keep things like 3D and 1st combined,
67             # also split things like abcd#efgh but keep pound signs for #2 and # 1 and #
68             # prevent the empty string from finding its way into the token list as well
69 13 50       45 my @tokens = map { map length $_ ? $_ : (), split /##+|\s+|#+\b|\b#+/, $_ } $value =~ /(?:\d+\w{1,2}\b|\d+|\D+)/g;
  13         115  
70              
71             # walk each token thru the tree and create markers
72 13         27 my @pairs = _mark_pairs( \@tokens );
73 13 100       40 _compress_list( \@tokens, \@pairs ) if @pairs;
74              
75             # separate out tokens that contain digits (snowball stemmer will scrub all digits)
76 13         20 for (@tokens) {
77 26 50       40 if (/\d/) {
78 0         0 push @$digits, $_;
79             } else {
80 26         49 push @$words, $_;
81             }
82             }
83             }
84              
85             sub _mark_pairs {
86 13     13   9 my $tokens = shift;
87 13         12 my @pairs = ();
88 13         37 for my $i (0 .. $#$tokens) {
89 30         30 my $token = $tokens->[$i];
90 30 100       57 next unless exists $biz_compress->{$token};
91 5 100       16 next if $i + 1 > $#$tokens;
92 4         10 my $end = _walk_tree( $i + 1, $tokens, $biz_compress->{$token} );
93 4 100       9 if ($end) {
94 3         5 push @pairs, [$i,$end];
95 3         5 $i = $end;
96             }
97             }
98 13         23 return @pairs;
99             }
100              
101              
102             sub _walk_tree {
103 5     5   6 my ($i, $list, $tree) = @_;
104              
105 5 100       14 if (my $t = $tree->{$list->[$i]}) {
106 4 100 66     23 if (ref $t eq 'HASH' and !%$t) {
107 3         10 return $i;
108             } else {
109 1         4 _walk_tree( $i + 1, $list, $t );
110             }
111             }
112             }
113              
114             sub _compress_list {
115 3     3   4 my ($list,$pairs) = @_;
116 3         7 for my $pair (reverse @$pairs) {
117 3         4 my ($s,$e) = @$pair;
118 3         18 splice @$list, $s, $e - $s + 1, join '', @$list[$s .. $e];
119             }
120             }
121              
122             1;
123              
124             __END__