|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
  
 
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package String::Normal::Type::Business;  | 
| 
2
 | 
9
 | 
 
 | 
 
 | 
  
9
  
 | 
 
 | 
32
 | 
 use strict;  | 
| 
 
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
    | 
| 
 
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
200
 | 
    | 
| 
3
 | 
9
 | 
 
 | 
 
 | 
  
9
  
 | 
 
 | 
26
 | 
 use warnings;  | 
| 
 
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
    | 
| 
 
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
142
 | 
    | 
| 
4
 | 
9
 | 
 
 | 
 
 | 
  
9
  
 | 
 
 | 
26
 | 
 use String::Normal::Type;  | 
| 
 
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
    | 
| 
 
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
158
 | 
    | 
| 
5
 | 
9
 | 
 
 | 
 
 | 
  
9
  
 | 
 
 | 
2931
 | 
 use String::Normal::Config;  | 
| 
 
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
    | 
| 
 
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
155
 | 
    | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
7
 | 
9
 | 
 
 | 
 
 | 
  
9
  
 | 
 
 | 
3820
 | 
 use Lingua::Stem;  | 
| 
 
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
36370
 | 
    | 
| 
 
 | 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5507
 | 
    | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $STEM;  | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $biz_stop;  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $biz_compress;  | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub transform {  | 
| 
13
 | 
13
 | 
 
 | 
 
 | 
  
13
  
 | 
  
1
  
 | 
19
 | 
     my ($self,$value) = @_;  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # tokenize and stem  | 
| 
16
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
     my (@digits,@words);  | 
| 
17
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28
 | 
     _tokenize_value( $value, \@digits, \@words );  | 
| 
18
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
38
 | 
     $STEM->stem_in_place( @words );  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # Remove "special" beginning and/or ending stopwords, if such words are present  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # and enough tokens are in place to remove them safely.  | 
| 
22
 | 
13
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
6170
 | 
     if (@words) {  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # make a copy of @words and whittle it down  | 
| 
24
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
20
 | 
         my @copy = @words;  | 
| 
25
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
         my $count;  | 
| 
26
 | 
13
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
36
 | 
         if ($count = $biz_stop->{first}{$copy[0]}) {  | 
| 
27
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             shift @copy if @copy >= $count;  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
29
 | 
13
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
74
 | 
         if (@copy and $count = $biz_stop->{last}{$copy[-1]}) {  | 
| 
30
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             pop @copy if @copy >= $count;  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # reverting back if overnormalization occurs  | 
| 
34
 | 
13
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
38
 | 
         @words = @copy if @copy;  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # Remove all middle stop words that are safe to remove, based on the number of  | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # tokens, of course.  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my @filtered = map {  | 
| 
40
 | 
13
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
18
 | 
         my $count = $biz_stop->{middle}{$_} || '';  | 
| 
 
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
80
 | 
    | 
| 
41
 | 
26
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
103
 | 
         (length $count and @words >= $count) ? () : $_;  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } @words;  | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # If we filtered all words out, "revert" to the full array of stemmed tokens.  | 
| 
45
 | 
13
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
24
 | 
     @filtered = @words unless @filtered;  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # The canon name is the sorted filtered stemmed words plus the original digits.  | 
| 
48
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
95
 | 
     return join ' ', sort @digits, @filtered;  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub new {  | 
| 
52
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
  
1
  
 | 
8
 | 
     my $self = shift;  | 
| 
53
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
20
 | 
     $STEM = Lingua::Stem->new;  | 
| 
54
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
118
 | 
     $STEM->add_exceptions( String::Normal::Config::BusinessStem::_data( @_ ) );  | 
| 
55
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3000
 | 
     $biz_stop     = String::Normal::Config::BusinessStop::_data( @_ );  | 
| 
56
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
15
 | 
     $biz_compress = String::Normal::Config::BusinessCompress::_data( @_ );  | 
| 
57
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
25
 | 
     return bless {@_}, $self;  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _tokenize_value {  | 
| 
61
 | 
13
 | 
 
 | 
 
 | 
  
13
  
 | 
 
 | 
18
 | 
     my ($value,$digits,$words) = @_;  | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
63
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
30
 | 
     $value = String::Normal::Type::_scrub_value( $value );  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # split tokens on more than just whitespace:  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # split digits from words but keep things like 3D and 1st combined,  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # also split things like abcd#efgh but keep pound signs for #2 and # 1 and #  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # prevent the empty string from finding its way into the token list as well  | 
| 
69
 | 
13
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
49
 | 
     my @tokens = map { map length $_ ? $_ : (), split /##+|\s+|#+\b|\b#+/, $_ } $value =~ /(?:\d+\w{1,2}\b|\d+|\D+)/g;  | 
| 
 
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
127
 | 
    | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # walk each token thru the tree and create markers  | 
| 
72
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
30
 | 
     my @pairs = _mark_pairs( \@tokens );  | 
| 
73
 | 
13
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
54
 | 
     _compress_list( \@tokens, \@pairs ) if @pairs;  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # separate out tokens that contain digits (snowball stemmer will scrub all digits)  | 
| 
76
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
20
 | 
     for (@tokens) {  | 
| 
77
 | 
26
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
42
 | 
         if (/\d/) {  | 
| 
78
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             push @$digits, $_;  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } else {  | 
| 
80
 | 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
51
 | 
             push @$words, $_;  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _mark_pairs {  | 
| 
86
 | 
13
 | 
 
 | 
 
 | 
  
13
  
 | 
 
 | 
13
 | 
     my $tokens = shift;  | 
| 
87
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
     my @pairs = ();  | 
| 
88
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
38
 | 
     for my $i (0 .. $#$tokens) {  | 
| 
89
 | 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
27
 | 
         my $token = $tokens->[$i];  | 
| 
90
 | 
30
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
74
 | 
         next unless exists $biz_compress->{$token};  | 
| 
91
 | 
5
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
20
 | 
         next if $i + 1 > $#$tokens;  | 
| 
92
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13
 | 
         my $end = _walk_tree( $i + 1, $tokens, $biz_compress->{$token} );  | 
| 
93
 | 
4
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
11
 | 
         if ($end) {  | 
| 
94
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
             push @pairs, [$i,$end];  | 
| 
95
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
             $i = $end;  | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
98
 | 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
21
 | 
     return @pairs;  | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _walk_tree {  | 
| 
103
 | 
5
 | 
 
 | 
 
 | 
  
5
  
 | 
 
 | 
7
 | 
     my ($i, $list, $tree) = @_;  | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
105
 | 
5
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
18
 | 
     if (my $t = $tree->{$list->[$i]}) {  | 
| 
106
 | 
4
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
26
 | 
         if (ref $t eq 'HASH' and !%$t) {  | 
| 
107
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
             return $i;  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } else {  | 
| 
109
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
             _walk_tree( $i + 1, $list, $t );  | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _compress_list {  | 
| 
115
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
 
 | 
4
 | 
     my ($list,$pairs) = @_;  | 
| 
116
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
     for my $pair (reverse @$pairs) {  | 
| 
117
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
         my ($s,$e) = @$pair;  | 
| 
118
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
17
 | 
         splice @$list, $s, $e - $s + 1, join '', @$list[$s .. $e];  | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
124
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |