File Coverage

blib/lib/Hailo/Tokenizer/Words.pm
Criterion Covered Total %
statement 74 74 100.0
branch 22 24 91.6
condition 33 36 91.6
subroutine 10 10 100.0
pod 0 2 0.0
total 139 146 95.2


line stmt bran cond sub pod time code
1             package Hailo::Tokenizer::Words;
2             our $AUTHORITY = 'cpan:AVAR';
3             $Hailo::Tokenizer::Words::VERSION = '0.75';
4 29     29   3266303 use v5.10.0;
  29         143  
5 29     29   5533 use utf8;
  29         184  
  29         220  
6 29     29   2421 use Moose;
  29         1364052  
  29         252  
7 29     29   204193 use MooseX::StrictConstructor;
  29         93207  
  29         282  
8 29     29   130130 use Regexp::Common qw/ URI /;
  29         72181  
  29         130  
9 29     29   672507 use namespace::clean -except => 'meta';
  29         81  
  29         300  
10              
11             with qw(Hailo::Role::Arguments
12             Hailo::Role::Tokenizer);
13              
14             # [[:alpha:]] doesn't match combining characters on Perl >=5.12
15             my $ALPHABET = qr/(?![_\d])\w/;
16              
17             # tokenization
18             my $SPACE = qr/\s/;
19             my $NONSPACE = qr/\S/;
20             my $DASH = qr/[–-]/;
21             my $POINT = qr/[.,]/;
22             my $APOSTROPHE = qr/['’´]/;
23             my $ELLIPSIS = qr/\.{2,}|…/;
24             my $NON_WORD = qr/[^\w\s]+/;
25             my $BARE_WORD = qr/\w+/;
26             my $CURRENCY = qr/[¤¥¢£\$]/;
27             my $NUMBER = qr/$CURRENCY?$POINT\d+(?:$POINT\d+)*(?:$CURRENCY|$ALPHABET+)?|$CURRENCY?\d+(?:$POINT\d+)*(?:$CURRENCY|$ALPHABET+)?(?!\d|$ALPHABET)/;
28             my $APOST_WORD = qr/$ALPHABET+(?:$APOSTROPHE$ALPHABET+)+/;
29             my $ABBREV = qr/$ALPHABET(?:\.$ALPHABET)+\./;
30             my $DOTTED = qr/$BARE_WORD?\.$BARE_WORD(?:\.$BARE_WORD)*/;
31             my $WORD_TYPES = qr/$NUMBER|$ABBREV|$DOTTED|$APOST_WORD|$BARE_WORD/;
32             my $WORD_APOST = qr/$WORD_TYPES(?:$DASH$WORD_TYPES)*$APOSTROPHE(?!$ALPHABET|$NUMBER)/;
33             my $WORD = qr/$WORD_TYPES(?:(?:$DASH$WORD_TYPES)+|$DASH(?!$DASH))?/;
34 14     14   11217 my $MIXED_CASE = qr/ \p{Lower}+ \p{Upper} | \p{Upper}{2,} \p{Lower} /x;
  14         43  
  14         194  
35             my $UPPER_NONW = qr/^ (?:\p{Upper}+ \W+)(?<!I') (?: \p{Upper}* \p{Lower} ) /x;
36              
37             # special tokens
38             my $TWAT_NAME = qr/ \@ [A-Za-z0-9_]+ /x;
39             my $EMAIL = qr/ [A-Z0-9._%+-]+ @ [A-Z0-9.-]+ (?: \. [A-Z]{2,4} )* /xi;
40             my $PERL_CLASS = qr/ (?: :: \w+ (?: :: \w+ )* | \w+ (?: :: \w+ )+ ) (?: :: )? | \w+ :: /x;
41             my $EXTRA_URI = qr{ (?: \w+ \+ ) ssh:// $NONSPACE+ }x;
42             my $ESC_SPACE = qr/(?:\\ )+/;
43             my $NAME = qr/(?:$BARE_WORD|$ESC_SPACE)+/;
44             my $FILENAME = qr/ $NAME? \. $NAME (?: \. $NAME )* | $NAME/x;
45             my $UNIX_PATH = qr{ / $FILENAME (?: / $FILENAME )* /? }x;
46             my $WIN_PATH = qr{ $ALPHABET : \\ $FILENAME (?: \\ $FILENAME )* \\?}x;
47             my $PATH = qr/$UNIX_PATH|$WIN_PATH/;
48             my $DATE = qr/[0-9]{4}-W?[0-9]{1,2}-[0-9]{1,2}/i;
49             my $TIME = qr/[0-9]{1,2}:[0-9]{2}(?::[0-9]{2})?(?:Z| ?[AP]M|[-+±][0-9]{2}(?::?[0-9]{2})?)?/i;
50             my $DATETIME = qr/${DATE}T$TIME/;
51             my $IRC_NICK = qr/<(?: |[&~]?[@%+~&])?[A-Za-z_`\-^\|\\\{}\[\]][A-Za-z_0-9`\-^\|\\\{}\[\]]+>/;
52             my $IRC_CHAN = qr/[#&+][^ \a\0\012\015,:]{1,199}/;
53             my $NUMERO = qr/#[0-9]+/;
54             my $CLOSE_TAG = qr{</[-\w]+>};
55              
56             my $CASED_WORD = qr/$CLOSE_TAG|$IRC_NICK|$IRC_CHAN|$DATETIME|$DATE|$TIME|$PERL_CLASS|$EXTRA_URI|$EMAIL|$TWAT_NAME|$PATH|$NUMERO/;
57              
58             # capitalization
59             # The rest of the regexes are pretty hairy. The goal here is to catch the
60             # most common cases where a word should be capitalized. We try hard to
61             # guard against capitalizing things which don't look like proper words.
62             # Examples include URLs and code snippets.
63             my $OPEN_QUOTE = qr/['"‘“„«»「『‹‚]/;
64             my $CLOSE_QUOTE = qr/['"’“”«»」』›‘]/;
65             my $TERMINATOR = qr/(?:[?!‽]+|(?<!\.)\.)/;
66             my $ADDRESS = qr/:/;
67             my $PUNCTUATION = qr/[?!‽,;.:]/;
68             my $BOUNDARY = qr/$CLOSE_QUOTE?(?:\s*$TERMINATOR|$ADDRESS)\s+$OPEN_QUOTE?\s*/;
69             my $LOOSE_WORD = qr/$IRC_CHAN|$DATETIME|$DATE|$TIME|$PATH|$NUMBER|$ABBREV|$APOST_WORD|$NUMERO|$BARE_WORD(?:$DASH(?:$WORD_TYPES|$BARE_WORD)|$APOSTROPHE(?!$ALPHABET|$NUMBER|$APOSTROPHE)|$DASH(?!$DASH{2}))*/;
70             my $SPLIT_WORD = qr{$LOOSE_WORD(?:/$LOOSE_WORD)?(?=$PUNCTUATION(?:\s+|$)|$CLOSE_QUOTE|$TERMINATOR|\s+|$)};
71              
72             # we want to capitalize words that come after "On example.com?"
73             # or "You mean 3.2?", but not "Yes, e.g."
74             my $DOTTED_STRICT = qr/$LOOSE_WORD(?:$POINT(?:\d+|\w{2,}))?/;
75             my $WORD_STRICT = qr/$DOTTED_STRICT(?:$APOSTROPHE$DOTTED_STRICT)*/;
76              
77             # This string is added to (and later removed from) the output string when
78             # capitalizing it in multiple passes. We use backspace, because that is
79             # unlikely to be in the input. This dirty approach can probably be replaced
80             # with regex grammars, but I haven't bothered to learn to use those.
81             my $SEPARATOR = "\x08";
82              
83             # input -> tokens
84             sub make_tokens {
85 5117     5117 0 219802 my ($self, $input) = @_;
86              
87 5117         8484 my @tokens;
88 5117         24104 $input =~ s/$DASH\K\s*\n+\s*//;
89 5117         13120 $input =~ s/\s*\n+\s*/ /gm;
90              
91 5117         13547 while (length $input) {
92             # remove the next chunk of whitespace
93 28247         118542 $input =~ s/^$SPACE+//;
94 28247         46496 my $got_word;
95              
96 28247   100     124555 while (length $input && $input =~ /^$NONSPACE/) {
97             # We convert it to ASCII and then look for a URI because $RE{URI}
98             # from Regexp::Common doesn't support non-ASCII domain names
99 34731         122250 my ($ascii) = $input =~ /^($NONSPACE+)/;
100 34731         71473 $ascii =~ s/[^[:ascii:]]/a/g;
101              
102             # URIs
103 34731 100 100     360343 if (!$got_word && $ascii =~ / ^ $RE{URI} /xo) {
    100 100        
    100          
    50          
104 8         28 my $uri_end = $+[0];
105 8         30 my $uri = substr $input, 0, $uri_end;
106 8         143 $input =~ s/^\Q$uri//;
107              
108 8         30 push @tokens, [$self->{_spacing_normal}, $uri];
109 8         46 $got_word = 1;
110             }
111             # special words for which we preserve case
112             elsif (!$got_word && $input =~ s/ ^ (?<word> $CASED_WORD )//xo) {
113 313     3   7590 push @tokens, [$self->{_spacing_normal}, $+{word}];
  3         67976  
  3         1231  
  3         2038  
114 313         2023 $got_word = 1;
115             }
116             # normal words
117             elsif ($input =~ / ^ $WORD /xo) {
118 26814         104493 my $word;
119              
120             # special case to allow matching q{ridin'} as one word, even when
121             # it appears as q{"ridin'"}, but not as q{'ridin'}
122 26814 100       70847 my $last_char = @tokens ? substr $tokens[-1][1], -1, 1 : '';
123 26814 100 100     148053 if (!@tokens && $input =~ s/ ^ (?<word>$WORD_APOST) //xo
      66        
      66        
124             || $last_char =~ / ^ $APOSTROPHE $ /xo
125             && $input =~ s/ ^ (?<word>$WORD_APOST) (?<! $last_char ) //xo) {
126 21         161 $word = $+{word};
127             }
128             else {
129 26793 50       203141 $input =~ s/^($WORD)//o and $word = $1;
130             }
131              
132             # Maybe preserve the casing of this word
133 26814 100 100     247301 $word = lc $word
      100        
134             if $word ne uc $word
135             # Mixed-case words like "WoW" or "ATMs"
136             and $word !~ $MIXED_CASE
137             # Words that are upper case followed by a non-word character.
138             and $word !~ $UPPER_NONW;
139              
140 26814         88195 push @tokens, [$self->{_spacing_normal}, $word];
141 26814         155208 $got_word = 1;
142             }
143             # everything else
144             elsif ($input =~ s/ ^ (?<non_word> $NON_WORD ) //xo) {
145 7596         47311 my $non_word = $+{non_word};
146 7596         18786 my $spacing = $self->{_spacing_normal};
147              
148             # was the previous token a word?
149 7596 100       17723 if ($got_word) {
    100          
150             $spacing = $input =~ /^$NONSPACE/
151             ? $self->{_spacing_infix}
152 5646 100       19011 : $self->{_spacing_postfix};
153             }
154             # do we still have more tokens?
155             elsif ($input =~ /^$NONSPACE/) {
156 651         1361 $spacing = $self->{_spacing_prefix};
157             }
158              
159 7596         45422 push @tokens, [$spacing, $non_word];
160             }
161             }
162             }
163              
164 5117         17566 return \@tokens;
165             }
166              
167             # tokens -> output
168             sub make_output {
169 1299     1299 0 115823 my ($self, $tokens) = @_;
170 1299         2145 my $reply = '';
171              
172 1299         1886 for my $pos (0 .. $#{ $tokens }) {
  1299         3595  
173 13926         17305 my ($spacing, $text) = @{ $tokens->[$pos] };
  13926         24467  
174 13926         20344 $reply .= $text;
175              
176             # append whitespace if this is not a prefix token or infix token,
177             # and this is not the last token, and the next token is not
178             # a postfix/infix token
179 13926 100 100     16585 if ($pos != $#{ $tokens }
  13926   100     52535  
      66        
      100        
180             && $spacing != $self->{_spacing_prefix}
181             && $spacing != $self->{_spacing_infix}
182             && !($pos < $#{ $tokens }
183             && ($tokens->[$pos+1][0] == $self->{_spacing_postfix}
184             || $tokens->[$pos+1][0] == $self->{_spacing_infix})
185             )
186             ) {
187 9935         20679 $reply .= ' ';
188             }
189             }
190              
191             # capitalize the first word
192 1299         36744 $reply =~ s/^\s*$OPEN_QUOTE?\s*\K($SPLIT_WORD)(?=$ELLIPSIS|(?:(?:$CLOSE_QUOTE|$TERMINATOR|$ADDRESS|$PUNCTUATION+)?(?:\s|$)))/\u$1/o;
193              
194             # capitalize the second word
195 1299         41542 $reply =~ s/^\s*$OPEN_QUOTE?\s*$SPLIT_WORD(?:(?:\s*$TERMINATOR|$ADDRESS)\s+)\K($SPLIT_WORD)/\u$1/o;
196              
197             # capitalize all other words after word boundaries
198             # we do it in two passes because we need to match two words at a time
199 1299         95891 $reply =~ s/(?:$ELLIPSIS|\s+)$OPEN_QUOTE?\s*$WORD_STRICT$BOUNDARY\K($SPLIT_WORD)/$SEPARATOR\u$1$SEPARATOR/go;
200 1299         35986 $reply =~ s/$SEPARATOR$WORD_STRICT$SEPARATOR$BOUNDARY\K($SPLIT_WORD)/\u$1/go;
201 1299         3690 $reply =~ s/$SEPARATOR//go;
202              
203             # end paragraphs with a period when it makes sense
204 1299         111908 $reply =~ s/(?:$ELLIPSIS|\s+|^)$OPEN_QUOTE?(?:$SPLIT_WORD(?:\.$SPLIT_WORD)*)\K($CLOSE_QUOTE?)$/.$1/o;
205              
206             # capitalize I'm, I've...
207 1299         18252 $reply =~ s{(?:(?:$ELLIPSIS|\s+)|$OPEN_QUOTE)\Ki(?=$APOSTROPHE$ALPHABET)}{I}go;
208              
209 1299         9573 return $reply;
210             }
211              
212             __PACKAGE__->meta->make_immutable;
213              
214             =encoding utf8
215              
216             =head1 NAME
217              
218             Hailo::Tokenizer::Words - A tokenizer for L<Hailo|Hailo> which splits
219             on whitespace and word boundaries, mostly.
220              
221             =head1 DESCRIPTION
222              
223             This tokenizer does its best to handle various languages. It knows about most
224             apostrophes, quotes, and sentence terminators.
225              
226             =head1 AUTHOR
227              
228             Hinrik E<Ouml>rn SigurE<eth>sson, hinrik.sig@gmail.com
229              
230             =head1 LICENSE AND COPYRIGHT
231              
232             Copyright 2010 Hinrik E<Ouml>rn SigurE<eth>sson
233              
234             This program is free software, you can redistribute it and/or modify
235             it under the same terms as Perl itself.
236              
237             =cut