File Coverage

blib/lib/Hailo/Tokenizer/Words.pm
Criterion Covered Total %
statement 13 13 100.0
branch n/a
condition n/a
subroutine 5 5 100.0
pod n/a
total 18 18 100.0


line stmt bran cond sub pod time code
1             package Hailo::Tokenizer::Words;
2             BEGIN {
3 1     1   21560 $Hailo::Tokenizer::Words::AUTHORITY = 'cpan:AVAR';
4             }
5             {
6             $Hailo::Tokenizer::Words::VERSION = '0.72';
7             }
8              
9 1     1   24 use 5.010;
  1         4  
  1         33  
10 1     1   5 use utf8;
  1         2  
  1         6  
11 1     1   801 use Any::Moose;
  1         34877  
  1         8  
12 1     1   608 use Any::Moose 'X::StrictConstructor';
  1         1  
  1         6  
13             use Regexp::Common qw/ URI /;
14             use namespace::clean -except => 'meta';
15              
16             with qw(Hailo::Role::Arguments
17             Hailo::Role::Tokenizer);
18              
19             # [[:alpha:]] doesn't match combining characters on Perl >=5.12
20             my $ALPHABET = qr/(?![_\d])\w/;
21              
22             # tokenization
23             my $SPACE = qr/\s/;
24             my $NONSPACE = qr/\S/;
25             my $DASH = qr/[–-]/;
26             my $POINT = qr/[.,]/;
27             my $APOSTROPHE = qr/['’´]/;
28             my $ELLIPSIS = qr/\.{2,}|…/;
29             my $NON_WORD = qr/[^\w\s]+/;
30             my $BARE_WORD = qr/\w+/;
31             my $CURRENCY = qr/[¤¥¢£\$]/;
32             my $NUMBER = qr/$CURRENCY?$POINT\d+(?:$POINT\d+)*(?:$CURRENCY|$ALPHABET+)?|$CURRENCY?\d+(?:$POINT\d+)*(?:$CURRENCY|$ALPHABET+)?(?!\d|$ALPHABET)/;
33             my $APOST_WORD = qr/$ALPHABET+(?:$APOSTROPHE$ALPHABET+)+/;
34             my $ABBREV = qr/$ALPHABET(?:\.$ALPHABET)+\./;
35             my $DOTTED = qr/$BARE_WORD?\.$BARE_WORD(?:\.$BARE_WORD)*/;
36             my $WORD_TYPES = qr/$NUMBER|$ABBREV|$DOTTED|$APOST_WORD|$BARE_WORD/;
37             my $WORD_APOST = qr/$WORD_TYPES(?:$DASH$WORD_TYPES)*$APOSTROPHE(?!$ALPHABET|$NUMBER)/;
38             my $WORD = qr/$WORD_TYPES(?:(?:$DASH$WORD_TYPES)+|$DASH(?!$DASH))?/;
39             my $MIXED_CASE = qr/ \p{Lower}+ \p{Upper} | \p{Upper}{2,} \p{Lower} /x;
40             my $UPPER_NONW = qr/^ (?:\p{Upper}+ \W+)(?
41              
42             # special tokens
43             my $TWAT_NAME = qr/ \@ [A-Za-z0-9_]+ /x;
44             my $EMAIL = qr/ [A-Z0-9._%+-]+ @ [A-Z0-9.-]+ (?: \. [A-Z]{2,4} )* /xi;
45             my $PERL_CLASS = qr/ (?: :: \w+ (?: :: \w+ )* | \w+ (?: :: \w+ )+ ) (?: :: )? | \w+ :: /x;
46             my $EXTRA_URI = qr{ (?: \w+ \+ ) ssh:// $NONSPACE+ }x;
47             my $ESC_SPACE = qr/(?:\\ )+/;
48             my $NAME = qr/(?:$BARE_WORD|$ESC_SPACE)+/;
49             my $FILENAME = qr/ $NAME? \. $NAME (?: \. $NAME )* | $NAME/x;
50             my $UNIX_PATH = qr{ / $FILENAME (?: / $FILENAME )* /? }x;
51             my $WIN_PATH = qr{ $ALPHABET : \\ $FILENAME (?: \\ $FILENAME )* \\?}x;
52             my $PATH = qr/$UNIX_PATH|$WIN_PATH/;
53             my $DATE = qr/[0-9]{4}-W?[0-9]{1,2}-[0-9]{1,2}/i;
54             my $TIME = qr/[0-9]{1,2}:[0-9]{2}(?::[0-9]{2})?(?:Z| ?[AP]M|[-+±][0-9]{2}(?::?[0-9]{2})?)?/i;
55             my $DATETIME = qr/${DATE}T$TIME/;
56             my $IRC_NICK = qr/<(?: |[&~]?[@%+~&])?[A-Za-z_`\-^\|\\\{}\[\]][A-Za-z_0-9`\-^\|\\\{}\[\]]+>/;
57             my $IRC_CHAN = qr/[#&+][^ \a\0\012\015,:]{1,199}/;
58             my $NUMERO = qr/#[0-9]+/;
59             my $CLOSE_TAG = qr{};
60              
61             my $CASED_WORD = qr/$CLOSE_TAG|$IRC_NICK|$IRC_CHAN|$DATETIME|$DATE|$TIME|$PERL_CLASS|$EXTRA_URI|$EMAIL|$TWAT_NAME|$PATH|$NUMERO/;
62              
63             # capitalization
64             # The rest of the regexes are pretty hairy. The goal here is to catch the
65             # most common cases where a word should be capitalized. We try hard to
66             # guard against capitalizing things which don't look like proper words.
67             # Examples include URLs and code snippets.
68             my $OPEN_QUOTE = qr/['"‘“„«»「『‹‚]/;
69             my $CLOSE_QUOTE = qr/['"’“”«»」』›‘]/;
70             my $TERMINATOR = qr/(?:[?!‽]+|(?
71             my $ADDRESS = qr/:/;
72             my $PUNCTUATION = qr/[?!‽,;.:]/;
73             my $BOUNDARY = qr/$CLOSE_QUOTE?(?:\s*$TERMINATOR|$ADDRESS)\s+$OPEN_QUOTE?\s*/;
74             my $LOOSE_WORD = qr/$IRC_CHAN|$DATETIME|$DATE|$TIME|$PATH|$NUMBER|$ABBREV|$APOST_WORD|$NUMERO|$BARE_WORD(?:$DASH(?:$WORD_TYPES|$BARE_WORD)|$APOSTROPHE(?!$ALPHABET|$NUMBER|$APOSTROPHE)|$DASH(?!$DASH{2}))*/;
75             my $SPLIT_WORD = qr{$LOOSE_WORD(?:/$LOOSE_WORD)?(?=$PUNCTUATION(?:\s+|$)|$CLOSE_QUOTE|$TERMINATOR|\s+|$)};
76              
77             # we want to capitalize words that come after "On example.com?"
78             # or "You mean 3.2?", but not "Yes, e.g."
79             my $DOTTED_STRICT = qr/$LOOSE_WORD(?:$POINT(?:\d+|\w{2,}))?/;
80             my $WORD_STRICT = qr/$DOTTED_STRICT(?:$APOSTROPHE$DOTTED_STRICT)*/;
81              
82             # This string is added to (and later removed from) the output string when
83             # capitalizing it in multiple passes. We use backspace, because that is
84             # unlikely to be in the input. This dirty approach can probably be replaced
85             # with regex grammars, but I haven't bothered to learn to use those.
86             my $SEPARATOR = "\x08";
87              
88             # input -> tokens
89             sub make_tokens {
90             my ($self, $input) = @_;
91              
92             my @tokens;
93             $input =~ s/$DASH\K\s*\n+\s*//;
94             $input =~ s/\s*\n+\s*/ /gm;
95              
96             while (length $input) {
97             # remove the next chunk of whitespace
98             $input =~ s/^$SPACE+//;
99             my $got_word;
100              
101             while (length $input && $input =~ /^$NONSPACE/) {
102             # We convert it to ASCII and then look for a URI because $RE{URI}
103             # from Regexp::Common doesn't support non-ASCII domain names
104             my ($ascii) = $input =~ /^($NONSPACE+)/;
105             $ascii =~ s/[^[:ascii:]]/a/g;
106              
107             # URIs
108             if (!$got_word && $ascii =~ / ^ $RE{URI} /xo) {
109             my $uri_end = $+[0];
110             my $uri = substr $input, 0, $uri_end;
111             $input =~ s/^\Q$uri//;
112              
113             push @tokens, [$self->{_spacing_normal}, $uri];
114             $got_word = 1;
115             }
116             # special words for which we preserve case
117             elsif (!$got_word && $input =~ s/ ^ (? $CASED_WORD )//xo) {
118             push @tokens, [$self->{_spacing_normal}, $+{word}];
119             $got_word = 1;
120             }
121             # normal words
122             elsif ($input =~ / ^ $WORD /xo) {
123             my $word;
124              
125             # special case to allow matching q{ridin'} as one word, even when
126             # it appears as q{"ridin'"}, but not as q{'ridin'}
127             my $last_char = @tokens ? substr $tokens[-1][1], -1, 1 : '';
128             if (!@tokens && $input =~ s/ ^ (?$WORD_APOST) //xo
129             || $last_char =~ / ^ $APOSTROPHE $ /xo
130             && $input =~ s/ ^ (?$WORD_APOST) (?
131             $word = $+{word};
132             }
133             else {
134             $input =~ s/^($WORD)//o and $word = $1;
135             }
136              
137             # Maybe preserve the casing of this word
138             $word = lc $word
139             if $word ne uc $word
140             # Mixed-case words like "WoW" or "ATMs"
141             and $word !~ $MIXED_CASE
142             # Words that are upper case followed by a non-word character.
143             and $word !~ $UPPER_NONW;
144              
145             push @tokens, [$self->{_spacing_normal}, $word];
146             $got_word = 1;
147             }
148             # everything else
149             elsif ($input =~ s/ ^ (? $NON_WORD ) //xo) {
150             my $non_word = $+{non_word};
151             my $spacing = $self->{_spacing_normal};
152              
153             # was the previous token a word?
154             if ($got_word) {
155             $spacing = $input =~ /^$NONSPACE/
156             ? $self->{_spacing_infix}
157             : $self->{_spacing_postfix};
158             }
159             # do we still have more tokens?
160             elsif ($input =~ /^$NONSPACE/) {
161             $spacing = $self->{_spacing_prefix};
162             }
163              
164             push @tokens, [$spacing, $non_word];
165             }
166             }
167             }
168              
169             return \@tokens;
170             }
171              
172             # tokens -> output
173             sub make_output {
174             my ($self, $tokens) = @_;
175             my $reply = '';
176              
177             for my $pos (0 .. $#{ $tokens }) {
178             my ($spacing, $text) = @{ $tokens->[$pos] };
179             $reply .= $text;
180              
181             # append whitespace if this is not a prefix token or infix token,
182             # and this is not the last token, and the next token is not
183             # a postfix/infix token
184             if ($pos != $#{ $tokens }
185             && $spacing != $self->{_spacing_prefix}
186             && $spacing != $self->{_spacing_infix}
187             && !($pos < $#{ $tokens }
188             && ($tokens->[$pos+1][0] == $self->{_spacing_postfix}
189             || $tokens->[$pos+1][0] == $self->{_spacing_infix})
190             )
191             ) {
192             $reply .= ' ';
193             }
194             }
195              
196             # capitalize the first word
197             $reply =~ s/^\s*$OPEN_QUOTE?\s*\K($SPLIT_WORD)(?=$ELLIPSIS|(?:(?:$CLOSE_QUOTE|$TERMINATOR|$ADDRESS|$PUNCTUATION+)?(?:\s|$)))/\u$1/o;
198              
199             # capitalize the second word
200             $reply =~ s/^\s*$OPEN_QUOTE?\s*$SPLIT_WORD(?:(?:\s*$TERMINATOR|$ADDRESS)\s+)\K($SPLIT_WORD)/\u$1/o;
201              
202             # capitalize all other words after word boundaries
203             # we do it in two passes because we need to match two words at a time
204             $reply =~ s/(?:$ELLIPSIS|\s+)$OPEN_QUOTE?\s*$WORD_STRICT$BOUNDARY\K($SPLIT_WORD)/$SEPARATOR\u$1$SEPARATOR/go;
205             $reply =~ s/$SEPARATOR$WORD_STRICT$SEPARATOR$BOUNDARY\K($SPLIT_WORD)/\u$1/go;
206             $reply =~ s/$SEPARATOR//go;
207              
208             # end paragraphs with a period when it makes sense
209             $reply =~ s/(?:$ELLIPSIS|\s+|^)$OPEN_QUOTE?(?:$SPLIT_WORD(?:\.$SPLIT_WORD)*)\K($CLOSE_QUOTE?)$/.$1/o;
210              
211             # capitalize I'm, I've...
212             $reply =~ s{(?:(?:$ELLIPSIS|\s+)|$OPEN_QUOTE)\Ki(?=$APOSTROPHE$ALPHABET)}{I}go;
213              
214             return $reply;
215             }
216              
217             __PACKAGE__->meta->make_immutable;
218              
219             =encoding utf8
220              
221             =head1 NAME
222              
223             Hailo::Tokenizer::Words - A tokenizer for L which splits
224             on whitespace and word boundaries, mostly.
225              
226             =head1 DESCRIPTION
227              
228             This tokenizer does its best to handle various languages. It knows about most
229             apostrophes, quotes, and sentence terminators.
230              
231             =head1 AUTHOR
232              
233             Hinrik Ern SigurEsson, hinrik.sig@gmail.com
234              
235             =head1 LICENSE AND COPYRIGHT
236              
237             Copyright 2010 Hinrik Ern SigurEsson
238              
239             This program is free software, you can redistribute it and/or modify
240             it under the same terms as Perl itself.
241              
242             =cut