File Coverage

blib/lib/Text/Ngram.pm
Criterion Covered Total %
statement 50 50 100.0
branch 23 26 88.4
condition 5 7 71.4
subroutine 8 8 100.0
pod 2 2 100.0
total 88 93 94.6


line stmt bran cond sub pod time code
1             package Text::Ngram;
2              
3 2     2   35019 use 5.008008;
  2         9  
  2         91  
4 2     2   12 use strict;
  2         4  
  2         78  
5 2     2   11 use warnings;
  2         9  
  2         78  
6              
7 2     2   2098 use Unicode::CaseFold;
  2         2574  
  2         1542  
8              
9             require Exporter;
10              
               # Export configuration: nothing is exported by default (@EXPORT is
               # empty); ngram_counts and add_to_counts can be imported by name or
               # together through the ':all' tag.
11             our @ISA = qw(Exporter);
12             our %EXPORT_TAGS = ( 'all' => [ qw( ngram_counts add_to_counts) ] );
13             our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
14             our @EXPORT = qw();
15              
16             our $VERSION = '0.15';
17              
18             =head1 NAME
19              
20             Text::Ngram - Ngram analysis of text
21              
22             =head1 SYNOPSIS
23              
24             use Text::Ngram qw(ngram_counts add_to_counts);
25             my $text = "abcdefghijklmnop";
26             my $hash_r = ngram_counts($text, 3); # Window size = 3
27             # $hash_r => { abc => 1, bcd => 1, ... }
28              
29             add_to_counts($more_text, 3, $hash_r);
30              
31             =head1 DESCRIPTION
32              
33             n-Gram analysis is a field in textual analysis which uses sliding window
34             character sequences in order to aid topic analysis, language
35             determination and so on. The n-gram spectrum of a document can be used
36             to compare and filter documents in multiple languages, prepare word
37             prediction networks, and perform spelling correction.
38              
39             The neat thing about n-grams, though, is that they're really easy to
40             determine. For n=3, for instance, we compute the n-gram counts like so:
41              
42             the cat sat on the mat
43             --- $counts{"the"}++;
44             --- $counts{"he "}++;
45             --- $counts{"e c"}++;
46             ...
47              
48             This module provides an efficient XS-based implementation of n-gram
49             spectrum analysis.
50              
51             There are two functions which can be imported:
52              
53             =cut
54              
55             require XSLoader;
               # Load the compiled part of Text::Ngram, passing $VERSION along.
               # _process_buffer and _process_buffer_incrementally are called below
               # but not defined in this file; presumably they are provided by the
               # code loaded here -- TODO confirm.
56             XSLoader::load('Text::Ngram', $VERSION);
57              
58             sub _clean_buffer {
               # Normalise a text buffer before it is handed to the n-gram counter.
               #
               # Arguments: a hash reference of options, then the text.
               # Options read here: lowercase, punctuation, flankbreaks.
               # Returns the cleaned copy of the text.
59 19     19   18 my %config = %{+shift};
  19         94  
60 19         28 my $buffer = shift;
               # fc is presumably the case-folding function exported by
               # Unicode::CaseFold, which is imported at the top of the file.
61 19 50   1   81 $buffer = fc $buffer if $config{lowercase};
  1         38  
  1         1  
  1         16  
               # Collapse every run of whitespace into a single space.
62 19         26760 $buffer =~ s/\s+/ /g;
               # When punctuation is not kept, each run of characters that are
               # neither alphabetic nor a space is replaced by one \xff marker.
               # Callers later delete every n-gram containing \xff.
63 19 100       49 unless ($config{punctuation}) {
64 15 100       27 if ($config{flankbreaks}) {
               # Marker surrounded by spaces.
65 11         87 $buffer =~ s/[^[:alpha:] ]+/ \xff /g;
66             }
67             else {
               # Bare marker, no surrounding spaces.
68 4         22 $buffer =~ s/[^[:alpha:] ]+/\xff/g;
69             }
70             }
               # Squeeze runs of spaces (the substitutions above can create
               # doubled spaces) down to a single space.
71 19         38 $buffer =~ y/ / /s;
72 19         258 return $buffer;
73             }
74              
75             =head2 ngram_counts
76              
77             This first function returns a hash reference with the n-gram histogram
78             of the text for the given window size. The default window size is 5.
79              
80             $href = ngram_counts(\%config, $text, $window_size);
81              
82             As of version 0.14, the %config may instead be passed in as named arguments:
83              
84             $href = ngram_counts($text, $window_size, %config);
85              
86             The only necessary parameter is $text.
87              
88             The possible values for %config are:
89              
90             =head3 flankbreaks
91              
92             If set to 1 (default), breaks are flanked by spaces; if set to 0,
93             they're not. Breaks are punctuation and other non-alphabetic
94             characters, which, unless you use C<< punctuation => 1 >> in your
95             configuration, do not make it into the returned hash.
96              
97             Here's an example, supposing you're using the default value
98             for punctuation (0):
99              
100             my $text = "Hello, world";
101             my $hash = ngram_counts($text, 5);
102              
103             That produces the following ngrams:
104              
105             {
106             'Hello' => 1,
107             'ello ' => 1,
108             ' worl' => 1,
109             'world' => 1,
110             }
111              
112             On the other hand, this:
113              
114             my $text = "Hello, world";
115             my $hash = ngram_counts({flankbreaks => 0}, $text, 5);
116              
117             Produces the following ngrams:
118              
119             {
120             'Hello' => 1,
121             ' worl' => 1,
122             'world' => 1,
123             }
124              
125             =head3 lowercase
126              
127             If set to 0, casing is preserved. If set to 1, all letters are
128             lowercased before counting ngrams. Default is 1.
129              
130             # Get all ngrams of size 4 preserving case
131             $href_p = ngram_counts( {lowercase => 0}, $text, 4 );
132              
133             =head3 punctuation
134              
135             If set to 0 (default), punctuation is removed before calculating the
136             ngrams. Set to 1 to preserve it.
137              
138             # Get all ngrams of size 2 preserving punctuation
139             $href_p = ngram_counts( {punctuation => 1}, $text, 2 );
140              
141             =head3 spaces
142              
143             If set to 0 (default is 1), no ngrams containing spaces will be returned.
144              
145             # Get all ngrams of size 3 that do not contain spaces
146             $href = ngram_counts( {spaces => 0}, $text, 3);
147              
148             If you're going to request both types of ngrams, then the best way to
149             avoid calculating the same thing twice is probably this:
150              
151             $href_with_spaces = ngram_counts($text[, $window]);
152             $href_no_spaces = { %$href_with_spaces }; # copy, so the original keeps its keys
153             for (keys %$href_no_spaces) { delete $href_no_spaces->{$_} if / / }
154              
155             =cut
156              
157             sub ngram_counts {
                # Build the n-gram histogram of a text and return it as a hash
                # reference.
                #
                # Accepted call forms:
                #   ngram_counts(\%config, $text, $width)
                #   ngram_counts($text, $width, %config)
                #   ngram_counts($text, %config)
                #
                # Defaults below are overridden by whatever the caller supplies.
158 16     16 1 832 my %config = (
159             spaces => 1,
160             punctuation => 0,
161             lowercase => 1,
162             flankbreaks => 1
163             );
                # Form 1: a leading hash reference is shifted off @_ and merged
                # over the defaults.
164 16 100       59 if (ref($_[0]) eq 'HASH') {
    100          
165 7         14 %config = (%config, %{+shift});
  7         33  
166             }
                # Forms 2 and 3: trailing named options. With an odd argument
                # count the options start at index 1 (text only); with an even
                # count they start at index 2 (text and width). splice removes
                # them from @_ so only the positional arguments remain.
167             elsif (@_ > 2) {
168 2 100       17 %config = (%config, splice @_, (@_ & 1) ? 1 : 2);
169             }
170 16         29 my ($buffer, $width) = @_;
                # A missing or false width (undef, 0, '') falls back to 5.
171 16   100     36 $width ||= 5;
                # After the default above, only a negative width reaches this
                # guard; it yields an empty histogram.
172 16 50       33 return {} if $width < 1;
                # _process_buffer is not defined in this file; presumably it is
                # the compiled counting routine -- TODO confirm.
173 16         45 my $href = _process_buffer(_clean_buffer(\%config, $buffer), $width);
                # Drop n-grams that contain the \xff marker _clean_buffer put in
                # place of non-alphabetic runs.
174 16 100       47 unless ($config{punctuation}) {
175 12 100       59 for (keys %$href) { delete $href->{$_} if /\xff/ }
  169         356  
176             }
                # Optionally drop every n-gram that contains a space.
177 16 100       52 unless ($config{spaces}) {
178 1 100       5 for (keys %$href) { delete $href->{$_} if / / }
  14         29  
179             }
180 16         131 return $href;
181             }
182              
183             =head2 add_to_counts
184              
185             This incrementally adds to the supplied hash; if C<$window> is zero or
186             undefined, then the window size is computed from the hash keys.
187              
188             add_to_counts($more_text, $window, $href)
189              
190             =cut
191              
192             sub add_to_counts {
                # Process more text against an existing histogram hash reference.
                #
                # Arguments: $buffer (text), $width (window size), $href (hash ref).
                # There is no explicit return statement; results are delivered
                # through $href.
                #
                # NOTE(review): the options are fixed here and have no flankbreaks
                # key, so _clean_buffer takes its non-flanking branch, whereas
                # ngram_counts defaults to flankbreaks => 1. Confirm this
                # difference is intended.
193 3     3 1 2525 my %config = (punctuation => 0, lowercase => 1);
194 3         6 my ($buffer, $width, $href) = @_;
                # With no usable width, take the length of one existing key; if
                # that is false too (for example an empty hash), fall back to 5.
195 3 100 66     19 if (!defined $width or !$width) {
196 1         3 my ($key, undef) = each %$href; # Just gimme a random key
197 1   50     5 $width = length $key || 5;
198             }
                # _process_buffer_incrementally is not defined in this file;
                # presumably it is the compiled routine that updates $href in
                # place -- TODO confirm.
199 3         10 _process_buffer_incrementally(_clean_buffer(\%config, $buffer), $width, $href);
                # Drop n-grams that contain the \xff marker _clean_buffer put in
                # place of non-alphabetic runs.
200 3 50       24 for (keys %$href) { delete $href->{$_} if /\xff/ }
  42         78  
201             }
202              
203             1;
204             __END__