File Coverage

blib/lib/Text/NGrammer.pm
Criterion Covered Total %
statement 58 67 86.5
branch 8 16 50.0
condition n/a
subroutine 9 11 81.8
pod 7 7 100.0
total 82 101 81.1


line stmt bran cond sub pod time code
1             # Copyright 2018 Francesco Nidito. All rights reserved.
2             #
3             # This library is free software; you can redistribute it and/or
4             # modify it under the same terms as Perl itself.
5              
6             package Text::NGrammer;
7              
8 1     1   5956 use strict;
  1         3  
  1         30  
9 1     1   5 use Carp;
  1         2  
  1         49  
10 1     1   542 use Lingua::Sentence;
  1         74260  
  1         45  
11              
12 1     1   9 use vars qw($VERSION);
  1         2  
  1         335  
13             $VERSION = '0.06';
14              
15             sub new {
16 1     1 1 111 my $class = shift;
17 1         5 my %config = (lang => 'en', # used by the sentencer
18             );
19              
20 1         3 my %param = @_;
21              
22 1         4 for my $opt (keys %param) {
23 0 0       0 croak "option $opt unsupported by version $VERSION of Text::NGrammer" unless exists $config{$opt};
24 0         0 $config{$opt} = $param{$opt};
25             }
26 1         4 $config{version} = $VERSION;
27 1         4 return bless \%config, $class;
28             }
29              
30             ##
31             # Skip-grams
32              
33             sub skipgrams_array {
34 11     11 1 18 my $self = shift;
35 11         15 my $n = shift;
36 11         14 my $k = shift;
37 11         24 my $length = scalar(@_);
38              
39 11 50       21 croak "the n-gram length cannot be lesser than 1" if $n < 1;
40 11 50       20 croak "the tokens to be skipped cannot be lesser than 0" if $k < 0;
41              
42 11         16 my @ngrams = ();
43 11         17 my $step = $k + 1;
44 11         30 for (my $i = 0; $i <= ($length - ($n+($k*($n-1)))); $i += 1) {
45 25         32 my @tokens = ();
46 25         33 my $at = $i;
47 25         46 while (@tokens < $n) {
48 64         95 push @tokens, $_[$at];
49 64         106 $at += $step;
50             }
51 25         58 push @ngrams, \@tokens;
52             }
53              
54 11         34 return @ngrams;
55             }
56              
57             sub skipgrams_sentence {
58 11     11 1 22 my $self = shift;
59 11         12 my $n = shift;
60 11         15 my $k = shift;
61 11         18 my $sentence = shift;
62              
63 11 50       25 croak "the n-gram length cannot be lesser than 1" if $n < 1;
64 11 50       24 croak "the tokens to be skipped cannot be lesser than 0" if $k < 0;
65              
66             # splits a string -- assumed to be a sentence -- according to spaces, control chars, etc.
67 11         175 my @tokens = grep /\S+/, split(/(?:\p{C}|\p{M}|\p{P}|\p{S}|\p{Z})+/, $sentence);
68 11 50       32 return () if @tokens < $n;
69 11         29 return $self->skipgrams_array($n, $k, @tokens);
70             }
71              
72             sub skipgrams_text {
73 12     12 1 1061 my $self = shift;
74 12         19 my $n = shift;
75 12         15 my $k = shift;
76 12         19 my $text = shift;
77              
78 12 50       31 croak "the n-gram length cannot be lesser than 1" if $n < 1;
79 12 100       208 croak "the tokens to be skipped cannot be lesser than 0" if $k < 0;
80              
81 11         19 my @ngrams = ();
82              
83 11         38 my $splitter = Lingua::Sentence->new($self->{lang});
84 11         15836 for my $sentence ($splitter->split_array($text)) {
85 11         746 push @ngrams, $self->skipgrams_sentence($n, $k, $sentence);
86             }
87              
88 11         147 return @ngrams;
89             }
90              
91             ##
92             # N-Grams
93              
94             sub ngrams_array {
95 0     0 1 0 my $self = shift;
96 0         0 my $n = shift;
97              
98 0         0 return $self->skipgrams_array($n, 0, @_);
99             }
100              
101             sub ngrams_sentence {
102 0     0 1 0 my $self = shift;
103 0         0 my $n = shift;
104 0         0 my $sentence = shift;
105              
106 0         0 return $self->skipgrams_sentencey($n, 0, $sentence);
107             }
108              
109             sub ngrams_text {
110 3     3 1 330 my $self = shift;
111 3         5 my $n = shift;
112 3         6 my $text = shift;
113              
114 3         10 return $self->skipgrams_text($n, 0, $text);
115             }
116              
117              
118             1;
119              
120             __END__