| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Text::Perfide::WordBags; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
30757
|
use warnings; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
38
|
|
|
4
|
1
|
|
|
1
|
|
6
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
36
|
|
|
5
|
1
|
|
|
1
|
|
879
|
use utf8::all; |
|
|
1
|
|
|
|
|
68720
|
|
|
|
1
|
|
|
|
|
7
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
3503
|
use base 'Exporter'; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
660
|
|
|
8
|
|
|
|
|
|
|
our @EXPORT = (qw/pairability file2bag bagint baguni bagcard/); |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Text::Perfide::WordBags - Create word bags from text, and operate over them. |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 VERSION |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
Version 0.01_02 |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=cut |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '0.01_02'; |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
Quick summary of what the module does. |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
Perhaps a little code snippet. |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
use Text::Perfide::WordBags; |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
my $score = pairability({ cat => 2, dog => 1 }, { cat => 1, bird => 3 }); |
|
32
|
|
|
|
|
|
|
... |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 EXPORT |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
A list of functions that can be exported. You can delete this section |
|
37
|
|
|
|
|
|
|
if you don't export anything, such as for a purely object-oriented module. |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 FUNCTIONS |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head2 pairability |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Calculates the pairability of two wordbags. The pairability value is given by: |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
$int/($uni || 1) |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
where $int and $uni are the values given by, respectively, the intersection |
|
48
|
|
|
|
|
|
|
and the union of the two bags. |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=cut |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# pairability($bag1, $bag2)
#
# Scores how well two word bags match: the cardinality of their
# intersection divided by the cardinality of their union.
#
#   $bag1, $bag2 - hash references mapping a word to its count
#
# Returns the resulting ratio as a number.
sub pairability {
    my ($bag1, $bag2) = @_;

    my $shared_count = bagcard( bagint($bag1, $bag2) );
    my $total_count  = bagcard( baguni($bag1, $bag2) );

    # A union with cardinality 0 would be a division by zero, so 1 is
    # used as the divisor in that case.
    my $divisor = $total_count || 1;

    return $shared_count / $divisor;
}
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=head2 file2bag |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Create a word bag from a file. |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
Receives as arguments a function reference and a file path. |
|
64
|
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
Reads a file in slurp mode, passes the text to the function passed as argument, |
|
66
|
|
|
|
|
|
|
and returns the result. |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=cut |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# file2bag($txt2bag_fn, $file)
#
# Reads the whole of $file as UTF-8 text and passes it to $txt2bag_fn.
#
#   $txt2bag_fn - code reference, called with the file contents as its
#                 only argument
#   $file       - path of the file to read
#
# Returns whatever $txt2bag_fn returns.
# Dies if the file cannot be opened; the message includes the OS error.
sub file2bag {
    my ($txt2bag_fn, $file) = @_;

    # :encoding(UTF-8) validates the byte stream, which the lax :utf8
    # layer does not do.
    open my $fh, '<:encoding(UTF-8)', $file
        or die "Could not open file '$file': $!";

    # Slurp the file in a single read. $/ is localised so the caller's
    # record separator is left untouched; an empty file yields ''.
    my $text = do { local $/; <$fh> } // '';
    close $fh;

    return $txt2bag_fn->($text);
}
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# sub txt2bag_pn{ # proper nouns |
|
78
|
|
|
|
|
|
|
# my $text = shift; |
|
79
|
|
|
|
|
|
|
# |
|
80
|
|
|
|
|
|
|
# my $upper = {}; |
|
81
|
|
|
|
|
|
|
# my $uppat = qr{\b[A-Z]\w{3,}(?:['-]\w+)*\b}; |
|
82
|
|
|
|
|
|
|
# $upper->{$1}++ while($text =~ /($uppat)/g); |
|
83
|
|
|
|
|
|
|
# |
|
84
|
|
|
|
|
|
|
# my $lower = {}; |
|
85
|
|
|
|
|
|
|
# my $lwpat = qr{\b[a-z]+(?:['-][a-z]+)*\b}; |
|
86
|
|
|
|
|
|
|
# $lower->{$1}++ while($text =~ /($lwpat)/g); |
|
87
|
|
|
|
|
|
|
# |
|
88
|
|
|
|
|
|
|
# foreach my $k (keys %$upper){ |
|
89
|
|
|
|
|
|
|
# if($lower->{lc $k}){ |
|
90
|
|
|
|
|
|
|
# my $ratio = $upper->{$k}/$lower->{lc $k}; |
|
91
|
|
|
|
|
|
|
# delete $upper->{$k} if $ratio < 10; |
|
92
|
|
|
|
|
|
|
# } |
|
93
|
|
|
|
|
|
|
# } |
|
94
|
|
|
|
|
|
|
# return $upper; |
|
95
|
|
|
|
|
|
|
# } |
|
96
|
|
|
|
|
|
|
# |
|
97
|
|
|
|
|
|
|
# sub txt2bag_num{ |
|
98
|
|
|
|
|
|
|
# my $text = shift; |
|
99
|
|
|
|
|
|
|
# my $bag = {}; |
|
100
|
|
|
|
|
|
|
# my $pecul = qr{\d+}; |
|
101
|
|
|
|
|
|
|
# $bag->{$1}++ while($text =~ /($pecul)/g); |
|
102
|
|
|
|
|
|
|
# if(haspn($bag)){ |
|
103
|
|
|
|
|
|
|
# foreach(1..300){ |
|
104
|
|
|
|
|
|
|
# $bag->{$_}-- if $bag->{$_}; |
|
105
|
|
|
|
|
|
|
# delete $bag->{$_} unless $bag->{$_}; |
|
106
|
|
|
|
|
|
|
# } |
|
107
|
|
|
|
|
|
|
# } |
|
108
|
|
|
|
|
|
|
# return $bag; |
|
109
|
|
|
|
|
|
|
# } |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head2 bagint |
|
112
|
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
Calculates the intersection between two wordbags. |
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=cut |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# bagint($bag1, $bag2)
#
# Builds the intersection of two word bags.
#
#   $bag1, $bag2 - hash references mapping a word to its count
#
# Returns a new hash reference. It holds every word of $bag1 whose count
# in $bag2 is true, mapped to the smaller of the two counts.
sub bagint {
    my ($bag1, $bag2) = @_;

    my %common;
    for my $word (keys %$bag1) {
        # Words missing from $bag2 (or counted zero there) are left out.
        if ($bag2->{$word}) {
            my $count1 = $bag1->{$word};
            my $count2 = $bag2->{$word};
            $common{$word} = $count1 < $count2 ? $count1 : $count2;
        }
    }

    return \%common;
}
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=head2 baguni |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
Calculates the union of two wordbags. |
|
130
|
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=cut |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# baguni($bag1, $bag2)
#
# Builds the union of two word bags.
#
#   $bag1, $bag2 - hash references mapping a word to its count
#
# Returns a new hash reference holding every word found in either bag,
# mapped to the larger of its two counts. A word absent from one bag
# counts as 0 there, so every value in the result is defined.
sub baguni {
    my ($bag1, $bag2) = @_;

    my %union;
    for my $word (keys %$bag1) {
        # Explicit defaults replace a blanket "no warnings": a missing
        # count is compared as 0 and can never end up stored as undef.
        my $count1 = $bag1->{$word} // 0;
        my $count2 = $bag2->{$word} // 0;
        $union{$word} = $count1 > $count2 ? $count1 : $count2;
    }

    # Words that occur only in the second bag keep their own count.
    for my $word (keys %$bag2) {
        $union{$word} //= $bag2->{$word} // 0;
    }

    return \%union;
}
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head2 bagcard |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
Calculates the cardinality of a wordbag, i.e. the sum of all its word counts. |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=cut |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# bagcard($bag)
#
# Computes the cardinality of a word bag.
#
#   $bag - hash reference mapping a word to its count
#
# Returns the sum of all counts in the bag (0 for an empty bag).
sub bagcard {
    my $bag = shift;

    my $total = 0;
    # A plain loop instead of map in void context: no throw-away list is
    # built just to get the side effect.
    $total += $_ for values %$bag;

    return $total;
}
|
158
|
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=head1 AUTHOR |
|
162
|
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
Andre Santos, C<< >> |
|
164
|
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=head1 BUGS |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
Please report any bugs or feature requests to C, or through |
|
168
|
|
|
|
|
|
|
the web interface at L. I will be notified, and then you'll |
|
169
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
Copyright 2011 Project Natura. |
|
178
|
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
|
180
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published |
|
181
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License. |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information. |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=cut |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
1; # End of Text::Perfide::WordBags |