File Coverage

blib/lib/Text/Document.pm

Criterion	Covered	Total	%
statement	99	164	60.3
branch	23	52	44.2
condition	3	12	25.0
subroutine	12	18	66.6
pod	11	14	78.5
total	148	260	56.9

line	stmt	bran	cond	sub	pod	time	code
1
2							package Text::Document;
3
4							$Text::Document::VERSION = '1.05';
5
6	5			5		1642	use strict;
	5					8
	5					203
7
8	5			5		58	use v5.6.0;
	5					15
	5					632
9
10							our @FIELDS = qw( lowercase );
11							our $COMPRESS_AVAILABLE;
12							our @KEYS_FOR_NEW = qw( compress lowercase );
13
14							BEGIN {
15	5			5		355	eval "use Compress::Zlib;";
	5			5		6855
	5					653533
	5					1727
16	5	50				33	if( $@ ){
17	0					0	$COMPRESS_AVAILABLE = undef;
18							} else {
19	5					12616	$COMPRESS_AVAILABLE = 1;
20							}
21							}
22
23
24							sub new
25							{
26	7			7	1	151	my $class = shift;
27	7					14	my %self = @_;
28	7					26	my $self = {
29							lowercase => 1,
30							compress => 1,
31							terms => {},
32							};
33	7					18	foreach my $k ( @KEYS_FOR_NEW ){
34	14	50				38	defined( $self{$k} )
35							and ($self->{$k} = $self{$k});
36							}
37
38	7					16	bless $self, $class;
39	7					22	return $self;
40							}
41
42							sub AddContent
43							{
44	7			7	1	36	my $self = shift;
45	7					15	my ($text) = @_;
46							# clear frequency cache
47	7	50				25	$self->{freqs} and delete $self->{freqs};
48
49							# parse text fragment
50	7					24	my @terms = $self->ScanV( $text );
51
52							# update word count
53	7					11	foreach my $w (@terms){
54	26					59	$self->{terms}->{$w} ++;
55							}
56	7					12	undef $self->{WeightedEuclideanNorm};
57	7					15	undef $self->{EuclideanNorm};
58	7					19	return scalar @terms;
59							}
60
61							# number of occurrences of a given term
62							sub Occurrences
63							{
64	0			0	1	0	my $self = shift;
65	0					0	my ($term) = @_;
66
67	0					0	return $self->{terms}->{$term};
68							}
69
70							sub ScanV
71							{
72	7			7	1	10	my $self = shift;
73	7					9	my ($text) = @_;
74	7					49	my @words = split( /[^a-zA-Z0-9]+/, $text );
75	7					66	@words = grep( /.+/, @words );
76	7	50				21	if( $self->{lowercase} ){
77	7					44	return map( lc($_), @words );
78							} else {
79	0					0	return @words;
80							}
81							}
82
83							sub KeywordFrequency
84							{
85	0			0	1	0	my $self = shift;
86
87	0	0				0	return $self->{freqs} if $self->{freqs};
88
89							# all the distinct terms in the doc
90	0					0	my @terms = $self->Terms();
91							# total number of terms
92	0					0	my $sum = 0;
93	0					0	foreach my $t (@terms) { $sum += $self->{terms}->{$t}; }
	0					0
94							# if zero, frequency is not defined
95	0	0				0	($sum > 0) or return undef;
96							# list of [term,frequency] pairs
97	0					0	my @freqs = map( [$_, $self->{terms}->{$_}/$sum ] , @terms );
98							# sort by ascending frequency
99	0					0	@freqs = sort { $a->[1] <=> $b->[1] } @freqs;
	0					0
100
101							# return reference to result
102	0					0	return $self->{freqs} = \@freqs;
103							}
104
105							# all distinct term names
106							sub Terms
107							{
108	0			0	1	0	my $self = shift;
109	0					0	return keys %{$self->{terms}};
	0					0
110							}
111
112							# number of common terms divided by total number of terms
113							sub CommonTermsRatio
114							{
115	0			0	0	0	my $self = shift;
116	0					0	my ($other) = @_;
117	0					0	my @terms = $self->Terms();
118	0					0	my %terms;
119	0					0	@terms{@terms} = 1 .. @terms;
120	0					0	my @oTerms = $other->Terms();
121	0					0	my (%union);
122	0					0	@union{@terms} = 1 .. @terms;
123	0					0	@union{@oTerms} = 1 .. @oTerms;
124	0	0				0	my @intersection = map( ( $terms{$_} ? 1 : () ), @oTerms );
125	0					0	my $unionCardinality = scalar( keys %union );
126	0	0				0	($unionCardinality > 0) or return undef;
127	0					0	return scalar(@intersection) / $unionCardinality;
128							}
129
130							sub PureASCII
131							{
132	0			0	0	0	my $self = shift;
133	0					0	$self->{compress} = 1;
134							}
135
136							sub WriteToString
137							{
138	1			1	1	9	my $self = shift;
139
140	1					2	my $block = join( ',', %{$self->{terms}} );
	1					8
141	1					3	my $compressed = undef;
142	1	50	33			11	if( $COMPRESS_AVAILABLE && $self->{compress} ){
143	1					5	$block = Compress::Zlib::compress( $block );
144							# $block = compress( $block );
145	1					496	$compressed = 1;
146							}
147	1	50				14	my $header =
148							'p='
149							. __PACKAGE__
150							. ' v='
151							. $Text::Document::VERSION
152							. ' l='
153							. length( $block )
154							. ' compress='
155							. ($compressed?'1':'0')
156							. ' '
157							. join( ' ', map( "$_=$self->{$_}", @FIELDS))
158							. "\n";
159
160	1					20	my $str = $header . $block;
161
162							# add 8-char hex-encoded 4-byte checksum at the end of data
163	1					15	return $str . sprintf( '%08x', unpack( '%32C*', $str ) );
164							}
165
166							sub NewFromString
167							{
168	1			1	1	5	my ($str) = @_;
169
170	1					2	my $self = {};
171
172							# verify checksum
173							# try to be compatible with version 1.03
174	1					3	my $stored_checksum = unpack( 'N', substr( $str, -4 ));
175	1					2	my $data_payload = substr( $str, 0, -4 );
176	1					3	my $computed_checksum = unpack( '%32C*', $data_payload );
177
178	1	50				5	if( $stored_checksum != $computed_checksum ){
179	1					4	$stored_checksum = hex( substr( $str, -8 ));
180	1					2	$data_payload = substr( $str, 0, -8 );
181	1					2	$computed_checksum = unpack( '%32C*', $data_payload );
182							}
183
184	1	50				4	if( $stored_checksum != $computed_checksum ){
185	0					0	die( __PACKAGE__ . '::NewFromString : '
186							. 'checksum test failed '
187							. $stored_checksum
188							. ' != '
189							. $computed_checksum
190							);
191							}
192
193							# split data in header and block
194	1					4	my ($header,$block) = split( /\n/, $data_payload, 2 );
195
196							# parse header line
197	1					11	my %header = split( /[ =]+/, $header );
198
199							# check that the reading package is the same as the one that wrote
200	1	50				5	if( $header{p} ne __PACKAGE__ ){
201	0					0	die( __PACKAGE__ . '::NewFromString : '
202							. "file was not written by "
203							. __PACKAGE__
204							);
205							}
206
207							# version must be identical
208	1	50				7	if( $header{v} > $Text::Document::VERSION ){
209	0					0	die( __PACKAGE__ . '::NewFromString : '
210							. "Current version is $Text::Document::VERSION"
211							. " and the file version is $header{v}"
212							);
213							}
214
215							# size of block must match
216	1	50				4	if( $header{l} != length( $block ) ){
217	0					0	die( __PACKAGE__ . '::NewFromString : '
218							. "data size is "
219							. length( $block )
220							. "instead of $header{l} "
221							);
222							}
223
224							# compressed?
225	1	50	33			6	if( $header{compress} and not($COMPRESS_AVAILABLE) ){
226	0					0	die( __PACKAGE__ . '::NewFromString : '
227							. 'header indicates that data is compressed, '
228							. 'but Compress::Zlib is not available'
229							);
230							}
231
232	1	50				3	if( $header{compress} ){
233	1					4	$block = Compress::Zlib::uncompress( $block );
234							# $block = uncompress( $block );
235							}
236
237
238	1					91	@{$self}{@FIELDS} = @header{ @FIELDS };
	1					3
239
240							# retrieve terms and recurrence count
241	1					3	%{$self->{terms}} = split( /,/, $block );
	1					6
242
243	1					3	bless $self, $header{p};
244
245	1					4	return $self;
246							}
247
248							sub JaccardSimilarity
249							{
250	8			8	1	41	my $self = shift;
251	8					13	my ($e) = @_;
252
253	8					56	my @inter = map(
254							( $self->{terms}->{$_} ? $_ : () ),
255	8	100				10	keys %{$e->{terms}}
256							);
257	8					12	my %union = %{$self->{terms}};
	8					34
258	8					13	my @keyse = keys %{$e->{terms}};
	8					26
259	8					24	@union{@keyse} = @keyse;
260	8	50				25	if( (my $unionSize = scalar keys %union) > 0 ){
261	8					47	return scalar(@inter) / $unionSize;
262							} else {
263	0					0	return undef;
264							}
265							}
266
267							sub CosineSimilarity
268							{
269	3			3	1	20	my $self = shift;
270	3					5	my ($e) = @_;
271
272	3					7	my ($Dv,$Ev) = ($self->{terms}, $e->{terms});
273	3					5	my %union = %{$self->{terms}};
	3					14
274	3					5	my @keyse = keys %{$e->{terms}};
	3					11
275	3					9	@union{@keyse} = @keyse;
276	3					4	my $dotProduct = 0.0;
277	3	100				45	map( $dotProduct +=
		100
278							(defined($Dv->{$_}) ? $Dv->{$_} : 0.0)
279							* (defined($Ev->{$_}) ? $Ev->{$_} : 0.0 ),
280							keys %union
281							);
282
283	3					9	my $nD = $self->EuclideanNorm();
284	3					7	my $nE = $e->EuclideanNorm();
285
286	3	50	33			18	if( ($nD==0) \|\| ($nE==0) ){
287	0					0	return undef;
288							} else {
289	3					33	return $dotProduct / $nD / $nE;
290							}
291							}
292
293							sub EuclideanNorm
294							{
295	6			6	0	8	my $self = shift;
296	6	100				22	defined( $self->{EuclideanNorm} ) and return $self->{EuclideanNorm};
297	3					4	my $sum = 0.0;
298	3					4	map( $sum += $_*$_, values %{$self->{terms}} );
	3					11
299	3					16	return ($self->{EuclideanNorm} = sqrt( $sum ));
300							}
301
302							# this is rather rough
303							sub WeightedCosineSimilarity
304							{
305	0			0	1		my $self = shift;
306	0						my ($e,$weightFunction,$rock) = @_;
307
308	0						my ($Dv,$Ev) = ($self->{terms}, $e->{terms});
309
310							# compute union
311	0						my %union = %{$self->{terms}};
	0
312	0						my @keyse = keys %{$e->{terms}};
	0
313	0						@union{@keyse} = @keyse;
314	0						my @allkeys = keys %union;
315
316							# weighted D
317	0						my @Dw = map(( defined( $Dv->{$_} )?
318	0	0					&{$weightFunction}( $rock, $_ )*$Dv->{$_} : 0.0 ),
319							@allkeys
320							);
321
322							# weighted E
323	0						my @Ew = map(( defined( $Ev->{$_} )?
324	0	0					&{$weightFunction}( $rock, $_ )*$Ev->{$_} : 0.0 ),
325							@allkeys
326							);
327
328							# dot product of D and E
329	0						my $dotProduct = 0.0;
330	0						map( $dotProduct += $Dw[$_] * $Ew[$_] , 0..$#Dw );
331
332							# norm of D
333	0						my $nD = 0.0;
334	0						map( $nD += $Dw[$_] * $Dw[$_] , 0..$#Dw );
335	0						$nD = sqrt( $nD );
336
337							# norm of E
338	0						my $nE = 0.0;
339	0						map( $nE += $Ew[$_] * $Ew[$_] , 0..$#Ew );
340	0						$nE = sqrt( $nE );
341
342							# dot product scaled by norm
343	0	0	0				if( ($nD==0) \|\| ($nE==0) ){
344	0						return undef;
345							} else {
346	0						return $dotProduct / $nD / $nE;
347							}
348							}
349
350							1;
351
352
353							__END__