| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# Copyright 2009 Francesco Nidito. All rights reserved. |
|
2
|
|
|
|
|
|
|
# |
|
3
|
|
|
|
|
|
|
# This library is free software; you can redistribute it and/or |
|
4
|
|
|
|
|
|
|
# modify it under the same terms as Perl itself. |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package Algorithm::WordLevelStatistics; |
|
7
|
|
|
|
|
|
|
|
|
8
|
1
|
|
|
1
|
|
6306
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
38
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
1
|
|
|
1
|
|
6
|
use vars qw($VERSION); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
497
|
|
|
11
|
|
|
|
|
|
|
$VERSION = '0.03'; |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
sub new { |
|
14
|
1
|
|
|
1
|
1
|
79
|
my $class = shift; |
|
15
|
1
|
|
|
|
|
6
|
return bless { |
|
16
|
|
|
|
|
|
|
version => $VERSION, |
|
17
|
|
|
|
|
|
|
}, $class; |
|
18
|
|
|
|
|
|
|
} |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# computes the statistical level of a single word (given its spectrum and the total number of words in the text) |
|
21
|
|
|
|
|
|
|
sub compute_spectrum { |
|
22
|
2907
|
|
|
2907
|
1
|
3735
|
my ($self, $N, $s) = @_; |
|
23
|
|
|
|
|
|
|
|
|
24
|
2907
|
|
|
|
|
2653
|
my $n = @{$s}; |
|
|
2907
|
|
|
|
|
3894
|
|
|
25
|
|
|
|
|
|
|
|
|
26
|
2907
|
|
|
|
|
8157
|
my $ls = { count => $n, C => 0, sigma_nor => 0 }; |
|
27
|
2907
|
100
|
|
|
|
5396
|
if( $n > 3 ) { |
|
28
|
|
|
|
|
|
|
# position -> distance from preceding element in text |
|
29
|
973
|
|
|
|
|
1153
|
my @tmp = (); |
|
30
|
973
|
|
|
|
|
1951
|
for( my $i = 0; $i < ($n-1); ++$i ){ push @tmp, ($s->[$i+1] - $s->[$i]); } |
|
|
28079
|
|
|
|
|
64612
|
|
|
31
|
|
|
|
|
|
|
|
|
32
|
973
|
|
|
|
|
1976
|
my ($avg, $sigma) = $self->_mean_and_variance( \@tmp ); |
|
33
|
973
|
|
|
|
|
1273
|
$sigma = sqrt($sigma)/$avg; |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# normalize sigma using an hypothetical uniform distribution |
|
36
|
973
|
|
|
|
|
1075
|
my $p = $n/$N; |
|
37
|
973
|
|
|
|
|
1639
|
$ls->{sigma_nor} = $sigma/sqrt(1.0-$p); |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# this is not simple: |
|
40
|
973
|
|
|
|
|
4267
|
$ls->{C} = ($ls->{sigma_nor} - (2.0*$n-1.0)/(2.0*$n+2.0)) * ( sqrt($n)*(1.0+2.8*$n**-0.865) ); |
|
41
|
|
|
|
|
|
|
} |
|
42
|
|
|
|
|
|
|
|
|
43
|
2907
|
|
|
|
|
7374
|
return $ls; |
|
44
|
|
|
|
|
|
|
} |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# computes the statistical level of a group of words (given their spectra) |
|
47
|
|
|
|
|
|
|
sub compute_spectra { |
|
48
|
1
|
|
|
1
|
1
|
153570
|
my ($self, $s) = @_; |
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# count the total number of "words" in text |
|
51
|
1
|
|
|
|
|
3
|
my $N = 0; |
|
52
|
1
|
|
|
|
|
2
|
foreach my $i (keys(%{$s})){ $N += @{ $s->{$i} }; } |
|
|
1
|
|
|
|
|
849
|
|
|
|
2907
|
|
|
|
|
2571
|
|
|
|
2907
|
|
|
|
|
4579
|
|
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# computes the level statistic for all terms |
|
55
|
1
|
|
|
|
|
161
|
my %r = (); |
|
56
|
1
|
|
|
|
|
3
|
foreach my $i (keys(%{$s})){ |
|
|
1
|
|
|
|
|
313
|
|
|
57
|
2907
|
|
|
|
|
6820
|
$r{$i} = $self->compute_spectrum( $N, $s->{$i} ); |
|
58
|
|
|
|
|
|
|
} |
|
59
|
|
|
|
|
|
|
|
|
60
|
1
|
|
|
|
|
421
|
return \%r; |
|
61
|
|
|
|
|
|
|
} |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
# fast, on-line algorithm to compute mean and variance: |
|
64
|
|
|
|
|
|
|
# http://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#On-line_algorithm |
|
65
|
|
|
|
|
|
|
sub _mean_and_variance { |
|
66
|
973
|
|
|
973
|
|
1049
|
my ($self, $v) = @_; |
|
67
|
973
|
|
|
|
|
1232
|
my ($n, $mean, $M2) = (0, 0, 0); |
|
68
|
|
|
|
|
|
|
|
|
69
|
973
|
|
|
|
|
812
|
foreach my $x (@{$v}) { |
|
|
973
|
|
|
|
|
1520
|
|
|
70
|
28079
|
|
|
|
|
33327
|
$n++; |
|
71
|
28079
|
|
|
|
|
31278
|
my $delta = $x - $mean; |
|
72
|
28079
|
|
|
|
|
31202
|
$mean += $delta/$n; |
|
73
|
28079
|
|
|
|
|
37409
|
$M2 += $delta*($x - $mean); |
|
74
|
|
|
|
|
|
|
} |
|
75
|
|
|
|
|
|
|
|
|
76
|
973
|
|
|
|
|
1517
|
my $variance = $M2/$n; |
|
77
|
|
|
|
|
|
|
|
|
78
|
973
|
|
|
|
|
1823
|
return ($mean, $variance); |
|
79
|
|
|
|
|
|
|
} |
|
80
|
|
|
|
|
|
|
1; |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
__END__ |