File Coverage

blib/lib/Text/NSP/Measures/2D/CHI/tscore.pm
Criterion Covered Total %
statement 20 21 95.2
branch 2 2 100.0
condition n/a
subroutine 6 7 85.7
pod n/a
total 28 30 93.3


line stmt bran cond sub pod time code
1             =head1 NAME
2              
3             Text::NSP::Measures::2D::CHI::tscore - Perl module that implements T-score
4             measure of association for bigrams.
5              
6              
7             =head1 SYNOPSIS
8              
9             =head3 Basic Usage
10              
11             use Text::NSP::Measures::2D::CHI::tscore;
12              
13             my $npp = 60; my $n1p = 20; my $np1 = 20; my $n11 = 10;
14              
15             $tscore_value = calculateStatistic( n11=>$n11,
16             n1p=>$n1p,
17             np1=>$np1,
18             npp=>$npp);
19              
20             if( ($errorCode = getErrorCode()))
21             {
22             print STDERR $errorCode." - ".getErrorMessage()."\n";
23             }
24             else
25             {
26             print getStatisticName()." value for bigram is ".$tscore_value."\n";
27             }
28              
29             =head1 DESCRIPTION
30              
31             Assume that the frequency count data associated with a bigram
32             is stored in a 2x2 contingency table:
33              
34                       word2   ~word2
35              word1    n11      n12 | n1p
36             ~word1    n21      n22 | n2p
37                       --------------
38                       np1      np2   npp
39              
40             where n11 is the number of times <word1><word2> occur together, and
41             n12 is the number of times <word1> occurs with some word other than
42             word2, and n1p is the number of times in total that word1 occurs as
43             the first word in a bigram.
44              
45             The T-score is defined as the ratio of the difference between the observed
46             and the expected count to the square root of the sample variance, which is
47             approximated by n11. Note that this is a variant of the standard t-test that
48             was proposed for use in the identification of collocations in large samples of text.
49              
50             Thus, the T-score is defined as follows:
51              
52             m11 = n1p * np1 / npp
53              
54             T-score = (n11 - m11)/sqrt(n11)
55              
56             =over
57              
58             =cut
59              
60              
61             package Text::NSP::Measures::2D::CHI::tscore;
62              
63              
64 1     1   4671 use Text::NSP::Measures::2D::CHI;
  1         3  
  1         235  
65 1     1   5 use strict;
  1         3  
  1         33  
66 1     1   6 use Carp;
  1         2  
  1         53  
67 1     1   6 use warnings;
  1         2  
  1         27  
68 1     1   5 no warnings 'redefine';
  1         3  
  1         1535  
69             require Exporter;
70              
71             our ($VERSION, @EXPORT, @ISA);  # package globals consumed by Exporter / version checks
72              
73             @ISA = qw(Exporter);  # inherit Exporter's import() so the @EXPORT names reach callers
74              
75             @EXPORT = qw(initializeStatistic calculateStatistic
76             getErrorCode getErrorMessage getStatisticName);  # only calculateStatistic and getStatisticName are defined in this file; the others presumably come from Text::NSP::Measures::2D::CHI -- verify
77              
78             $VERSION = '0.97';
79              
80              
81             =item calculateStatistic() - method to calculate the tscore Coefficient
82              
83             INPUT PARAMS : %values .. Hash (key => value pairs) containing
84             the count values (n11, n1p, np1, npp)
85             computed by the count.pl program.
86              
87             RETURN VALUES : $tscore .. tscore value for this bigram; nothing (undef) if the counts are rejected.
88              
89             =cut
90              
91             sub calculateStatistic
92             {
93 28     28   4624 my %values = @_;  # counts arrive as a flat key => value list (n11, n1p, np1, npp)
94              
95             # getValues() checks the counts; a false result means failure, so we
96             # return nothing. On success it presumably sets $n11 and $m11, which are
97             # not declared in this file (likely exported by ...::2D::CHI) -- verify.
98 28 100       83 if( !(Text::NSP::Measures::2D::CHI::getValues(\%values)) ) {
99 10         25 return;
100             }
101             # Now calculate the tscore: (n11 - m11) / sqrt(n11)
102              
103 18         79 my $tscore = (($n11-$m11)/($n11**0.5));  # NOTE(review): n11 == 0 divides by zero unless getValues() rejects it -- confirm
104              
105 18         58 return ( $tscore );
106             }
107              
108              
109              
110             =item getStatisticName() - Returns the name of this statistic
111              
112             INPUT PARAMS : none
113              
114             RETURN VALUES : $name .. Name of the measure.
115              
116             =cut
117              
118             sub getStatisticName
119             {
120 0     0     return "T-score";  # fixed display name of this measure; any arguments are ignored
121             }
122              
123              
124              
125             1;
126             __END__