|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Treex::Tool::Parser::MSTperl::ModelLabelling;  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 {  | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $Treex::Tool::Parser::MSTperl::ModelLabelling::VERSION = '0.11949';  | 
| 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
6
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
4198
 | 
 use Moose;  | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 use Carp;  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 extends 'Treex::Tool::Parser::MSTperl::ModelBase';  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # basic MLE from data  | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   unigrams->{label} = prob  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # to be used for smoothing and/or backoff  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # (can be used both for emissions and transitions)  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # It also contains the SEQUENCE_BOUNDARY_LABEL prob  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # (the SEQUENCE_BOUNDARY_LABEL is counted once for each sequence)  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # which might be unappropriate in some cases (eg. for emission probs)  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'unigrams' => (  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     is      => 'rw',  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     isa     => 'HashRef',  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     default => sub { {} },  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # transition scores for Viterbi with the structure (if MIRA-computed):  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   transitions->{feature}->{label_prev}->{label_this} = score  | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # or probabilties (if obtained by MLE):  | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   transitions->{label_prev}->{label_this} = prob  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # (if MLE is used for transitions, during the precomputing phase  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # counts are temporarily stored instead of probs  | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # and they are converted to probs on calling prepare_for_mira() );  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'transitions' => (  | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     is      => 'rw',  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     isa     => 'HashRef',  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     default => sub { {} },  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # smoothing parameters of transition probabilities  | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # (to be computed by EM algorithm)  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # PROB(label|prev_label) =  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #    smooth_bigrams  * transitions->{prev_label}->{label} +  | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #    smooth_unigrams * unigrams->{label} +  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #    smooth_uniform  | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'smooth_bigrams' => (  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     is      => 'rw',  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     isa     => 'Num',  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     default => 0.6,  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'smooth_unigrams' => (  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     is      => 'rw',  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     isa     => 'Num',  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     default => 0.3,  | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'smooth_uniform' => (  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     is      => 'rw',  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     isa     => 'Num',  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     default => 0.1,  | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # = 1 / ( keys %{ $self->unigrams } )  | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # set in compute_smoothing_params  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'uniform_prob' => (  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     is      => 'rw',  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     isa     => 'Num',  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     default => 0.02,  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # emission scores for Viterbi with the structure  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #   emissions->{feature}->{label} = score  | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'emissions' => (  | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     is      => 'rw',  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     isa     => 'HashRef',  | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     default => sub { {} },  | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # just an array ref with the sentences that represent the heldout data  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # to be able to run the EM algorithm in prepare_for_mira()  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 has 'EM_heldout_data' => (  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     is      => 'rw',  | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     isa     => 'ArrayRef[Treex::Tool::Parser::MSTperl::Sentence]',  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     default => sub { [] },  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 );  | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
86
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub BUILD {  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ($self) = @_;  | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->featuresControl( $self->config->labelledFeaturesControl );  | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return;  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # STORING AND LOADING  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_data_to_store {  | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ($self) = @_;  | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return {  | 
| 
100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         'unigrams'        => $self->unigrams,  | 
| 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         'transitions'     => $self->transitions,  | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         'emissions'       => $self->emissions,  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         'smooth_uniform'  => $self->smooth_uniform,  | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         'smooth_unigrams' => $self->smooth_unigrams,  | 
| 
105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         'smooth_bigrams'  => $self->smooth_bigrams,  | 
| 
106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         'uniform_prob'    => $self->uniform_prob,  | 
| 
107
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     };  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub load_data {  | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $data ) = @_;  | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->unigrams( $data->{'unigrams'} );  | 
| 
115
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->transitions( $data->{'transitions'} );  | 
| 
116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->emissions( $data->{'emissions'} );  | 
| 
117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->smooth_uniform( $data->{'smooth_uniform'} );  | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->smooth_unigrams( $data->{'smooth_unigrams'} );  | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->smooth_bigrams( $data->{'smooth_bigrams'} );  | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->uniform_prob( $data->{'uniform_prob'} );  | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $unigrams_ok    = scalar( keys %{ $self->unigrams } );  | 
| 
124
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $transitions_ok = scalar( keys %{ $self->transitions } );  | 
| 
125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $emissions_ok   = scalar( keys %{ $self->emissions } );  | 
| 
126
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
127
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $smooth_sum = $self->smooth_uniform + $self->smooth_unigrams  | 
| 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         + $self->smooth_bigrams;  | 
| 
129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $smooth_ok = (  | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # should be 1 but might be a little shifted  | 
| 
133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $smooth_sum > 0.999  | 
| 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             && $smooth_sum < 1.001  | 
| 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # must be between 0 and 1  | 
| 
137
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             && $self->uniform_prob > 0  | 
| 
138
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             && $self->uniform_prob < 1  | 
| 
139
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     );  | 
| 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $ALGORITHM = $self->config->labeller_algorithm;  | 
| 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
143
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($ALGORITHM == 0  | 
| 
144
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 1  | 
| 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 2  | 
| 
146
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 3  | 
| 
147
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 4  | 
| 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 8  | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 9  | 
| 
150
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 10  | 
| 
151
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 11  | 
| 
152
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 14  | 
| 
153
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         )  | 
| 
154
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     {  | 
| 
155
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
156
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # these algorithms do not use lambda smoothing  | 
| 
157
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # (smoothing is kind of part of the learning)  | 
| 
158
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $smooth_ok = 1;  | 
| 
159
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
160
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
161
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $ALGORITHM >= 20 ) {  | 
| 
162
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
163
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # these algorithms do not use separate transitions  | 
| 
164
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # (transitions are included in emissions)  | 
| 
165
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $transitions_ok = 1;  | 
| 
166
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
167
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
168
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $unigrams_ok && $transitions_ok && $emissions_ok && $smooth_ok ) {  | 
| 
169
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         return 1;  | 
| 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
171
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         return 0;  | 
| 
172
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
174
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
175
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # TRANSITION AND EMISSION COUNTS AND PROBABILITIES  | 
| 
176
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # (more or less standard MLE)  | 
| 
177
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub add_unigram {  | 
| 
179
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $label ) = @_;  | 
| 
180
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
181
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $self->config->DEBUG >= 2 ) {  | 
| 
182
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         print "add_unigram($label)\n";  | 
| 
183
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
184
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
185
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # increment number of unigrams  | 
| 
186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->unigrams->{$label} += 1;  | 
| 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
188
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return;  | 
| 
189
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
190
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
191
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub add_transition {  | 
| 
192
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
193
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # Str, Str, Maybe[Str]  | 
| 
194
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $label_this, $label_prev, $feature ) = @_;  | 
| 
195
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
196
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( defined $feature ) {  | 
| 
197
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ( $self->config->DEBUG >= 2 ) {  | 
| 
198
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             print "add_transition($label_this, $label_prev, $feature)\n";  | 
| 
199
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
200
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
201
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # increment number of bigrams  | 
| 
202
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->transitions->{$feature}->{$label_prev}->{$label_this} += 1;  | 
| 
203
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ( $self->config->DEBUG >= 2 ) {  | 
| 
205
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             print "add_transition($label_this, $label_prev)\n";  | 
| 
206
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
207
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
208
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # increment number of bigrams  | 
| 
209
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->transitions->{$label_prev}->{$label_this} += 1;  | 
| 
210
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
211
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
212
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return;  | 
| 
213
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
214
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
215
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub add_emission {  | 
| 
216
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $feature, $label ) = @_;  | 
| 
217
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
218
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $self->config->DEBUG >= 3 ) {  | 
| 
219
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         print "add_emission($feature, $label)\n";  | 
| 
220
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
221
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
222
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->emissions->{$feature}->{$label} += 1;  | 
| 
223
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
224
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return;  | 
| 
225
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
226
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
227
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # called after preprocessing training data, before entering the MIRA phase  | 
| 
228
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub prepare_for_mira {  | 
| 
229
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
230
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $trainer ) = @_;  | 
| 
231
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # $trainer used only in algoprithm no. 9 for emissions initialization  | 
| 
233
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
234
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $ALGORITHM = $self->config->labeller_algorithm;  | 
| 
235
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
236
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $ALGORITHM == 9 ) {  | 
| 
237
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
238
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # no need to recompute to probabilities (counts are OK)  | 
| 
239
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # but have to update emissions_summed  | 
| 
240
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # and transitions_summed appropriately  | 
| 
241
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $sumUpdateWeight = $trainer->number_of_inner_iterations;  | 
| 
243
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # emissions->{feature}->{label}  | 
| 
245
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $feature ( keys %{ $self->emissions } ) {  | 
| 
246
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             foreach my $label ( keys %{ $self->emissions->{$feature} } ) {  | 
| 
247
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $trainer->emissions_summed->{$feature}->{$label}  | 
| 
248
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     = $sumUpdateWeight * $self->emissions->{$feature}->{$label};  | 
| 
249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
250
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
251
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
252
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # transitions->{feature}->{label_prev}->{label_this}  | 
| 
253
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $feature ( keys %{ $self->transitions } ) {  | 
| 
254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             foreach my $label_prev (  | 
| 
255
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 keys %{ $self->transitions->{$feature} }  | 
| 
256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 )  | 
| 
257
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             {  | 
| 
258
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 foreach my $label_this (  | 
| 
259
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     keys %{ $self->transitions->{$feature}->{$label_prev} }  | 
| 
260
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     )  | 
| 
261
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 {  | 
| 
262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     $trainer->transitions_summed  | 
| 
263
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         ->{$feature}->{$label_prev}->{$label_this}  | 
| 
264
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         = $sumUpdateWeight * $self->transitions  | 
| 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         ->{$feature}->{$label_prev}->{$label_this};  | 
| 
266
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
267
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
268
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
269
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
270
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif ( $ALGORITHM == 1 || $ALGORITHM == 8 || $ALGORITHM >= 20 ) {  | 
| 
271
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
272
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # no recomputing taking place  | 
| 
273
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
274
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif (  | 
| 
275
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $ALGORITHM == 0  | 
| 
276
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 2  | 
| 
277
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 3  | 
| 
278
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 4  | 
| 
279
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 5  | 
| 
280
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 10  | 
| 
281
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 11  | 
| 
282
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 12  | 
| 
283
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 13  | 
| 
284
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 14  | 
| 
285
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 15  | 
| 
286
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 16  | 
| 
287
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 17  | 
| 
288
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 18  | 
| 
289
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 19  | 
| 
290
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         )  | 
| 
291
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     {  | 
| 
292
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
293
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # compute unigram probs  | 
| 
294
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->compute_probs_from_counts( $self->unigrams );  | 
| 
295
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
296
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # compute transition probs  | 
| 
297
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $label ( keys %{ $self->transitions } ) {  | 
| 
298
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $self->compute_probs_from_counts( $self->transitions->{$label} );  | 
| 
299
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
300
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
301
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ($ALGORITHM == 4  | 
| 
302
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             || $ALGORITHM == 5  | 
| 
303
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             )  | 
| 
304
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         {  | 
| 
305
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
306
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # compute emission probs (MLE)  | 
| 
307
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             foreach my $feature ( keys %{ $self->emissions } ) {  | 
| 
308
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $self->compute_probs_from_counts(  | 
| 
309
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     $self->emissions->{$feature}  | 
| 
310
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 );  | 
| 
311
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
312
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }    # end if $ALGORITHM == 4|5  | 
| 
313
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
314
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ($ALGORITHM == 5  | 
| 
315
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             || $ALGORITHM == 12  | 
| 
316
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             || $ALGORITHM == 13  | 
| 
317
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             || $ALGORITHM == 15  | 
| 
318
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             || $ALGORITHM == 16  | 
| 
319
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             || $ALGORITHM == 17  | 
| 
320
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             || $ALGORITHM == 18  | 
| 
321
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             || $ALGORITHM == 19  | 
| 
322
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             )  | 
| 
323
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         {  | 
| 
324
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
325
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # run the EM algorithm to compute  | 
| 
326
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # transtition probs smoothing params  | 
| 
327
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $self->compute_smoothing_params();  | 
| 
328
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }    # end if $ALGORITHM == 5|12|12|>=16  | 
| 
329
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
330
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {    # $ALGORITHM not in 0~9  | 
| 
331
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         croak "ModelLabelling->prepare_for_mira not implemented"  | 
| 
332
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             . " for algorithm no. $ALGORITHM!";  | 
| 
333
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
334
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
335
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return;  | 
| 
336
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }    # end prepare_for_mira  | 
| 
337
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
338
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # basic MLE  | 
| 
339
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub compute_probs_from_counts {  | 
| 
340
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $hashref ) = @_;  | 
| 
341
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
342
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $sum = 0;  | 
| 
343
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     foreach my $key ( keys %{$hashref} ) {  | 
| 
344
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $sum += $hashref->{$key};  | 
| 
345
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
346
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     foreach my $key ( keys %{$hashref} ) {  | 
| 
347
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $hashref->{$key} = $hashref->{$key} / $sum;  | 
| 
348
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
349
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
350
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return;  | 
| 
351
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
352
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
353
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # EM algorithm to estimate linear interpolation smoothing parameters  | 
| 
354
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # for smoothing of transition probabilities  | 
| 
355
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub compute_smoothing_params {  | 
| 
356
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ($self) = @_;  | 
| 
357
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
358
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # only progress and/or debug info  | 
| 
359
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $self->config->DEBUG >= 1 ) {  | 
| 
360
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         print "Running EM algorithm to estimate lambdas...\n";  | 
| 
361
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
362
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
363
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # uniform probability is 1 / number of different labels  | 
| 
364
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->uniform_prob( 1 / ( keys %{ $self->unigrams } ) );  | 
| 
365
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
366
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $change = 1;  | 
| 
367
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     while ( $change > $self->config->EM_EPSILON ) {  | 
| 
368
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
369
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         #compute "expected counts"  | 
| 
370
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $expectedCounts    = $self->count_expected_counts_all();  | 
| 
371
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $expectedCountsSum = $expectedCounts->[0] + $expectedCounts->[1]  | 
| 
372
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             + $expectedCounts->[2];  | 
| 
373
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
374
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         #compute new lambdas  | 
| 
375
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my @new_lambdas = map { $_ / $expectedCountsSum } @$expectedCounts;  | 
| 
376
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
377
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         #compute the change (sum of changes of lambdas)  | 
| 
378
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $change = abs( $self->smooth_uniform - $new_lambdas[0] )  | 
| 
379
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             + abs( $self->smooth_unigrams - $new_lambdas[1] )  | 
| 
380
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             + abs( $self->smooth_bigrams - $new_lambdas[2] );  | 
| 
381
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
382
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # set new lambdas  | 
| 
383
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->smooth_uniform( $new_lambdas[0] );  | 
| 
384
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->smooth_unigrams( $new_lambdas[1] );  | 
| 
385
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->smooth_bigrams( $new_lambdas[2] );  | 
| 
386
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
387
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # only progress and/or debug info  | 
| 
388
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ( $self->config->DEBUG >= 2 ) {  | 
| 
389
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             print "Last change: $change\n";  | 
| 
390
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
391
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
392
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
393
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # only progress and/or debug info  | 
| 
394
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $self->config->DEBUG >= 2 ) {  | 
| 
395
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         print "Final lambdas:\n"  | 
| 
396
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             . "uniform: " . $self->smooth_uniform  | 
| 
397
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             . "unigram: " . $self->smooth_unigrams  | 
| 
398
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             . "bigram: " . $self->smooth_bigrams;  | 
| 
399
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
400
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $self->config->DEBUG >= 1 ) {  | 
| 
401
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         print "Done.\n";  | 
| 
402
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
403
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
404
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return;  | 
| 
405
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
406
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
407
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #count "expected counts" of lambdas  | 
| 
408
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub count_expected_counts_all {  | 
| 
409
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ($self) = @_;  | 
| 
410
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
411
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $expectedCounts = [ 0, 0, 0 ];  | 
| 
412
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $sentence_counts;  | 
| 
413
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
414
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     foreach my $sentence ( @{ $self->EM_heldout_data } ) {  | 
| 
415
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $sentence_counts = $self->count_expected_counts_tree(  | 
| 
416
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $sentence->nodes_with_root->[0]  | 
| 
417
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         );  | 
| 
418
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $expectedCounts->[0] += $sentence_counts->[0];  | 
| 
419
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $expectedCounts->[1] += $sentence_counts->[1];  | 
| 
420
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $expectedCounts->[2] += $sentence_counts->[2];  | 
| 
421
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
422
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
423
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $expectedCounts;  | 
| 
424
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
425
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
426
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #count "expected counts" of lambdas for a parse (sub)tree, recursively  | 
| 
427
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub count_expected_counts_tree {  | 
| 
428
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $root_node ) = @_;  | 
| 
429
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
430
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my @edges = @{ $root_node->children };  | 
| 
431
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
432
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # get sequence of labels  | 
| 
433
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my @labels = map { $_->child->label } @edges;  | 
| 
434
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
435
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # counts for this sequence  | 
| 
436
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $expectedCounts = $self->count_expected_counts_sequence( \@labels );  | 
| 
437
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
438
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # recursion  | 
| 
439
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $subtree_counts;  | 
| 
440
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     foreach my $edge (@edges) {  | 
| 
441
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $subtree_counts = $self->count_expected_counts_tree( $edge->child );  | 
| 
442
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $expectedCounts->[0] += $subtree_counts->[0];  | 
| 
443
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $expectedCounts->[1] += $subtree_counts->[1];  | 
| 
444
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $expectedCounts->[2] += $subtree_counts->[2];  | 
| 
445
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
446
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
447
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $expectedCounts;  | 
| 
448
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
449
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
450
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # count "expected counts" of lambdas for a sequence of labels  | 
| 
451
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # (including the boundaries)  | 
| 
452
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub count_expected_counts_sequence {  | 
| 
453
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
454
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $labels_sequence ) = @_;  | 
| 
455
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
456
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # to be computed here  | 
| 
457
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $expectedCounts = [ 0, 0, 0 ];  | 
| 
458
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
459
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # boundary at the beginning  | 
| 
460
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $label_prev = $self->config->SEQUENCE_BOUNDARY_LABEL;  | 
| 
461
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
462
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # boundary at the end  | 
| 
463
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     push @$labels_sequence, $self->config->SEQUENCE_BOUNDARY_LABEL;  | 
| 
464
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
465
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     foreach my $label_this (@$labels_sequence) {  | 
| 
466
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
467
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # get probs  | 
| 
468
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $ngramProbs =  | 
| 
469
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $self->get_transition_probs_array( $label_this, $label_prev );  | 
| 
470
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $finalProb = $ngramProbs->[0] * $self->smooth_uniform  | 
| 
471
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             + $ngramProbs->[1] * $self->smooth_unigrams  | 
| 
472
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             + $ngramProbs->[2] * $self->smooth_bigrams;  | 
| 
473
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
474
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # update expected counts  | 
| 
475
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $expectedCounts->[0] +=  | 
| 
476
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $self->smooth_uniform * $ngramProbs->[0] / $finalProb;  | 
| 
477
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $expectedCounts->[1] +=  | 
| 
478
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $self->smooth_unigrams * $ngramProbs->[1] / $finalProb;  | 
| 
479
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $expectedCounts->[2] +=  | 
| 
480
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $self->smooth_bigrams * $ngramProbs->[2] / $finalProb;  | 
| 
481
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
482
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $label_prev = $label_this;  | 
| 
483
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
484
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
485
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $expectedCounts;  | 
| 
486
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
487
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
488
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_all_labels {  | 
| 
489
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ($self) = @_;  | 
| 
490
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
491
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my @labels = keys %{ $self->unigrams };  | 
| 
492
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
493
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return \@labels;  | 
| 
494
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
495
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
496
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # ACCESS TO SCORES  | 
| 
497
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
498
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_label_score {  | 
| 
499
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
500
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # (Str $label, Str $label_prev, ArrayRef[Str] $features)  | 
| 
501
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $label, $label_prev, $features ) = @_;  | 
| 
502
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
503
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $ALGORITHM = $self->config->labeller_algorithm;  | 
| 
504
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
505
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $ALGORITHM == 8 || $ALGORITHM == 9 ) {  | 
| 
506
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
507
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $result = 0;  | 
| 
508
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
509
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # foreach present feature  | 
| 
510
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $feature (@$features) {  | 
| 
511
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
512
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # add "emission score" and "transition score"  | 
| 
513
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result +=  | 
| 
514
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $self->get_emission_score( $label, $feature )  | 
| 
515
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 +  | 
| 
516
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $self->get_transition_score(  | 
| 
517
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $label, $label_prev, $feature  | 
| 
518
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 )  | 
| 
519
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 ;  | 
| 
520
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }    # end foreach $feature  | 
| 
521
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
522
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         return $result;  | 
| 
523
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
524
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif ( $ALGORITHM == 14 || $ALGORITHM == 15 ) {  | 
| 
525
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
526
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $label_scores = $self->get_emission_scores($features);  | 
| 
527
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
528
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $result = $label_scores->{$label};  | 
| 
529
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ( !defined $result ) {  | 
| 
530
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result = 0;  | 
| 
531
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
532
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
533
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # multiply by transitions score  | 
| 
534
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $result *= $self->get_transition_score( $label, $label_prev );  | 
| 
535
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
536
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         return $result;  | 
| 
537
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
538
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif ( $ALGORITHM == 16 || $ALGORITHM == 18 ) {  | 
| 
539
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
540
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $result = 0;  | 
| 
541
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
542
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # sum of emission scores  | 
| 
543
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $feature (@$features) {  | 
| 
544
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result += $self->get_emission_score( $label, $feature );  | 
| 
545
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
546
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
547
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # multiply by transitions score  | 
| 
548
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $result *= $self->get_transition_score( $label, $label_prev );  | 
| 
549
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
550
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         return $result;  | 
| 
551
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
552
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif ( $ALGORITHM == 19 ) {  | 
| 
553
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
554
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $result = 0;  | 
| 
555
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
556
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # sum of emission scores  | 
| 
557
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $feature (@$features) {  | 
| 
558
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result += $self->get_emission_score( $label, $feature );  | 
| 
559
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
560
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
561
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # sigmoid transformation  | 
| 
562
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $result = 1 / ( 1 + exp( -$result * $self->config->SIGM_LAMBDA ) );  | 
| 
563
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
564
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # multiply by transitions score  | 
| 
565
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $result *= $self->get_transition_score( $label, $label_prev );  | 
| 
566
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
567
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         return $result;  | 
| 
568
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
569
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif ( $ALGORITHM == 17 ) {  | 
| 
570
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
571
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $result = 0;  | 
| 
572
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
573
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # sum of emission scores  | 
| 
574
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $feature (@$features) {  | 
| 
575
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result += $self->get_emission_score( $label, $feature );  | 
| 
576
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
577
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
578
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # multiply by transitions score  | 
| 
579
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ( $result > 0 ) {  | 
| 
580
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result *= $self->get_transition_score( $label, $label_prev );  | 
| 
581
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } else {  | 
| 
582
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
583
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # For negative scores this works the other way round,  | 
| 
584
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # eg. if I had two labels, both with emission score -5  | 
| 
585
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # and their transition probs were 0.2 and 0.9,  | 
| 
586
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # then the latter should get a higher score;  | 
| 
587
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # simple mltiplication won't help as that would yield scores  | 
| 
588
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # of -1.0 and -4.5, thus inverting the order.  | 
| 
589
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # What I do is that for transition prob p I use (1-p)  | 
| 
590
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # which yields 0.8 and 0.1 transition probabilities here,  | 
| 
591
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # giving scores of -4.0 and -0.5, which is much better.  | 
| 
592
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # Still, a label with negative emission score, even if very close  | 
| 
593
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # to 0 and with a high transition prob, cannot outscore any label  | 
| 
594
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # with a positive emission score, even if low with a low transition  | 
| 
595
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # prob - normalizing scores to be non-negative would be necessary  | 
| 
596
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # for this, as is alg 0 and similar.  | 
| 
597
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             #             $result *=  | 
| 
598
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             #                 ( 1 - $self->get_transition_score( $label, $label_prev ) );  | 
| 
599
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
600
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # TODO trying new variant - setting negative scores to 0  | 
| 
601
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result = 0;  | 
| 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
603
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
604
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         return $result;  | 
| 
605
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
606
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif ( $ALGORITHM >= 20 ) {  | 
| 
607
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
608
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $result = 0;  | 
| 
609
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
610
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # sum of emission scores  | 
| 
611
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $feature (@$features) {  | 
| 
612
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result += $self->get_emission_score( $label, $feature );  | 
| 
613
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
614
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
615
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # TODO: could also compute using $label_prev,  | 
| 
616
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # using transitions to store these;  | 
| 
617
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # would allow to use full Viterbi  | 
| 
618
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
619
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         return $result;  | 
| 
620
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
621
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
622
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         croak "ModelLabelling->get_label_score not implemented"  | 
| 
623
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             . " for algorithm no. $ALGORITHM!";  | 
| 
624
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
625
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # usually because it needs to know scores of all possible labels  | 
| 
626
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # to normalize them properly  | 
| 
627
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
628
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
629
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
630
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_emission_score {  | 
| 
631
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
632
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # (Str $label, Str $feature)  | 
| 
633
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $label, $feature ) = @_;  | 
| 
634
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
635
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $ALGORITHM = $self->config->labeller_algorithm;  | 
| 
636
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
637
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($ALGORITHM == 8  | 
| 
638
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 9  | 
| 
639
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 16  | 
| 
640
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 17  | 
| 
641
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 18  | 
| 
642
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 19  | 
| 
643
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM >= 20  | 
| 
644
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         )  | 
| 
645
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     {  | 
| 
646
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
647
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ($self->emissions->{$feature}  | 
| 
648
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             && $self->emissions->{$feature}->{$label}  | 
| 
649
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             )  | 
| 
650
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         {  | 
| 
651
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             return $self->emissions->{$feature}->{$label};  | 
| 
652
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } else {  | 
| 
653
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             return 0;  | 
| 
654
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
655
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
656
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
657
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         croak "ModelLabelling->get_emission_score not implemented"  | 
| 
658
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             . " for algorithm no. $ALGORITHM!";  | 
| 
659
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
660
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
661
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
662
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_transition_score {  | 
| 
663
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
664
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # (Str $label_this, Str $label_prev, Maybe[Str] $feature)  | 
| 
665
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $label_this, $label_prev, $feature ) = @_;  | 
| 
666
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
667
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $ALGORITHM = $self->config->labeller_algorithm;  | 
| 
668
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
669
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $ALGORITHM == 8 || $ALGORITHM == 9 ) {  | 
| 
670
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ($self->transitions->{$feature}  | 
| 
671
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             && $self->transitions->{$feature}->{$label_prev}  | 
| 
672
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             && $self->transitions->{$feature}->{$label_prev}->{$label_this}  | 
| 
673
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             )  | 
| 
674
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         {  | 
| 
675
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             return $self->transitions->{$feature}->{$label_prev}->{$label_this};  | 
| 
676
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } else {  | 
| 
677
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
678
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # no smoothing as it is used in addition, not in multiplication  | 
| 
679
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             return 0;  | 
| 
680
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
681
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif (  | 
| 
682
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $ALGORITHM == 5  | 
| 
683
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 12 || $ALGORITHM == 13  | 
| 
684
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 15  | 
| 
685
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 16 || $ALGORITHM == 17  | 
| 
686
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 18 || $ALGORITHM == 19  | 
| 
687
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         )  | 
| 
688
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     {  | 
| 
689
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
690
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # smoothing by linear combination  | 
| 
691
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # PROB(label|prev_label) =  | 
| 
692
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         #    smooth_bigrams  * transitions->{prev_label}->{label} +  | 
| 
693
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         #    smooth_unigrams * unigrams->{label} +  | 
| 
694
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         #    smooth_uniform  | 
| 
695
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
696
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $probs =  | 
| 
697
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $self->get_transition_probs_array( $label_this, $label_prev );  | 
| 
698
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
699
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $result = $probs->[0] * $self->smooth_uniform  | 
| 
700
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             + $probs->[1] * $self->smooth_unigrams  | 
| 
701
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             + $probs->[2] * $self->smooth_bigrams;  | 
| 
702
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
703
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         return $result;  | 
| 
704
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
705
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif (  | 
| 
706
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $ALGORITHM == 0  | 
| 
707
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 1  | 
| 
708
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 2  | 
| 
709
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 3  | 
| 
710
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 4  | 
| 
711
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 10  | 
| 
712
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 11  | 
| 
713
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 14  | 
| 
714
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         )  | 
| 
715
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     {  | 
| 
716
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
717
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # no real smoothing  | 
| 
718
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ($self->transitions->{$label_prev}  | 
| 
719
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             && $self->transitions->{$label_prev}->{$label_this}  | 
| 
720
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             )  | 
| 
721
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         {  | 
| 
722
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             return $self->transitions->{$label_prev}->{$label_this};  | 
| 
723
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } else {  | 
| 
724
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             return 0.00001;  | 
| 
725
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
726
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
727
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         croak "ModelLabelling->get_transition_score not implemented"  | 
| 
728
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             . " for algorithm no. $ALGORITHM!";  | 
| 
729
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
730
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }    # end get_transition_score  | 
| 
731
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
732
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $result->[0] = uniform prob  | 
| 
733
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $result->[1] = unigram prob  | 
| 
734
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $result->[2] = bigram prob  | 
| 
735
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_transition_probs_array {  | 
| 
736
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
737
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # (Str $label_this, Str $label_prev)  | 
| 
738
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $label_this, $label_prev ) = @_;  | 
| 
739
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
740
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $result = [ 0, 0, 0 ];  | 
| 
741
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
742
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # uniform  | 
| 
743
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $result->[0] = $self->uniform_prob;  | 
| 
744
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
745
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $self->unigrams->{$label_this} ) {  | 
| 
746
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
747
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # unigram  | 
| 
748
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $result->[1] = $self->unigrams->{$label_this};  | 
| 
749
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
750
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ( $self->transitions->{$label_prev}->{$label_this} ) {  | 
| 
751
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
752
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # bigram  | 
| 
753
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result->[2] = $self->transitions->{$label_prev}->{$label_this};  | 
| 
754
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
755
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
756
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
757
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $result;  | 
| 
758
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
759
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
760
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # get scores of all possible labels based on all the features  | 
| 
761
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # (gives different numbers for different algorithms,  | 
| 
762
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # often they are not real probabilities but general scores)  | 
| 
763
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_emission_scores {  | 
| 
764
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
765
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # (ArrayRef[Str] $features)  | 
| 
766
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $features ) = @_;  | 
| 
767
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
768
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # a hashref of the structure $result->{label} = prob  | 
| 
769
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # where prob might or might not be a real probability  | 
| 
770
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # (i.e. may or may not fulfill 0 <= prob <= 1 & sum(probs) == 1),  | 
| 
771
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # depending on the algorithm used  | 
| 
772
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # (but always a higher prob means a better scoring (more probable) label  | 
| 
773
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # and all of the probs are non-negative) TODO does it hold?  | 
| 
774
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $result = {};  | 
| 
775
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
776
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $ALGORITHM = $self->config->labeller_algorithm;  | 
| 
777
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
778
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($ALGORITHM == 0  | 
| 
779
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 1  | 
| 
780
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 2  | 
| 
781
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 3  | 
| 
782
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 10  | 
| 
783
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 11  | 
| 
784
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 12  | 
| 
785
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 13  | 
| 
786
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 14  | 
| 
787
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 15  | 
| 
788
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         )  | 
| 
789
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     {  | 
| 
790
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $result = $self->get_emission_scores_basic_MIRA($features);  | 
| 
791
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif ( $ALGORITHM == 4 || $ALGORITHM == 5 ) {  | 
| 
792
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $result = $self->get_emission_scores_no_MIRA($features);  | 
| 
793
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
794
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         croak "ModelLabelling->get_emission_scores not implemented"  | 
| 
795
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             . " for algorithm no. $ALGORITHM!";  | 
| 
796
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
797
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
798
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # the boundary label is NOT a valid label  | 
| 
799
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     delete $result->{ $self->config->SEQUENCE_BOUNDARY_LABEL };  | 
| 
800
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
801
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $result;  | 
| 
802
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
803
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
804
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_emission_scores_basic_MIRA {  | 
| 
805
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
806
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $features ) = @_;  | 
| 
807
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
808
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $ALGORITHM = $self->config->labeller_algorithm;  | 
| 
809
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
810
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $result = {};  | 
| 
811
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
812
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $warnNoEmissionProbs = "!!! WARNING !!! "  | 
| 
813
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . "Based on the training data, no possible label was found"  | 
| 
814
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " for an edge. This usually means that either"  | 
| 
815
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " your training data are not big enough or that"  | 
| 
816
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " the set of features you are using"  | 
| 
817
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " is not well constructed - either it is too small"  | 
| 
818
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " or it lacks features that would be general enough"  | 
| 
819
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " to cover all possible sentences."  | 
| 
820
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " Using blind emission probabilities instead.\n";  | 
| 
821
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
822
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # "pure MIRA", i.e. no MLE  | 
| 
823
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
824
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $ALGORITHM == 11 || $ALGORITHM == 13 ) {  | 
| 
825
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
826
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # initialize all label scores with 0 (so that all labels get some score)  | 
| 
827
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $all_labels = $self->get_all_labels();  | 
| 
828
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $label (@$all_labels) {  | 
| 
829
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result->{$label} = 0;  | 
| 
830
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
831
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
832
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
833
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # get scores  | 
| 
834
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     foreach my $feature (@$features) {  | 
| 
835
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ( $self->emissions->{$feature} ) {  | 
| 
836
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             foreach my $label ( keys %{ $self->emissions->{$feature} } ) {  | 
| 
837
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $result->{$label} += $self->emissions->{$feature}->{$label};  | 
| 
838
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
839
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
840
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
841
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
842
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # subtracting the minimum from the score  | 
| 
843
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($ALGORITHM == 0  | 
| 
844
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 1  | 
| 
845
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 2  | 
| 
846
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 10  | 
| 
847
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 11  | 
| 
848
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 12  | 
| 
849
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 13  | 
| 
850
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 14  | 
| 
851
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         || $ALGORITHM == 15  | 
| 
852
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         )  | 
| 
853
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     {  | 
| 
854
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
855
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # find min and max score  | 
| 
856
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $min = 1e300;  | 
| 
857
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $max = -1e300;  | 
| 
858
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $label ( keys %$result ) {  | 
| 
859
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             if ( $result->{$label} < $min ) {  | 
| 
860
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $min = $result->{$label};  | 
| 
861
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
862
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             if ( $result->{$label} > $max ) {  | 
| 
863
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $max = $result->{$label};  | 
| 
864
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
865
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
866
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # else is between $min and $max -> keep the values as they are  | 
| 
867
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
868
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
869
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ( $min > $max ) {  | 
| 
870
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
871
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # $min > $max, i.e. nothing has been generated -> backoff  | 
| 
872
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             if ( $self->config->DEBUG >= 2 ) {  | 
| 
873
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 print $warnNoEmissionProbs;  | 
| 
874
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
875
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
876
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # backoff by using unigram probabilities  | 
| 
877
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # (or unigram counts in some algorithms)  | 
| 
878
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result = $self->unigrams;  | 
| 
879
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } else {  | 
| 
880
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
881
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # something has been generated, now 0 and 1 start to differ  | 
| 
882
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             if ($ALGORITHM == 0  | 
| 
883
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 || $ALGORITHM == 10  | 
| 
884
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 || $ALGORITHM == 11  | 
| 
885
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 || $ALGORITHM == 12  | 
| 
886
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 || $ALGORITHM == 13  | 
| 
887
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 || $ALGORITHM == 14  | 
| 
888
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 || $ALGORITHM == 15  | 
| 
889
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 )  | 
| 
890
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             {  | 
| 
891
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
892
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # 0 MIRA-trained scores recomputed by +abs(min)  | 
| 
893
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # and converted to probs  | 
| 
894
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 if ( $min < $max ) {  | 
| 
895
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
896
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     # the typical case  | 
| 
897
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     # my $subtractant = $min;  | 
| 
898
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     my $divisor = 0;  | 
| 
899
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
900
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     foreach my $label ( keys %$result ) {  | 
| 
901
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         $result->{$label} = ( $result->{$label} - $min );  | 
| 
902
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         $divisor += $result->{$label};  | 
| 
903
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     }  | 
| 
904
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     foreach my $label ( keys %$result ) {  | 
| 
905
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         $result->{$label} = $result->{$label} / $divisor;  | 
| 
906
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     }  | 
| 
907
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 } else {  | 
| 
908
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
909
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     # $min == $max  | 
| 
910
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
911
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     # uniform prob distribution  | 
| 
912
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     my $prob = 1 / scalar( keys %$result );  | 
| 
913
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     foreach my $label ( keys %$result ) {  | 
| 
914
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         $result->{$label} = $prob;  | 
| 
915
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     }  | 
| 
916
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
917
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
918
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # end $ALGORITHM == 0|10|11|12|13|14|15  | 
| 
919
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             } else {  | 
| 
920
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
921
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # $ALGORITHM == 1|2  | 
| 
922
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # 1 dtto, NOT converted to probs  | 
| 
923
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 #   (but should behave the same as 0)  | 
| 
924
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # 2 dtto, sum in Viterbi instead of product  | 
| 
925
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 #   (new_prob = old_prob + emiss*trans)  | 
| 
926
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # (for 1 and 2 the emission probs are completely the same,  | 
| 
927
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # they are just handled differently by the Labeller)  | 
| 
928
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
929
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 if ( $min < $max ) {  | 
| 
930
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
931
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     # the typical case  | 
| 
932
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     # my $subtractant = $min;  | 
| 
933
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
934
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     foreach my $label ( keys %$result ) {  | 
| 
935
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         $result->{$label} = ( $result->{$label} - $min );  | 
| 
936
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     }  | 
| 
937
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 } else {  | 
| 
938
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
939
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     # $min == $max  | 
| 
940
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     # uniform prob distribution  | 
| 
941
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
942
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     if ( $min <= 0 ) {  | 
| 
943
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
944
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         # we would like to keep the values  | 
| 
945
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         # but this is not possible in this case  | 
| 
946
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         foreach my $label ( keys %$result ) {  | 
| 
947
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
948
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                             # so lets just assign ones  | 
| 
949
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                             $result->{$label} = 1;  | 
| 
950
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         }  | 
| 
951
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     }  | 
| 
952
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
953
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     # else there is already a uniform distribution  | 
| 
954
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     # so let's keep it as it is  | 
| 
955
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 }  | 
| 
956
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
957
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # end $ALGORITHM == 1|2  | 
| 
958
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
959
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
960
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
961
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # end $ALGORITHM == 0|1|2|10|11|12|13|14|15  | 
| 
962
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
963
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
964
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # $ALGORITHM == 3  | 
| 
965
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # no subtraction of minimum, just throw away <= 0  | 
| 
966
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
967
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $label ( keys %$result ) {  | 
| 
968
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             if ( $result->{$label} <= 0 ) {  | 
| 
969
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 delete $result->{$label};  | 
| 
970
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
971
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
972
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # else > 0 -> just keep it there and that's it  | 
| 
973
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
974
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }    # end $ALGORITHM == 3  | 
| 
975
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
976
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $result;  | 
| 
977
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }    # end get_emission_scores_basic_MIRA  | 
| 
978
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
979
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_emission_scores_no_MIRA {  | 
| 
980
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
981
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $features ) = @_;  | 
| 
982
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
983
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $result = {};  | 
| 
984
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
985
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $warnNoEmissionProbs = "!!! WARNING !!! "  | 
| 
986
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . "Based on the training data, no possible label was found"  | 
| 
987
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " for an edge. This usually means that either"  | 
| 
988
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " your training data are not big enough or that"  | 
| 
989
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " the set of features you are using"  | 
| 
990
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " is not well constructed - either it is too small"  | 
| 
991
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " or it lacks features that would be general enough"  | 
| 
992
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " to cover all possible sentences."  | 
| 
993
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         . " Using blind emission probabilities instead.\n";  | 
| 
994
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
995
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # basic or full MLE, no MIRA  | 
| 
996
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
997
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my %counts    = ();  | 
| 
998
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my %prob_sums = ();  | 
| 
999
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1000
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # get scores  | 
| 
1001
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     foreach my $feature (@$features) {  | 
| 
1002
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ( $self->emissions->{$feature} ) {  | 
| 
1003
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1004
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # !!! TODO tady by mÄl bejt souÄin !!!  | 
| 
1005
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             foreach my $label ( keys %{ $self->emissions->{$feature} } ) {  | 
| 
1006
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $prob_sums{$label} +=  | 
| 
1007
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     $self->emissions->{$feature}->{$label};  | 
| 
1008
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $counts{$label}++;  | 
| 
1009
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
1010
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
1011
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
1012
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1013
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( keys %prob_sums ) {  | 
| 
1014
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $label ( keys %prob_sums ) {  | 
| 
1015
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1016
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # something like average pobability  | 
| 
1017
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # = all features have the score of 1  | 
| 
1018
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # (or more precisely 1/number_of_features)  | 
| 
1019
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $result->{$label} = $prob_sums{$label} / $counts{$label};  | 
| 
1020
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
1021
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
1022
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1023
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # backoff  | 
| 
1024
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         if ( $self->config->DEBUG >= 2 ) {  | 
| 
1025
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             print $warnNoEmissionProbs;  | 
| 
1026
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
1027
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1028
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # backoff by using unigram probabilities  | 
| 
1029
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # (or unigram counts in some algorithms)  | 
| 
1030
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $result = $self->unigrams;  | 
| 
1031
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
1032
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1033
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $result;  | 
| 
1034
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }    # end get_emission_scores_no_MIRA  | 
| 
1035
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1036
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # sets emission score (if $label_prev is not set)  | 
| 
1037
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # or transition score (if it is)  | 
| 
1038
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # of the $feature to $score  | 
| 
1039
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub set_feature_score {  | 
| 
1040
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1041
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # (Str $feature, Num $score, Str $label, Maybe[Str] $label_prev)  | 
| 
1042
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $feature, $score, $label, $label_prev ) = @_;  | 
| 
1043
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1044
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( defined $label_prev ) {  | 
| 
1045
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->transitions->{$feature}->{$label_prev}->{$label} = $score;  | 
| 
1046
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
1047
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->emissions->{$feature}->{$label} = $score;  | 
| 
1048
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
1049
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1050
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return;  | 
| 
1051
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
1052
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1053
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # updates emission score (if $label_prev is not set)  | 
| 
1054
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # or transition score (if it is)  | 
| 
1055
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # of the $feature by adding $update  | 
| 
1056
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub update_feature_score {  | 
| 
1057
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1058
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # (Str $feature, Num $update, Str $label, Maybe[Str] $label_prev)  | 
| 
1059
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ( $self, $feature, $update, $label, $label_prev ) = @_;  | 
| 
1060
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1061
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( defined $label_prev ) {  | 
| 
1062
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->transitions->{$feature}->{$label_prev}->{$label} += $update;  | 
| 
1063
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
1064
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->emissions->{$feature}->{$label} += $update;  | 
| 
1065
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
1066
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1067
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return;  | 
| 
1068
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
1069
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1070
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # returns number of features in the model (where a "feature" can stand for  | 
| 
1071
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # various things depending on the algorithm used)  | 
| 
1072
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub get_feature_count {  | 
| 
1073
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1074
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my ($self) = @_;  | 
| 
1075
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1076
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $ALGORITHM = $self->config->labeller_algorithm;  | 
| 
1077
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1078
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # result = $emissions_count + $transitions_count  | 
| 
1079
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $emissions_count   = 0;  | 
| 
1080
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $transitions_count = 0;  | 
| 
1081
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1082
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # structure: emissions->{feature}->{label}  | 
| 
1083
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my @emission_features = keys %{ $self->emissions };  | 
| 
1084
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     foreach my $feature (@emission_features) {  | 
| 
1085
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $emissions_count += scalar( keys %{ $self->emissions->{$feature} } );  | 
| 
1086
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
1087
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1088
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ( $ALGORITHM == 8 || $ALGORITHM == 9 ) {  | 
| 
1089
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1090
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # structure: transitions->{feature}->{label_prev}->{label}  | 
| 
1091
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1092
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my @transition_features = keys %{ $self->transitions };  | 
| 
1093
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $feature (@transition_features) {  | 
| 
1094
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1095
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             my @labels = keys %{ $self->transitions->{$feature} };  | 
| 
1096
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             foreach my $label_prev (@labels) {  | 
| 
1097
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1098
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 $transitions_count += scalar(  | 
| 
1099
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                     keys %{ $self->transitions->{$feature}->{$label_prev} }  | 
| 
1100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 );  | 
| 
1101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
1102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
1103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
1105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # structure: transitions->{label_prev}->{label}  | 
| 
1107
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my @labels = keys %{ $self->transitions };  | 
| 
1109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         foreach my $label_prev (@labels) {  | 
| 
1110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $transitions_count +=  | 
| 
1112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 scalar( keys %{ $self->transitions->{$label_prev} } );  | 
| 
1113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
1114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
1115
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $emissions_count + $transitions_count;  | 
| 
1117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }    # end get_feature_count  | 
| 
1119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
1121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  | 
| 
1123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1124
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =pod  | 
| 
1125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1126
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =for Pod::Coverage BUILD  | 
| 
1127
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =encoding utf-8  | 
| 
1129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 NAME  | 
| 
1131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Treex::Tool::Parser::MSTperl::ModelLabelling  | 
| 
1133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 VERSION  | 
| 
1135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 version 0.11949  | 
| 
1137
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1138
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 DESCRIPTION  | 
| 
1139
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This is an in-memory represenation of a labelling model,  | 
| 
1141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 extended from L<Treex::Tool::Parser::MSTperl::ModelBase>.  | 
| 
1142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1143
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 FIELDS  | 
| 
1144
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 Inherited from base package  | 
| 
1146
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1147
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Fields inherited from L<Treex::Tool::Parser::MSTperl::ModelBase>.  | 
| 
1148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1150
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1151
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item config  | 
| 
1152
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1153
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Instance of L<Treex::Tool::Parser::MSTperl::Config> containing settings to be  | 
| 
1154
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 used for the model.  | 
| 
1155
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1156
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Currently the settings most relevant to the model are the following:  | 
| 
1157
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1158
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 8  | 
| 
1159
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1160
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item EM_EPSILON  | 
| 
1161
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1162
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 See L<Treex::Tool::Parser::MSTperl::Config/EM_EPSILON>.  | 
| 
1163
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1164
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item labeller_algorithm  | 
| 
1165
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1166
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 See L<Treex::Tool::Parser::MSTperl::Config/labeller_algorithm>.  | 
| 
1167
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1168
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item labelledFeaturesControl  | 
| 
1169
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 See L<Treex::Tool::Parser::MSTperl::Config/labelledFeaturesControl>.  | 
| 
1171
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1172
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item SEQUENCE_BOUNDARY_LABEL  | 
| 
1173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1174
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 See L<Treex::Tool::Parser::MSTperl::Config/SEQUENCE_BOUNDARY_LABEL>.  | 
| 
1175
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1176
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1177
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item featuresControl  | 
| 
1179
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1180
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Provides access to labeller features, especially enabling their computation.  | 
| 
1181
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Intance of L<Treex::Tool::Parser::MSTperl::FeaturesControl>.  | 
| 
1182
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1183
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1184
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1185
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 Label scoring  | 
| 
1186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1188
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1189
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item emissions  | 
| 
1190
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1191
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Emission scores for Viterbi. They follow the edge-based factorization  | 
| 
1192
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 and provide scores for various labels for an edge based on its features.  | 
| 
1193
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1194
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The structure is:  | 
| 
1195
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1196
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   emissions->{feature}->{label} = score  | 
| 
1197
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1198
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Scores may or may not be probabilities, based on the algorithm used.  | 
| 
1199
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Also based on the algorithm they may be MIRA-computed  | 
| 
1200
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 or they might be obtained by standard MLE.  | 
| 
1201
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1202
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item transitions  | 
| 
1203
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Transition scores for Viterbi. They follow the  | 
| 
1205
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 first order Markov chain edge-based factorization  | 
| 
1206
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 and provide scores for various labels for an edge  | 
| 
1207
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 probably based on its features  | 
| 
1208
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 and always based on previous edge label.  | 
| 
1209
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1210
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Scores may or may not be probabilities, based on the algorithm used.  | 
| 
1211
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Also based on the algorithm they may be obtained by standard MLE  | 
| 
1212
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 or they might be MIRA-computed.  | 
| 
1213
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1214
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The structure is:  | 
| 
1215
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1216
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   transitions->{label_prev}->{label_this} = prob  | 
| 
1217
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1218
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 or  | 
| 
1219
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1220
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   transitions->{feature}->{label_prev}->{label_this} = score  | 
| 
1221
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1222
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1223
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1224
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 Transitions smoothing  | 
| 
1225
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1226
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 In some algorithms linear combination smoothing is used  | 
| 
1227
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 for transition probabilities.  | 
| 
1228
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The resulting transition probability is then obtained as:  | 
| 
1229
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1230
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  PROB(label|prev_label) =  | 
| 
1231
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     smooth_bigrams  * transitions->{prev_label}->{label} +  | 
| 
1232
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     smooth_unigrams * unigrams->{label} +  | 
| 
1233
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     smooth_uniform  | 
| 
1234
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1235
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1236
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1237
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item smooth_bigrams  | 
| 
1238
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1239
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item smooth_unigrams  | 
| 
1240
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1241
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item smooth_uniform  | 
| 
1242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1243
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The actual smoothing parameters computed by EM algorithm.  | 
| 
1244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Each of them is between 0 and 1 and together they sum up to 1.  | 
| 
1245
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1246
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item uniform_prob  | 
| 
1247
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1248
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Unifrom probability of a label, computed as  | 
| 
1249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<1 / ( keys %{ $self->unigrams } )>.  | 
| 
1250
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1251
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Set in C<compute_smoothing_params>.  | 
| 
1252
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1253
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item unigrams  | 
| 
1254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1255
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Basic MLE from data, the structure is  | 
| 
1256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1257
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  unigrams->{label} = prob  | 
| 
1258
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1259
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 To be used for transitions smoothing and/or backoff  | 
| 
1260
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (can be used both for emissions and transitions)  | 
| 
1261
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 It also contains the C<SEQUENCE_BOUNDARY_LABEL> prob  | 
| 
1262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (the SEQUENCE_BOUNDARY_LABEL is counted once for each sequence)  | 
| 
1263
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 which might be unappropriate in some cases (eg. for emission probs).  | 
| 
1264
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item EM_heldout_data  | 
| 
1266
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1267
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Just an array ref with the sentences that represent the heldout data  | 
| 
1268
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 to be able to run the EM algorithm in C<prepare_for_mira()>.  | 
| 
1269
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Used only in training.  | 
| 
1270
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1271
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1272
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1273
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 METHODS  | 
| 
1274
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1275
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 Inherited  | 
| 
1276
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1277
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Subroutines inherited from L<Treex::Tool::Parser::MSTperl::ModelBase>.  | 
| 
1278
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1279
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head3 Load and store  | 
| 
1280
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1281
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1282
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1283
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item store  | 
| 
1284
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1285
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 See L<Treex::Tool::Parser::MSTperl::ModelBase/store>.  | 
| 
1286
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1287
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item store_tsv  | 
| 
1288
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1289
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 See L<Treex::Tool::Parser::MSTperl::ModelBase/store_tsv>.  | 
| 
1290
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1291
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item load  | 
| 
1292
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1293
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 See L<Treex::Tool::Parser::MSTperl::ModelBase/load>.  | 
| 
1294
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1295
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item load_tsv  | 
| 
1296
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1297
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 See L<Treex::Tool::Parser::MSTperl::ModelBase/load_tsv>.  | 
| 
1298
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1299
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1300
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1301
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 Overriden  | 
| 
1302
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1303
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Subroutines overriding stubs in L<Treex::Tool::Parser::MSTperl::ModelBase>.  | 
| 
1304
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1305
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head3 Load and store  | 
| 
1306
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1307
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1308
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1309
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item $data = get_data_to_store(), $data = get_data_to_store_tsv()  | 
| 
1310
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1311
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns the model data, containing the following fields:  | 
| 
1312
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<unigrams>,  | 
| 
1313
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<transitions>,  | 
| 
1314
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<emissions>,  | 
| 
1315
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<smooth_uniform>,  | 
| 
1316
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<smooth_unigrams>,  | 
| 
1317
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<smooth_bigrams>,  | 
| 
1318
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<uniform_prob>  | 
| 
1319
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1320
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item load_data($data), load_data_tsv($data)  | 
| 
1321
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1322
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Tries to get all necessary data from C<$data>  | 
| 
1323
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (see C<get_data_to_store> to see what data are stored).  | 
| 
1324
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Also does basic checks on the data, eg. for non-emptiness, but nothing  | 
| 
1325
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sophisticated. Is algorithm-sensitive, i.e. if some data are not needed  | 
| 
1326
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 for the algorithm used, they do not have to be contained in the hash.  | 
| 
1327
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1328
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1329
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1330
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head3 Training support  | 
| 
1331
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1332
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1333
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1334
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item prepare_for_mira  | 
| 
1335
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1336
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Called after preprocessing training data, before entering the MIRA phase.  | 
| 
1337
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1338
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Function varies depending on algorithm used.  | 
| 
1339
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Usually recomputes counts stored in C<emissions>, C<transitions> and C<unigrams>  | 
| 
1340
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 to probabilities that have been computed by C<add_emission>,  | 
| 
1341
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<add_transition> and C<add_unigram>.  | 
| 
1342
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Also calls C<compute_smoothing_params> to estimate smoothing parameters  | 
| 
1343
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 for smoothing of transition probabilities.  | 
| 
1344
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1345
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item get_feature_count  | 
| 
1346
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1347
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Only to provide information about the model.  | 
| 
1348
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns number of features in the model (where a "feature" can stand for  | 
| 
1349
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 various things depending on the algorithm used).  | 
| 
1350
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1351
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1352
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1353
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 Technical methods  | 
| 
1354
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1355
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1356
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1357
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item BUILD  | 
| 
1358
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1359
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  my $model = Treex::Tool::Parser::MSTperl::ModelLabelling->new(  | 
| 
1360
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     config => $config,  | 
| 
1361
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  );  | 
| 
1362
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1363
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Creates an empty model. If you are training the model, this is probably what you  | 
| 
1364
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 want, otherwise you can use C<load> or C<load_tsv>  | 
| 
1365
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 to load an existing labelling model from a file.  | 
| 
1366
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1367
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 However, most often you would probably use a model for a labeller  | 
| 
1368
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (L<Treex::Tool::Parser::MSTperl::Labeller>)  | 
| 
1369
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 or a labelling trainer  | 
| 
1370
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (L<Treex::Tool::Parser::MSTperl::TrainerLabelling>)  | 
| 
1371
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 which both automatically create the model on build.  | 
| 
1372
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The labeller also provides wrapping methods  | 
| 
1373
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L<Treex::Tool::Parser::MSTperl::Labeller/load_model>  | 
| 
1374
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 and  | 
| 
1375
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L<Treex::Tool::Parser::MSTperl::Labeller/load_model_tsv>  | 
| 
1376
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 which you can call to load the model from a file.  | 
| 
1377
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (Btw. as you might expect, the trainer provides methods  | 
| 
1378
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L<Treex::Tool::Parser::MSTperl::TrainerLabelling/store_model>  | 
| 
1379
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 and  | 
| 
1380
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L<Treex::Tool::Parser::MSTperl::TrainerLabelling/store_model_tsv>.)  | 
| 
1381
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1382
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1383
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1384
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 MLE on training data  | 
| 
1385
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1386
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<emissions> and C<transitions> can be either MIRA-trained  | 
| 
1387
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 or estimated directly from training data using MLE  | 
| 
1388
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (Maximum Likelihood Estimate).  | 
| 
1389
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<unigrams> are always estimated by MLE.  | 
| 
1390
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1391
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1392
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1393
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item add_unigram ($label)  | 
| 
1394
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1395
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Increment count for the label in C<unigrams>.  | 
| 
1396
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1397
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item add_transition ($label_this, $label_prev)  | 
| 
1398
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1399
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item add_transition ($label_this, $label_prev, $feature)  | 
| 
1400
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1401
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Increment count for the transition in C<transitions>, possible including a  | 
| 
1402
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 feature on "this" edge if the algorithm uses features with transitions.  | 
| 
1403
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1404
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item add_emission ($feature, $label)  | 
| 
1405
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1406
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Increment count for this label on an edge with this feature in C<emissions>.  | 
| 
1407
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1408
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item compute_probs_from_counts ($self->emissions)  | 
| 
1409
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1410
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Takes a hash reference with label counts and chnages the counts  | 
| 
1411
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 to probabilities (this is the actual MLE).  | 
| 
1412
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 May be called in C<prepare_for_mira> on  | 
| 
1413
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<emissions>, C<transitions> and C<unigrams>.  | 
| 
1414
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1415
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1416
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1417
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 EM algorithm  | 
| 
1418
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1419
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1420
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1421
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item compute_smoothing_params()  | 
| 
1422
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1423
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The main method containing an implementation of the Expectation Maximization  | 
| 
1424
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Algorithm to compute smoothing parameters (C<smooth_bigrams>,  | 
| 
1425
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 C<smooth_unigrams>, C<smooth_uniform>) for transition probabilities  | 
| 
1426
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 smoothing by linear combination of bigram, unigram and uniform probability.  | 
| 
1427
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Iteratively tries to find  | 
| 
1428
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 such parameters that the probabilities from training data  | 
| 
1429
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (C<transitions>, C<unigrams> and C<uniform_prob>) combined together by  | 
| 
1430
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 the smoothing parameters model well enough the heldout data  | 
| 
1431
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (C<EM_heldout_data>), i.e. tries to maximize the probability of the heldout  | 
| 
1432
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 data given the training data probabilities by adjusting the smoothing  | 
| 
1433
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 parameters values.  | 
| 
1434
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1435
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Uses C<EM_EPSILON> as a stopping criterion, i.e. stops when the sum of  | 
| 
1436
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 absolute values of changes to all smoothing parameters are lower  | 
| 
1437
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 than the value of C<EM_EPSILON>.  | 
| 
1438
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1439
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item count_expected_counts_all()  | 
| 
1440
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1441
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item count_expected_counts_tree($root_node)  | 
| 
1442
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1443
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item count_expected_counts_sequence($labels_sequence)  | 
| 
1444
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1445
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Support methods to C<compute_smoothing_params>, in the order in which they  | 
| 
1446
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 call each other.  | 
| 
1447
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1448
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1449
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1450
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 Scoring  | 
| 
1451
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1452
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 A bunch of methods to score the likelihood of a label being assigned to an  | 
| 
1453
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 edge based on the edge's features and the label assigned to the previous  | 
| 
1454
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 edge.  | 
| 
1455
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1456
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1457
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1458
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item get_all_labels()  | 
| 
1459
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1460
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns (a reference to) an array of all labels found in the training data  | 
| 
1461
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (eg. C<['Subj', 'Obj', 'Atr']>).  | 
| 
1462
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1463
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item get_label_score($label, $label_prev, $features)  | 
| 
1464
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1465
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Computes a score of assigning the given label to an edge,  | 
| 
1466
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 given the features of the edge and the label assigned to the previous edge.  | 
| 
1467
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1468
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Always a higher score means a more likely label for the edge.  | 
| 
1469
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Some algorithms may give a negative score.  | 
| 
1470
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1471
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Is semantically equivalent to calling C<get_emission_score>  | 
| 
1472
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 and C<get_transition_score> and then combining it together somehow.  | 
| 
1473
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1474
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item get_emission_score($label, $feature)  | 
| 
1475
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1476
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Computes the "emission score" of assigning the given label to an edge,  | 
| 
1477
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 given one of the feature of the edge and disregarding  | 
| 
1478
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 the label assigned to the previous edge.  | 
| 
1479
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1480
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item get_transition_score($label_this, $label_prev, $feature)  | 
| 
1481
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1482
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Computes the "transition score" of assigning the given label to an edge,  | 
| 
1483
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 given the label assigned to the previous edge  | 
| 
1484
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 and possibly also one of the features of the edge  | 
| 
1485
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 but NOT including the emission score returned by C<get_emission_score>.  | 
| 
1486
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1487
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item $result = get_transition_probs_array ($label_this, $label_prev)  | 
| 
1488
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1489
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Returns (a reference to) an array of the probabilities of the transition  | 
| 
1490
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 from label_prev to label_this (to be smoothed together),  | 
| 
1491
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 having the following structure:  | 
| 
1492
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1493
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $result->[0] = uniform prob  | 
| 
1494
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $result->[1] = unigram prob  | 
| 
1495
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $result->[2] = bigram prob  | 
| 
1496
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1497
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item $result = get_emission_scores($features)  | 
| 
1498
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1499
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Get scores of assigning each of the possible labels to an edge  | 
| 
1500
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 based on all the features of the edge. Is semantically equivalent  | 
| 
1501
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 to doing:  | 
| 
1502
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1503
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  foreach label  | 
| 
1504
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     foreach feature  | 
| 
1505
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         get_emission_score(label, feature)  | 
| 
1506
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1507
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The structure is:  | 
| 
1508
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1509
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  $result->{label} = score  | 
| 
1510
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1511
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Actually only serves as a switch for several implementations of the method  | 
| 
1512
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (C<get_emission_scores_basic_MIRA> and C<get_emission_scores_no_MIRA>);  | 
| 
1513
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 the method to be used is selected based on the algorithm being used.  | 
| 
1514
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1515
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item get_emission_scores_basic_MIRA($features)  | 
| 
1516
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1517
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 A C<get_emission_scores> implementation used with algorithms  | 
| 
1518
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 where the emission scores are computed by MIRA (this is currently  | 
| 
1519
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 the most successful implementation).  | 
| 
1520
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1521
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item get_emission_scores_no_MIRA($features)  | 
| 
1522
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1523
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 A C<get_emission_scores> implementation using only MLE. Probably obsolete now.  | 
| 
1524
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1525
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1526
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1527
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 Changing the scores  | 
| 
1528
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1529
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Methods used by the trainer  | 
| 
1530
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (L<Treex::Tool::Parser::MSTperl::TrainerLabelling>)  | 
| 
1531
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 to adjust the scores to whatever seems to be  | 
| 
1532
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 the best idea at the moment. Used only in MIRA training  | 
| 
1533
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (MLE uses C<add_unigram>, C<add_emission>, C<add_transition>  | 
| 
1534
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 and C<compute_probs_from_counts> instead).  | 
| 
1535
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1536
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =over 4  | 
| 
1537
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1538
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item set_feature_score($feature, $score, $label, $label_prev)  | 
| 
1539
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1540
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Sets the specified emission score (if label_prev is not set)  | 
| 
1541
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 or transition score (if it is)  | 
| 
1542
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 to the given value (C<$score>).  | 
| 
1543
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1544
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =item update_feature_score($feature, $update, $label, $label_prev)  | 
| 
1545
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1546
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Updates the specified emission score (if label_prev is not set)  | 
| 
1547
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 or transition score (if it is)  | 
| 
1548
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 by the given value (C<$update>), i.e. adds that value to the  | 
| 
1549
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 current value.  | 
| 
1550
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1551
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =back  | 
| 
1552
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1553
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 AUTHORS  | 
| 
1554
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1555
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Rudolf Rosa <rosa@ufal.mff.cuni.cz>  | 
| 
1556
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1557
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 COPYRIGHT AND LICENSE  | 
| 
1558
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1559
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Copyright © 2011 by Institute of Formal and Applied Linguistics, Charles  | 
| 
1560
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 University in Prague  | 
| 
1561
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
1562
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 This module is free software; you can redistribute it and/or modify it under  | 
| 
1563
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 the same terms as Perl itself.  |