line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2
|
|
|
|
|
|
|
/* ***** BEGIN LICENSE BLOCK ***** |
3
|
|
|
|
|
|
|
* Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
4
|
|
|
|
|
|
|
* |
5
|
|
|
|
|
|
|
* The contents of this file are subject to the Mozilla Public License Version |
6
|
|
|
|
|
|
|
* 1.1 (the "License"); you may not use this file except in compliance with |
7
|
|
|
|
|
|
|
* the License. You may obtain a copy of the License at |
8
|
|
|
|
|
|
|
* http://www.mozilla.org/MPL/ |
9
|
|
|
|
|
|
|
* |
10
|
|
|
|
|
|
|
* Software distributed under the License is distributed on an "AS IS" basis, |
11
|
|
|
|
|
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
12
|
|
|
|
|
|
|
* for the specific language governing rights and limitations under the |
13
|
|
|
|
|
|
|
* License. |
14
|
|
|
|
|
|
|
* |
15
|
|
|
|
|
|
|
* The Original Code is Mozilla Universal charset detector code. |
16
|
|
|
|
|
|
|
* |
17
|
|
|
|
|
|
|
* The Initial Developer of the Original Code is |
18
|
|
|
|
|
|
|
* Netscape Communications Corporation. |
19
|
|
|
|
|
|
|
* Portions created by the Initial Developer are Copyright (C) 2001 |
20
|
|
|
|
|
|
|
* the Initial Developer. All Rights Reserved. |
21
|
|
|
|
|
|
|
* |
22
|
|
|
|
|
|
|
* Contributor(s): |
23
|
|
|
|
|
|
|
* Shy Shalom <shooshX@gmail.com> |
24
|
|
|
|
|
|
|
* Proofpoint, Inc. |
25
|
|
|
|
|
|
|
* |
26
|
|
|
|
|
|
|
* Alternatively, the contents of this file may be used under the terms of |
27
|
|
|
|
|
|
|
* either the GNU General Public License Version 2 or later (the "GPL"), or |
28
|
|
|
|
|
|
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
29
|
|
|
|
|
|
|
* in which case the provisions of the GPL or the LGPL are applicable instead |
30
|
|
|
|
|
|
|
* of those above. If you wish to allow use of your version of this file only |
31
|
|
|
|
|
|
|
* under the terms of either the GPL or the LGPL, and not to allow others to |
32
|
|
|
|
|
|
|
* use your version of this file under the terms of the MPL, indicate your |
33
|
|
|
|
|
|
|
* decision by deleting the provisions above and replace them with the notice |
34
|
|
|
|
|
|
|
* and other provisions required by the GPL or the LGPL. If you do not delete |
35
|
|
|
|
|
|
|
* the provisions above, a recipient may use your version of this file under |
36
|
|
|
|
|
|
|
* the terms of any one of the MPL, the GPL or the LGPL. |
37
|
|
|
|
|
|
|
* |
38
|
|
|
|
|
|
|
* ***** END LICENSE BLOCK ***** */ |
39
|
|
|
|
|
|
|
#include <stdio.h> |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
#include "nsMBCSGroupProber.h" |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
#if defined(DEBUG_chardet) || defined(DEBUG_jgmyers) |
44
|
|
|
|
|
|
|
// Debug-only display names, indexed the same way as mProbers[] (the
// constructor fills slots 0..6 in this order).
const char *ProberName[] =
{
  "UTF8", "SJIS", "EUCJP", "GB18030",
  "EUCKR", "Big5", "EUCTW",
};
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
#endif |
56
|
|
|
|
|
|
|
|
57
|
4
|
|
|
|
|
|
// Allocates one prober per slot of mProbers[] (slots 0 through 6, in the
// order UTF8, SJIS, EUCJP, GB18030, EUCKR, Big5, EUCTW) and then calls
// Reset() to initialise the group's bookkeeping.
nsMBCSGroupProber::nsMBCSGroupProber()
{
  PRUint32 slot = 0;

  mProbers[slot++] = new nsUTF8Prober();
  mProbers[slot++] = new nsSJISProber();
  mProbers[slot++] = new nsEUCJPProber();
  mProbers[slot++] = new nsGB18030Prober();
  mProbers[slot++] = new nsEUCKRProber();
  mProbers[slot++] = new nsBig5Prober();
  mProbers[slot++] = new nsEUCTWProber();

  Reset();
}
68
|
|
|
|
|
|
|
|
69
|
12
|
|
|
|
|
|
// Deletes every prober stored in mProbers[], walking the slots from first
// to last.
nsMBCSGroupProber::~nsMBCSGroupProber()
{
  PRUint32 slot = 0;
  while (slot < NUM_OF_PROBERS)
  {
    delete mProbers[slot];
    ++slot;
  }
}
76
|
|
|
|
|
|
|
|
77
|
4
|
|
|
|
|
|
const char* nsMBCSGroupProber::GetCharSetName() |
78
|
|
|
|
|
|
|
{ |
79
|
4
|
50
|
|
|
|
|
if (mBestGuess == -1) |
80
|
|
|
|
|
|
|
{ |
81
|
0
|
|
|
|
|
|
GetConfidence(); |
82
|
0
|
0
|
|
|
|
|
if (mBestGuess == -1) |
83
|
0
|
|
|
|
|
|
mBestGuess = 0; |
84
|
|
|
|
|
|
|
} |
85
|
4
|
|
|
|
|
|
return mProbers[mBestGuess]->GetCharSetName(); |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
4
|
|
|
|
|
|
void nsMBCSGroupProber::Reset(void) |
89
|
|
|
|
|
|
|
{ |
90
|
4
|
|
|
|
|
|
mActiveNum = 0; |
91
|
32
|
100
|
|
|
|
|
for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++) |
92
|
|
|
|
|
|
|
{ |
93
|
28
|
50
|
|
|
|
|
if (mProbers[i]) |
94
|
|
|
|
|
|
|
{ |
95
|
28
|
|
|
|
|
|
mProbers[i]->Reset(); |
96
|
28
|
|
|
|
|
|
mIsActive[i] = PR_TRUE; |
97
|
28
|
|
|
|
|
|
++mActiveNum; |
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
else |
100
|
0
|
|
|
|
|
|
mIsActive[i] = PR_FALSE; |
101
|
|
|
|
|
|
|
} |
102
|
4
|
|
|
|
|
|
mBestGuess = -1; |
103
|
4
|
|
|
|
|
|
mState = eDetecting; |
104
|
4
|
|
|
|
|
|
mKeepNext = 0; |
105
|
4
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
4
|
|
|
|
|
|
// Feeds a chunk of input to the active probers, forwarding only the parts
// of the buffer that contain bytes with the high bit set.
//
// aBuf  input bytes; aLen bytes are readable.
// aLen  number of bytes in aBuf.
//
// Filtering: a byte with bit 0x80 set starts a run (or extends the current
// one) and arms keepNext to 2.  Each following 7-bit byte decrements
// keepNext; when it reaches 0 the run [start, pos] is handed to every active
// prober.  A run still open at the end of the buffer is flushed as
// [start, aLen) and keepNext is saved in mKeepNext so the next call
// continues it (start is 0 there).
//
// Returns eFoundIt as soon as a prober reports it (mBestGuess is set to that
// prober), eNotMe once no active prober is left, and otherwise the current
// mState.  On those two early returns mKeepNext is not updated.
nsProbingState nsMBCSGroupProber::HandleData(const char* aBuf, PRUint32 aLen)
{
  nsProbingState st;
  PRUint32 start = 0;
  PRUint32 keepNext = mKeepNext;

  //do filtering to reduce load to probers
  for (PRUint32 pos = 0; pos < aLen; ++pos)
  {
    if (aBuf[pos] & 0x80)
    {
      if (!keepNext)
        start = pos;
      keepNext = 2;
    }
    else if (keepNext)
    {
      if (--keepNext == 0)
      {
        // Run finished: forward bytes start..pos inclusive.
        for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
        {
          if (!mIsActive[i])
            continue;
          st = mProbers[i]->HandleData(aBuf + start, pos + 1 - start);
          if (st == eFoundIt)
          {
            mBestGuess = i;
            mState = eFoundIt;
            return mState;
          }
          else if (st == eNotMe)
          {
            mIsActive[i] = PR_FALSE;
            mActiveNum--;
            if (mActiveNum <= 0)
            {
              mState = eNotMe;
              return mState;
            }
          }
        }
      }
    }
  }

  if (keepNext) {
    // A run is still open at the end of the buffer.  Exactly aLen - start
    // bytes remain from 'start'; passing one more would make the probers
    // read aBuf[aLen], which is past the end of the caller's buffer.
    for (PRUint32 i = 0; i < NUM_OF_PROBERS; i++)
    {
      if (!mIsActive[i])
        continue;
      st = mProbers[i]->HandleData(aBuf + start, aLen - start);
      if (st == eFoundIt)
      {
        mBestGuess = i;
        mState = eFoundIt;
        return mState;
      }
      else if (st == eNotMe)
      {
        mIsActive[i] = PR_FALSE;
        mActiveNum--;
        if (mActiveNum <= 0)
        {
          mState = eNotMe;
          return mState;
        }
      }
    }
  }
  mKeepNext = keepNext;

  return mState;
}
180
|
|
|
|
|
|
|
|
181
|
4
|
|
|
|
|
|
float nsMBCSGroupProber::GetConfidence(void) |
182
|
|
|
|
|
|
|
{ |
183
|
|
|
|
|
|
|
PRUint32 i; |
184
|
|
|
|
|
|
|
float bestConf = 0.0, cf; |
185
|
|
|
|
|
|
|
|
186
|
4
|
|
|
|
|
|
switch (mState) |
187
|
|
|
|
|
|
|
{ |
188
|
|
|
|
|
|
|
case eFoundIt: |
189
|
|
|
|
|
|
|
return (float)0.99; |
190
|
|
|
|
|
|
|
case eNotMe: |
191
|
0
|
|
|
|
|
|
return (float)0.01; |
192
|
|
|
|
|
|
|
default: |
193
|
32
|
100
|
|
|
|
|
for (i = 0; i < NUM_OF_PROBERS; i++) |
194
|
|
|
|
|
|
|
{ |
195
|
28
|
100
|
|
|
|
|
if (!mIsActive[i]) |
196
|
|
|
|
|
|
|
continue; |
197
|
12
|
|
|
|
|
|
cf = mProbers[i]->GetConfidence(); |
198
|
12
|
100
|
|
|
|
|
if (bestConf < cf) |
199
|
|
|
|
|
|
|
{ |
200
|
|
|
|
|
|
|
bestConf = cf; |
201
|
4
|
|
|
|
|
|
mBestGuess = i; |
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
} |
204
|
|
|
|
|
|
|
} |
205
|
|
|
|
|
|
|
return bestConf; |
206
|
|
|
|
|
|
|
} |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
#ifdef DEBUG_chardet |
209
|
|
|
|
|
|
|
void nsMBCSGroupProber::DumpStatus() |
210
|
|
|
|
|
|
|
{ |
211
|
|
|
|
|
|
|
PRUint32 i; |
212
|
|
|
|
|
|
|
float cf; |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
GetConfidence(); |
215
|
|
|
|
|
|
|
for (i = 0; i < NUM_OF_PROBERS; i++) |
216
|
|
|
|
|
|
|
{ |
217
|
|
|
|
|
|
|
if (!mIsActive[i]) |
218
|
|
|
|
|
|
|
printf(" MBCS inactive: [%s] (confidence is too low).\r\n", ProberName[i]); |
219
|
|
|
|
|
|
|
else |
220
|
|
|
|
|
|
|
{ |
221
|
|
|
|
|
|
|
cf = mProbers[i]->GetConfidence(); |
222
|
|
|
|
|
|
|
printf(" MBCS %1.3f: [%s]\r\n", cf, ProberName[i]); |
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
#endif |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
#ifdef DEBUG_jgmyers |
229
|
|
|
|
|
|
|
// Debug helper: writes one DetectorState entry per prober slot into
// 'states', starting at index 'offset', and advances 'offset' past the
// entries written.  Inactive probers are reported with confidence 0.0.
void nsMBCSGroupProber::GetDetectorState(nsUniversalDetector::DetectorState (&states)[nsUniversalDetector::NumDetectors], PRUint32 &offset)
{
  PRUint32 slot = 0;
  while (slot < NUM_OF_PROBERS)
  {
    nsUniversalDetector::DetectorState &entry = states[offset];

    entry.name = ProberName[slot];
    entry.isActive = mIsActive[slot];
    if (mIsActive[slot])
      entry.confidence = mProbers[slot]->GetConfidence();
    else
      entry.confidence = 0.0;

    ++offset;
    ++slot;
  }
}
238
|
|
|
|
|
|
|
#endif /* DEBUG_jgmyers */ |