line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
/* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ |
2
|
|
|
|
|
|
|
/* ***** BEGIN LICENSE BLOCK ***** |
3
|
|
|
|
|
|
|
* Version: MPL 1.1/GPL 2.0/LGPL 2.1 |
4
|
|
|
|
|
|
|
* |
5
|
|
|
|
|
|
|
* The contents of this file are subject to the Mozilla Public License Version |
6
|
|
|
|
|
|
|
* 1.1 (the "License"); you may not use this file except in compliance with |
7
|
|
|
|
|
|
|
* the License. You may obtain a copy of the License at |
8
|
|
|
|
|
|
|
* http://www.mozilla.org/MPL/ |
9
|
|
|
|
|
|
|
* |
10
|
|
|
|
|
|
|
* Software distributed under the License is distributed on an "AS IS" basis, |
11
|
|
|
|
|
|
|
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License |
12
|
|
|
|
|
|
|
* for the specific language governing rights and limitations under the |
13
|
|
|
|
|
|
|
* License. |
14
|
|
|
|
|
|
|
* |
15
|
|
|
|
|
|
|
* The Original Code is Mozilla Universal charset detector code. |
16
|
|
|
|
|
|
|
* |
17
|
|
|
|
|
|
|
* The Initial Developer of the Original Code is |
18
|
|
|
|
|
|
|
* Netscape Communications Corporation. |
19
|
|
|
|
|
|
|
* Portions created by the Initial Developer are Copyright (C) 2001 |
20
|
|
|
|
|
|
|
* the Initial Developer. All Rights Reserved. |
21
|
|
|
|
|
|
|
* |
22
|
|
|
|
|
|
|
* Contributor(s): |
23
|
|
|
|
|
|
|
* Shy Shalom <shooshX@gmail.com> |
24
|
|
|
|
|
|
|
* |
25
|
|
|
|
|
|
|
* Alternatively, the contents of this file may be used under the terms of |
26
|
|
|
|
|
|
|
* either the GNU General Public License Version 2 or later (the "GPL"), or |
27
|
|
|
|
|
|
|
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), |
28
|
|
|
|
|
|
|
* in which case the provisions of the GPL or the LGPL are applicable instead |
29
|
|
|
|
|
|
|
* of those above. If you wish to allow use of your version of this file only |
30
|
|
|
|
|
|
|
* under the terms of either the GPL or the LGPL, and not to allow others to |
31
|
|
|
|
|
|
|
* use your version of this file under the terms of the MPL, indicate your |
32
|
|
|
|
|
|
|
* decision by deleting the provisions above and replace them with the notice |
33
|
|
|
|
|
|
|
* and other provisions required by the GPL or the LGPL. If you do not delete |
34
|
|
|
|
|
|
|
* the provisions above, a recipient may use your version of this file under |
35
|
|
|
|
|
|
|
* the terms of any one of the MPL, the GPL or the LGPL. |
36
|
|
|
|
|
|
|
* |
37
|
|
|
|
|
|
|
* ***** END LICENSE BLOCK ***** */ |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
#include "nsLatin1Prober.h" |
40
|
|
|
|
|
|
|
#include "prmem.h" |
41
|
|
|
|
|
|
|
#include <stdio.h> |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
// Character classes assigned to each input byte. The numeric values are used
// directly as row/column indices into the class-transition model, so they
// must stay dense and start at 0.
#define UDF    0        // undefined
#define OTH    1        // other
#define ASC    2        // ascii capital letter
#define ASS    3        // ascii small letter
#define ACV    4        // accent capital vowel
#define ACO    5        // accent capital other
#define ASV    6        // accent small vowel
#define ASO    7        // accent small other
#define CLASS_NUM   8    // total classes

// Maps every possible byte value (0x00 - 0xFF) to one of the classes above.
// The table is pure lookup data and is never modified, so it is declared
// const.
// NOTE(review): the UDF slots in the 0x80 - 0x9F rows look like the unassigned
// code points of windows-1252 -- confirm before editing those rows.
static const unsigned char Latin1_CharToClass[] =
{
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 00 - 07
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 08 - 0F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 10 - 17
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 18 - 1F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 20 - 27
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 28 - 2F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 30 - 37
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 38 - 3F
  OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 40 - 47
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 48 - 4F
  ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,   // 50 - 57
  ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,   // 58 - 5F
  OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 60 - 67
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 68 - 6F
  ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,   // 70 - 77
  ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,   // 78 - 7F
  OTH, UDF, OTH, ASO, OTH, OTH, OTH, OTH,   // 80 - 87
  OTH, OTH, ACO, OTH, ACO, UDF, ACO, UDF,   // 88 - 8F
  UDF, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // 90 - 97
  OTH, OTH, ASO, OTH, ASO, UDF, ASO, ACO,   // 98 - 9F
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A0 - A7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // A8 - AF
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B0 - B7
  OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,   // B8 - BF
  ACV, ACV, ACV, ACV, ACV, ACV, ACO, ACO,   // C0 - C7
  ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,   // C8 - CF
  ACO, ACO, ACV, ACV, ACV, ACV, ACV, OTH,   // D0 - D7
  ACV, ACV, ACV, ACV, ACV, ACO, ACO, ACO,   // D8 - DF
  ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASO,   // E0 - E7
  ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,   // E8 - EF
  ASO, ASO, ASV, ASV, ASV, ASV, ASV, OTH,   // F0 - F7
  ASV, ASV, ASV, ASV, ASV, ASO, ASO, ASO,   // F8 - FF
};
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
/* Likelihood of one character class following another.

   The table is a flattened CLASS_NUM x CLASS_NUM matrix: the row is the class
   of the previous character, the column is the class of the current one, i.e.
   the entry is read as Latin1ClassModel[previous*CLASS_NUM + current].

   Entry values:
   0 : illegal
   1 : very unlikely
   2 : normal
   3 : very likely

   The table is read-only lookup data, so it is declared const.
*/
static const unsigned char Latin1ClassModel[] =
{
/*      UDF OTH ASC ASS ACV ACO ASV ASO  */
/*UDF*/  0,  0,  0,  0,  0,  0,  0,  0,
/*OTH*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASC*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASS*/  0,  3,  3,  3,  1,  1,  3,  3,
/*ACV*/  0,  3,  3,  3,  1,  2,  1,  2,
/*ACO*/  0,  3,  3,  3,  3,  3,  3,  3,
/*ASV*/  0,  3,  1,  3,  1,  1,  1,  3,
/*ASO*/  0,  3,  1,  3,  1,  1,  3,  3,
};
107
|
|
|
|
|
|
|
|
108
|
4
|
|
|
|
|
|
void nsLatin1Prober::Reset(void) |
109
|
|
|
|
|
|
|
{ |
110
|
4
|
|
|
|
|
|
mState = eDetecting; |
111
|
4
|
|
|
|
|
|
mLastCharClass = OTH; |
112
|
20
|
100
|
|
|
|
|
for (int i = 0; i < FREQ_CAT_NUM; i++) |
113
|
16
|
|
|
|
|
|
mFreqCounter[i] = 0; |
114
|
4
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
|
117
|
4
|
|
|
|
|
|
// Feeds aLen bytes starting at aBuf into the prober and returns the resulting
// state.
//
// The input is first passed through FilterWithEnglishLetters(); when that
// call reports failure the raw input is scanned instead. Each byte is mapped
// to a character class, and the likelihood of the (previous class, current
// class) pair is looked up in Latin1ClassModel. A likelihood of 0 marks the
// pair as illegal: the state becomes eNotMe and scanning stops. Otherwise the
// counter for that likelihood is incremented and the current class becomes
// the previous one.
nsProbingState nsLatin1Prober::HandleData(const char* aBuf, PRUint32 aLen)
{
  char *text = 0;
  PRUint32 textLen = 0;

  // Fall back to the caller's buffer if filtering did not succeed.
  if (!FilterWithEnglishLetters(aBuf, aLen, &text, textLen)) {
    text = (char*)aBuf;
    textLen = aLen;
  }

  for (PRUint32 pos = 0; pos < textLen; ++pos) {
    const unsigned char currentClass =
      Latin1_CharToClass[(unsigned char)text[pos]];
    const unsigned char likelihood =
      Latin1ClassModel[mLastCharClass*CLASS_NUM + currentClass];

    if (likelihood == 0) {
      // Illegal class sequence: this cannot be Latin1 text.
      mState = eNotMe;
      break;
    }

    ++mFreqCounter[likelihood];
    mLastCharClass = currentClass;
  }

  // Only release the buffer when it is not the caller's own.
  if (text != aBuf) {
    PR_FREEIF(text);
  }

  return mState;
}
146
|
|
|
|
|
|
|
|
147
|
4
|
|
|
|
|
|
float nsLatin1Prober::GetConfidence(void) |
148
|
|
|
|
|
|
|
{ |
149
|
4
|
50
|
|
|
|
|
if (mState == eNotMe) |
150
|
|
|
|
|
|
|
return 0.01f; |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
float confidence; |
153
|
|
|
|
|
|
|
PRUint32 total = 0; |
154
|
20
|
100
|
|
|
|
|
for (PRInt32 i = 0; i < FREQ_CAT_NUM; i++) |
155
|
16
|
|
|
|
|
|
total += mFreqCounter[i]; |
156
|
|
|
|
|
|
|
|
157
|
4
|
50
|
|
|
|
|
if(!total) |
158
|
|
|
|
|
|
|
confidence = 0.0f; |
159
|
|
|
|
|
|
|
else |
160
|
|
|
|
|
|
|
{ |
161
|
4
|
|
|
|
|
|
confidence = mFreqCounter[3]*1.0f / total; |
162
|
4
|
|
|
|
|
|
confidence -= mFreqCounter[1]*20.0f/total; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
4
|
100
|
|
|
|
|
if (confidence < 0.0f) |
166
|
|
|
|
|
|
|
confidence = 0.0f; |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
// lower the confidence of latin1 so that other more accurate detector |
169
|
|
|
|
|
|
|
// can take priority. |
170
|
4
|
|
|
|
|
|
confidence *= 0.50f; |
171
|
|
|
|
|
|
|
|
172
|
4
|
|
|
|
|
|
return confidence; |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
#ifdef DEBUG_chardet |
176
|
|
|
|
|
|
|
void nsLatin1Prober::DumpStatus() |
177
|
|
|
|
|
|
|
{ |
178
|
|
|
|
|
|
|
printf(" Latin1Prober: %1.3f [%s]\r\n", GetConfidence(), GetCharSetName()); |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
#endif |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
|