File Coverage

src/nsUniversalDetector.cpp
Criterion Covered Total %
statement 57 106 53.7
branch 42 118 35.5
condition n/a
subroutine n/a
pod n/a
total 99 224 44.2


line stmt bran cond sub pod time code
1             /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2             /* ***** BEGIN LICENSE BLOCK *****
3             * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4             *
5             * The contents of this file are subject to the Mozilla Public License Version
6             * 1.1 (the "License"); you may not use this file except in compliance with
7             * the License. You may obtain a copy of the License at
8             * http://www.mozilla.org/MPL/
9             *
10             * Software distributed under the License is distributed on an "AS IS" basis,
11             * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12             * for the specific language governing rights and limitations under the
13             * License.
14             *
15             * The Original Code is Mozilla Universal charset detector code.
16             *
17             * The Initial Developer of the Original Code is
18             * Netscape Communications Corporation.
19             * Portions created by the Initial Developer are Copyright (C) 2001
20             * the Initial Developer. All Rights Reserved.
21             *
22             * Contributor(s):
23             * Shy Shalom <shooshX@gmail.com>
24             *
25             * Alternatively, the contents of this file may be used under the terms of
26             * either the GNU General Public License Version 2 or later (the "GPL"), or
27             * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28             * in which case the provisions of the GPL or the LGPL are applicable instead
29             * of those above. If you wish to allow use of your version of this file only
30             * under the terms of either the GPL or the LGPL, and not to allow others to
31             * use your version of this file under the terms of the MPL, indicate your
32             * decision by deleting the provisions above and replace them with the notice
33             * and other provisions required by the GPL or the LGPL. If you do not delete
34             * the provisions above, a recipient may use your version of this file under
35             * the terms of any one of the MPL, the GPL or the LGPL.
36             *
37             * ***** END LICENSE BLOCK ***** */
38              
39             #include "nscore.h"
40              
41             #include "nsUniversalDetector.h"
42              
43             #include "nsMBCSGroupProber.h"
44             #include "nsSBCSGroupProber.h"
45             #include "nsEscCharsetProber.h"
46             #include "nsLatin1Prober.h"
47              
48 4           nsUniversalDetector::nsUniversalDetector()
49             {
50 4           mDone = PR_FALSE;
51 4           mBestGuess = -1; //illegal value as signal
52 4           mInTag = PR_FALSE;
53 4           mEscCharSetProber = nsnull;
54              
55 4           mStart = PR_TRUE;
56 4           mDetectedCharset = nsnull;
57 4           mGotData = PR_FALSE;
58 4           mInputState = ePureAscii;
59 4           mLastChar = '\0';
60              
61             PRUint32 i;
62 16 100         for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
63 12           mCharSetProbers[i] = nsnull;
64 4           }
65              
66 8           nsUniversalDetector::~nsUniversalDetector()
67             {
68 16 100         for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
69 12 50         if (mCharSetProbers[i])
70 12 50         delete mCharSetProbers[i];
71 4 50         if (mEscCharSetProber)
72 0 0         delete mEscCharSetProber;
73 0           }
74              
75             void
76 0           nsUniversalDetector::Reset()
77             {
78 0           mDone = PR_FALSE;
79 0           mBestGuess = -1; //illegal value as signal
80 0           mInTag = PR_FALSE;
81              
82 0           mStart = PR_TRUE;
83 0           mDetectedCharset = nsnull;
84 0           mGotData = PR_FALSE;
85 0           mInputState = ePureAscii;
86 0           mLastChar = '\0';
87              
88 0 0         if (mEscCharSetProber)
89 0           mEscCharSetProber->Reset();
90              
91             PRUint32 i;
92 0 0         for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
93 0 0         if (mCharSetProbers[i])
94 0           mCharSetProbers[i]->Reset();
95 0           }
96              
97             //---------------------------------------------------------------------
98             #define SHORTCUT_THRESHOLD (float)0.95
99             #define MINIMUM_THRESHOLD (float)0.20
100              
101 4           nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
102             {
103 4 50         if(mDone)
104             return NS_OK;
105              
106 4 50         if (aLen > 0)
107 4           mGotData = PR_TRUE;
108              
109             //If the data starts with BOM, we know it is UTF
110 4 50         if (mStart)
111             {
112 4           mStart = PR_FALSE;
113 4 50         if (aLen > 3)
114 4           switch (aBuf[0])
115             {
116             case '\xEF':
117 0 0         if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
    0          
118             // EF BB BF UTF-8 encoded BOM
119 0           mDetectedCharset = "UTF-8";
120             break;
121             case '\xFE':
122 0 0         if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
    0          
    0          
123             // FE FF 00 00 UCS-4, unusual octet order BOM (3412)
124 0           mDetectedCharset = "X-ISO-10646-UCS-4-3412";
125 0 0         else if ('\xFF' == aBuf[1])
126             // FE FF UTF-16, big endian BOM
127 0           mDetectedCharset = "UTF-16BE";
128             break;
129             case '\x00':
130 0 0         if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
    0          
    0          
131             // 00 00 FE FF UTF-32, big-endian BOM
132 0           mDetectedCharset = "UTF-32BE";
133 0 0         else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
    0          
    0          
134             // 00 00 FF FE UCS-4, unusual octet order BOM (2143)
135 0           mDetectedCharset = "X-ISO-10646-UCS-4-2143";
136             break;
137             case '\xFF':
138 0 0         if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
    0          
    0          
139             // FF FE 00 00 UTF-32, little-endian BOM
140 0           mDetectedCharset = "UTF-32LE";
141 0 0         else if ('\xFE' == aBuf[1])
142             // FF FE UTF-16, little endian BOM
143 0           mDetectedCharset = "UTF-16LE";
144             break;
145             } // switch
146              
147 4 50         if (mDetectedCharset)
148             {
149 0           mDone = PR_TRUE;
150 0           return NS_OK;
151             }
152             }
153            
154             PRUint32 i;
155 50 100         for (i = 0; i < aLen; i++)
156             {
157             //other than 0xa0, if every other character is ascii, the page is ascii
158 46 100         if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //since many ASCII-only pages contain NBSP
159             {
160             //we got a non-ascii byte (high-byte)
161 28 100         if (mInputState != eHighbyte)
162             {
163             //adjust state
164 4           mInputState = eHighbyte;
165              
166             //kill mEscCharSetProber if it is active
167 4 50         if (mEscCharSetProber) {
168 0 0         delete mEscCharSetProber;
169 0           mEscCharSetProber = nsnull;
170             }
171              
172             //start multibyte and singlebyte charset prober
173 4 50         if (nsnull == mCharSetProbers[0])
174 4 50         mCharSetProbers[0] = new nsMBCSGroupProber;
175 4 50         if (nsnull == mCharSetProbers[1])
176 4 50         mCharSetProbers[1] = new nsSBCSGroupProber;
177 4 50         if (nsnull == mCharSetProbers[2])
178 8           mCharSetProbers[2] = new nsLatin1Prober;
179              
180 4 50         if ((nsnull == mCharSetProbers[0]) ||
    50          
181 4 50         (nsnull == mCharSetProbers[1]) ||
182 4           (nsnull == mCharSetProbers[2]))
183             return NS_ERROR_OUT_OF_MEMORY;
184             }
185             }
186             else
187             {
188             //ok, just pure ascii so far
189 18 100         if ( ePureAscii == mInputState &&
    50          
190 2 50         (aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
    0          
191             {
192             //found escape character or HZ "~{"
193 0           mInputState = eEscAscii;
194             }
195 18           mLastChar = aBuf[i];
196             }
197             }
198              
199             nsProbingState st;
200 4           switch (mInputState)
201             {
202             case eEscAscii:
203 0 0         if (nsnull == mEscCharSetProber) {
204 0 0         mEscCharSetProber = new nsEscCharSetProber;
205 0 0         if (nsnull == mEscCharSetProber)
206             return NS_ERROR_OUT_OF_MEMORY;
207             }
208 0           st = mEscCharSetProber->HandleData(aBuf, aLen);
209 0 0         if (st == eFoundIt)
210             {
211 0           mDone = PR_TRUE;
212 0           mDetectedCharset = mEscCharSetProber->GetCharSetName();
213             }
214             break;
215             case eHighbyte:
216 16 100         for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
217             {
218 12           st = mCharSetProbers[i]->HandleData(aBuf, aLen);
219 12 50         if (st == eFoundIt)
220             {
221 0           mDone = PR_TRUE;
222 0           mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
223 0           return NS_OK;
224             }
225             }
226             break;
227              
228             default: //pure ascii
229             ;//do nothing here
230             }
231             return NS_OK;
232             }
233              
234              
235             //---------------------------------------------------------------------
236 4           void nsUniversalDetector::DataEnd()
237             {
238 4 50         if (!mGotData)
239             {
240             // we haven't got any data yet, return immediately
241             // the caller sometimes calls DataEnd before anything has been sent to the detector
242             return;
243             }
244              
245 4 50         if (mDetectedCharset)
246             {
247 0           mDone = PR_TRUE;
248 0           Report(mDetectedCharset);
249 0           return;
250             }
251            
252 4 50         switch (mInputState)
253             {
254             case eHighbyte:
255             {
256             float proberConfidence;
257             float maxProberConfidence = (float)0.0;
258             PRInt32 maxProber = 0;
259              
260 16 100         for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
261             {
262 12           proberConfidence = mCharSetProbers[i]->GetConfidence();
263 12 100         if (proberConfidence > maxProberConfidence)
264             {
265             maxProberConfidence = proberConfidence;
266             maxProber = i;
267             }
268             }
269             //report only when the best confidence exceeds the minimum threshold; otherwise report nothing, which is in effect a negative answer
270 4 50         if (maxProberConfidence > MINIMUM_THRESHOLD)
271 4           Report(mCharSetProbers[maxProber]->GetCharSetName());
272             }
273             break;
274             case eEscAscii:
275             break;
276             default:
277             ;
278             }
279             return;
280             }