File Coverage

src/nsUniversalDetector.cpp

Criterion	Covered	Total	%
statement	57	106	53.7
branch	42	118	35.5
condition			n/a
subroutine			n/a
pod			n/a
total	99	224	44.2

line	stmt	bran	code
1			/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2			/* *** BEGIN LICENSE BLOCK ***
3			* Version: MPL 1.1/GPL 2.0/LGPL 2.1
4			*
5			* The contents of this file are subject to the Mozilla Public License Version
6			* 1.1 (the "License"); you may not use this file except in compliance with
7			* the License. You may obtain a copy of the License at
8			* http://www.mozilla.org/MPL/
9			*
10			* Software distributed under the License is distributed on an "AS IS" basis,
11			* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12			* for the specific language governing rights and limitations under the
13			* License.
14			*
15			* The Original Code is Mozilla Universal charset detector code.
16			*
17			* The Initial Developer of the Original Code is
18			* Netscape Communications Corporation.
19			* Portions created by the Initial Developer are Copyright (C) 2001
20			* the Initial Developer. All Rights Reserved.
21			*
22			* Contributor(s):
23			* Shy Shalom <shooshX@gmail.com>
24			*
25			* Alternatively, the contents of this file may be used under the terms of
26			* either the GNU General Public License Version 2 or later (the "GPL"), or
27			* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
28			* in which case the provisions of the GPL or the LGPL are applicable instead
29			* of those above. If you wish to allow use of your version of this file only
30			* under the terms of either the GPL or the LGPL, and not to allow others to
31			* use your version of this file under the terms of the MPL, indicate your
32			* decision by deleting the provisions above and replace them with the notice
33			* and other provisions required by the GPL or the LGPL. If you do not delete
34			* the provisions above, a recipient may use your version of this file under
35			* the terms of any one of the MPL, the GPL or the LGPL.
36			*
37			* *** END LICENSE BLOCK *** */
38
39			#include "nscore.h"
40
41			#include "nsUniversalDetector.h"
42
43			#include "nsMBCSGroupProber.h"
44			#include "nsSBCSGroupProber.h"
45			#include "nsEscCharsetProber.h"
46			#include "nsLatin1Prober.h"
47
48	4		nsUniversalDetector::nsUniversalDetector()
49			{
50	4		mDone = PR_FALSE;
51	4		mBestGuess = -1; //illegal value as signal
52	4		mInTag = PR_FALSE;
53	4		mEscCharSetProber = nsnull;
54
55	4		mStart = PR_TRUE;
56	4		mDetectedCharset = nsnull;
57	4		mGotData = PR_FALSE;
58	4		mInputState = ePureAscii;
59	4		mLastChar = '\0';
60
61			PRUint32 i;
62	16	100	for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
63	12		mCharSetProbers[i] = nsnull;
64	4		}
65
66	8		nsUniversalDetector::~nsUniversalDetector()
67			{
68	16	100	for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
69	12	50	if (mCharSetProbers[i])
70	12	50	delete mCharSetProbers[i];
71	4	50	if (mEscCharSetProber)
72	0	0	delete mEscCharSetProber;
73	0		}
74
75			void
76	0		nsUniversalDetector::Reset()
77			{
78	0		mDone = PR_FALSE;
79	0		mBestGuess = -1; //illegal value as signal
80	0		mInTag = PR_FALSE;
81
82	0		mStart = PR_TRUE;
83	0		mDetectedCharset = nsnull;
84	0		mGotData = PR_FALSE;
85	0		mInputState = ePureAscii;
86	0		mLastChar = '\0';
87
88	0	0	if (mEscCharSetProber)
89	0		mEscCharSetProber->Reset();
90
91			PRUint32 i;
92	0	0	for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
93	0	0	if (mCharSetProbers[i])
94	0		mCharSetProbers[i]->Reset();
95	0		}
96
97			//---------------------------------------------------------------------
98			#define SHORTCUT_THRESHOLD (float)0.95
99			#define MINIMUM_THRESHOLD (float)0.20
100
101	4		nsresult nsUniversalDetector::HandleData(const char* aBuf, PRUint32 aLen)
102			{
103	4	50	if(mDone)
104			return NS_OK;
105
106	4	50	if (aLen > 0)
107	4		mGotData = PR_TRUE;
108
109			//If the data starts with BOM, we know it is UTF
110	4	50	if (mStart)
111			{
112	4		mStart = PR_FALSE;
113	4	50	if (aLen > 3)
114	4		switch (aBuf[0])
115			{
116			case '\xEF':
117	0	0	if (('\xBB' == aBuf[1]) && ('\xBF' == aBuf[2]))
		0
118			// EF BB BF UTF-8 encoded BOM
119	0		mDetectedCharset = "UTF-8";
120			break;
121			case '\xFE':
122	0	0	if (('\xFF' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
		0
		0
123			// FE FF 00 00 UCS-4, unusual octet order BOM (3412)
124	0		mDetectedCharset = "X-ISO-10646-UCS-4-3412";
125	0	0	else if ('\xFF' == aBuf[1])
126			// FE FF UTF-16, big endian BOM
127	0		mDetectedCharset = "UTF-16BE";
128			break;
129			case '\x00':
130	0	0	if (('\x00' == aBuf[1]) && ('\xFE' == aBuf[2]) && ('\xFF' == aBuf[3]))
		0
		0
131			// 00 00 FE FF UTF-32, big-endian BOM
132	0		mDetectedCharset = "UTF-32BE";
133	0	0	else if (('\x00' == aBuf[1]) && ('\xFF' == aBuf[2]) && ('\xFE' == aBuf[3]))
		0
		0
134			// 00 00 FF FE UCS-4, unusual octet order BOM (2143)
135	0		mDetectedCharset = "X-ISO-10646-UCS-4-2143";
136			break;
137			case '\xFF':
138	0	0	if (('\xFE' == aBuf[1]) && ('\x00' == aBuf[2]) && ('\x00' == aBuf[3]))
		0
		0
139			// FF FE 00 00 UTF-32, little-endian BOM
140	0		mDetectedCharset = "UTF-32LE";
141	0	0	else if ('\xFE' == aBuf[1])
142			// FF FE UTF-16, little endian BOM
143	0		mDetectedCharset = "UTF-16LE";
144			break;
145			} // switch
146
147	4	50	if (mDetectedCharset)
148			{
149	0		mDone = PR_TRUE;
150	0		return NS_OK;
151			}
152			}
153
154			PRUint32 i;
155	50	100	for (i = 0; i < aLen; i++)
156			{
157			//other than 0xa0, if every other character is ascii, the page is ascii
158	46	100	if (aBuf[i] & '\x80' && aBuf[i] != '\xA0') //since many ASCII-only pages contain NBSP
159			{
160			//we got a non-ascii byte (high-byte)
161	28	100	if (mInputState != eHighbyte)
162			{
163			//adjust state
164	4		mInputState = eHighbyte;
165
166			//kill mEscCharSetProber if it is active
167	4	50	if (mEscCharSetProber) {
168	0	0	delete mEscCharSetProber;
169	0		mEscCharSetProber = nsnull;
170			}
171
172			//start multibyte and singlebyte charset prober
173	4	50	if (nsnull == mCharSetProbers[0])
174	4	50	mCharSetProbers[0] = new nsMBCSGroupProber;
175	4	50	if (nsnull == mCharSetProbers[1])
176	4	50	mCharSetProbers[1] = new nsSBCSGroupProber;
177	4	50	if (nsnull == mCharSetProbers[2])
178	8		mCharSetProbers[2] = new nsLatin1Prober;
179
180	4	50	if ((nsnull == mCharSetProbers[0]) ||
		50
181	4	50	(nsnull == mCharSetProbers[1]) ||
182	4		(nsnull == mCharSetProbers[2]))
183			return NS_ERROR_OUT_OF_MEMORY;
184			}
185			}
186			else
187			{
188			//ok, just pure ascii so far
189	18	100	if ( ePureAscii == mInputState &&
		50
190	2	50	(aBuf[i] == '\033' || (aBuf[i] == '{' && mLastChar == '~')) )
		0
191			{
192			//found escape character or HZ "~{"
193	0		mInputState = eEscAscii;
194			}
195	18		mLastChar = aBuf[i];
196			}
197			}
198
199			nsProbingState st;
200	4		switch (mInputState)
201			{
202			case eEscAscii:
203	0	0	if (nsnull == mEscCharSetProber) {
204	0	0	mEscCharSetProber = new nsEscCharSetProber;
205	0	0	if (nsnull == mEscCharSetProber)
206			return NS_ERROR_OUT_OF_MEMORY;
207			}
208	0		st = mEscCharSetProber->HandleData(aBuf, aLen);
209	0	0	if (st == eFoundIt)
210			{
211	0		mDone = PR_TRUE;
212	0		mDetectedCharset = mEscCharSetProber->GetCharSetName();
213			}
214			break;
215			case eHighbyte:
216	16	100	for (i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
217			{
218	12		st = mCharSetProbers[i]->HandleData(aBuf, aLen);
219	12	50	if (st == eFoundIt)
220			{
221	0		mDone = PR_TRUE;
222	0		mDetectedCharset = mCharSetProbers[i]->GetCharSetName();
223	0		return NS_OK;
224			}
225			}
226			break;
227
228			default: //pure ascii
229			;//do nothing here
230			}
231			return NS_OK;
232			}
233
234
235			//---------------------------------------------------------------------
236	4		void nsUniversalDetector::DataEnd()
237			{
238	4	50	if (!mGotData)
239			{
240			// we have not received any data yet, so return immediately;
241			// the calling program sometimes calls DataEnd before anything has been sent to the detector
242			return;
243			}
244
245	4	50	if (mDetectedCharset)
246			{
247	0		mDone = PR_TRUE;
248	0		Report(mDetectedCharset);
249	0		return;
250			}
251
252	4	50	switch (mInputState)
253			{
254			case eHighbyte:
255			{
256			float proberConfidence;
257			float maxProberConfidence = (float)0.0;
258			PRInt32 maxProber = 0;
259
260	16	100	for (PRInt32 i = 0; i < NUM_OF_CHARSET_PROBERS; i++)
261			{
262	12		proberConfidence = mCharSetProbers[i]->GetConfidence();
263	12	100	if (proberConfidence > maxProberConfidence)
264			{
265			maxProberConfidence = proberConfidence;
266			maxProber = i;
267			}
268			}
269			//if we are not confident enough, do not report anything; that is in fact a negative answer
270	4	50	if (maxProberConfidence > MINIMUM_THRESHOLD)
271	4		Report(mCharSetProbers[maxProber]->GetCharSetName());
272			}
273			break;
274			case eEscAscii:
275			break;
276			default:
277			;
278			}
279			return;
280			}