File Coverage

src/nsHebrewProber.cpp
Criterion Covered Total %
statement 20 37 54.0
branch 13 46 28.2
condition n/a
subroutine n/a
pod n/a
total 33 83 39.7


line stmt bran cond sub pod time code
1             /* -*- Mode: C; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
2             /* ***** BEGIN LICENSE BLOCK *****
3             * Version: MPL 1.1/GPL 2.0/LGPL 2.1
4             *
5             * The contents of this file are subject to the Mozilla Public License Version
6             * 1.1 (the "License"); you may not use this file except in compliance with
7             * the License. You may obtain a copy of the License at
8             * http://www.mozilla.org/MPL/
9             *
10             * Software distributed under the License is distributed on an "AS IS" basis,
11             * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
12             * for the specific language governing rights and limitations under the
13             * License.
14             *
15             * The Original Code is Mozilla Universal charset detector code.
16             *
17             * The Initial Developer of the Original Code is
18             * Shy Shalom <shooshX@gmail.com>
19             * Portions created by the Initial Developer are Copyright (C) 2005
20             * the Initial Developer. All Rights Reserved.
21             *
22             * Contributor(s):
23             *
24             * Alternatively, the contents of this file may be used under the terms of
25             * either the GNU General Public License Version 2 or later (the "GPL"), or
26             * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
27             * in which case the provisions of the GPL or the LGPL are applicable instead
28             * of those above. If you wish to allow use of your version of this file only
29             * under the terms of either the GPL or the LGPL, and not to allow others to
30             * use your version of this file under the terms of the MPL, indicate your
31             * decision by deleting the provisions above and replace them with the notice
32             * and other provisions required by the GPL or the LGPL. If you do not delete
33             * the provisions above, a recipient may use your version of this file under
34             * the terms of any one of the MPL, the GPL or the LGPL.
35             *
36             * ***** END LICENSE BLOCK ***** */
37              
38             #include "nsHebrewProber.h"
39             #include <stdio.h>
40              
41             // windows-1255 / ISO-8859-8 code points of interest
               // Each FINAL_x macro is the final (word-ending) form of a letter and the
               // NORMAL_x macro that follows it is the regular form of the same letter.
               // All of them are single char literals, so they compare directly against
               // the char values passed to isFinal() and isNonFinal().
42             #define FINAL_KAF ('\xea')
43             #define NORMAL_KAF ('\xeb')
44             #define FINAL_MEM ('\xed')
45             #define NORMAL_MEM ('\xee')
46             #define FINAL_NUN ('\xef')
47             #define NORMAL_NUN ('\xf0')
48             #define FINAL_PE ('\xf3')
49             #define NORMAL_PE ('\xf4')
50             #define FINAL_TSADI ('\xf5')
51             #define NORMAL_TSADI ('\xf6')
52              
53             // Minimum Visual vs Logical final letter score difference.
54             // If the difference is below this, don't rely solely on the final letter score distance.
               // Compared against an integer score difference in GetCharSetName().
55             #define MIN_FINAL_CHAR_DISTANCE (5)
56              
57             // Minimum Visual vs Logical model score difference.
58             // If the difference is below this, don't rely at all on the model score distance.
               // Compared against a difference of two GetConfidence() values in GetCharSetName().
59             #define MIN_MODEL_DISTANCE (0.01)
60              
               // Charset names returned by GetCharSetName() for the visual and the logical
               // verdict respectively.
61             #define VISUAL_HEBREW_NAME ("ISO-8859-8")
62             #define LOGICAL_HEBREW_NAME ("windows-1255")
63              
63              
64 8           PRBool nsHebrewProber::isFinal(char c)
65             {
               // True when c equals one of the five final-form code points
               // (FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI); false otherwise.
               // The || chain short-circuits on the first match.
66 8 50         return ((c == FINAL_KAF) || (c == FINAL_MEM) || (c == FINAL_NUN) || (c == FINAL_PE) || (c == FINAL_TSADI));
    50          
    50          
67             }
68              
69 0           PRBool nsHebrewProber::isNonFinal(char c)
70             {
               // True when c equals NORMAL_KAF, NORMAL_MEM, NORMAL_NUN or NORMAL_PE.
               // NORMAL_TSADI is deliberately left out of the test; the comment after the
               // return statement gives the reason.
71 0 0         return ((c == NORMAL_KAF) || (c == NORMAL_MEM) || (c == NORMAL_NUN) || (c == NORMAL_PE));
    0          
72             // The normal Tsadi is not a good Non-Final letter due to words like
73             // 'lechotet' (to chat) containing an apostrophe after the tsadi. This
74             // apostrophe is converted to a space in FilterWithoutEnglishLetters causing
75             // the Non-Final tsadi to appear at an end of a word even though this is not
76             // the case in the original text.
77             // The letters Pe and Kaf rarely display a related behavior of not being a
78             // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
79             // example legally end with a Non-Final Pe or Kaf. However, the benefit of
80             // these letters as Non-Final letters outweighs the damage since these words
81             // are quite rare.
82             }
83              
84             /** HandleData
85             * Final letter analysis for logical-visual decision.
86             * Look for evidence that the received buffer is either logical Hebrew or
87             * visual Hebrew.
88             * The following cases are checked:
89             * 1) A word longer than 1 letter, ending with a final letter. This is an
90             * indication that the text is laid out "naturally" since the final letter
91             * really appears at the end. +1 for logical score.
92             * 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
93             * Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
94             * the Non-Final form of that letter. Exceptions to this rule are mentioned
95             * above in isNonFinal(). This is an indication that the text is laid out
96             * backwards. +1 for visual score
97             * 3) A word longer than 1 letter, starting with a final letter. Final letters
98             * should not appear at the beginning of a word. This is an indication that
99             * the text is laid out backwards. +1 for visual score.
100             *
101             * The visual score and logical score are accumulated throughout the text and
102             * are finally checked against each other in GetCharSetName().
103             * No checking for final letters in the middle of words is done since that case
104             * is not an indication for either Logical or Visual text.
105             *
106             * The input buffer should not contain any white spaces that are not (' ')
107             * or any low-ascii punctuation marks.
                *
                * aBuf is the start of the bytes to scan and aLen is the number of bytes;
                * the loop reads every byte from aBuf up to, but not including, aBuf+aLen.
                * Returns eNotMe when GetState() reports eNotMe (nothing is scanned in that
                * case), otherwise eDetecting.
                * mPrev and mBeforePrev are updated on every byte and are not reset here,
                * so the two-character look-behind carries over from one call to the next.
108             */
109 4           nsProbingState nsHebrewProber::HandleData(const char* aBuf, PRUint32 aLen)
110             {
111             // Both model probers say it's not them. No reason to continue.
112 4 50         if (GetState() == eNotMe)
113             return eNotMe;
114              
115 4           const char *curPtr, *endPtr = aBuf+aLen;
116             char cur;
117              
                // The (char*) cast is immediately converted back, because curPtr is
                // declared as const char*; the buffer is only read.
118 50 100         for (curPtr = (char*)aBuf; curPtr < endPtr; ++curPtr)
119             {
120 46           cur = *curPtr;
121 46 50         if (cur == ' ') // We stand on a space - a word just ended
122             {
123 0 0         if (mBeforePrev != ' ') // *(curPtr-2) was not a space so prev is not a 1 letter word
124             {
125 0 0         if (isFinal(mPrev)) // case (1) [-2:not space][-1:final letter][cur:space]
126 0           ++mFinalCharLogicalScore;
127 0 0         else if (isNonFinal(mPrev)) // case (2) [-2:not space][-1:Non-Final letter][cur:space]
128 0           ++mFinalCharVisualScore;
129             }
130             }
131             else // Not standing on a space
132             {
                // The (cur != ' ') test below is always true in this else branch; the
                // effective condition is a space two characters back followed by a
                // final letter.
133 46 100         if ((mBeforePrev == ' ') && (isFinal(mPrev)) && (cur != ' ')) // case (3) [-2:space][-1:final letter][cur:not space]
    50          
    0          
    50          
134 0           ++mFinalCharVisualScore;
135             }
                // Slide the look-behind window forward by one character.
136 46           mBeforePrev = mPrev;
137 46           mPrev = cur;
138             }
139              
140             // Forever detecting, till the end or until both model probers return eNotMe (handled above).
141             return eDetecting;
142             }
143              
144             // Make the decision: is it Logical or Visual?
                // Returns LOGICAL_HEBREW_NAME or VISUAL_HEBREW_NAME. The checks run in this
                // order and the first one that applies decides:
                //   1) final-letter score difference of at least MIN_FINAL_CHAR_DISTANCE
                //      in either direction;
                //   2) model confidence difference beyond MIN_MODEL_DISTANCE in either
                //      direction;
                //   3) a negative final-letter score difference selects visual;
                //   4) everything else, including a tie, selects logical.
145 0           const char* nsHebrewProber::GetCharSetName()
146             {
147             // If the final letter score distance is dominant enough, rely on it.
148 0           PRInt32 finalsub = mFinalCharLogicalScore - mFinalCharVisualScore;
149 0 0         if (finalsub >= MIN_FINAL_CHAR_DISTANCE)
150             return LOGICAL_HEBREW_NAME;
151 0 0         if (finalsub <= -(MIN_FINAL_CHAR_DISTANCE))
152             return VISUAL_HEBREW_NAME;
153              
154             // It's not dominant enough, try to rely on the model scores instead.
                // NOTE(review): mLogicalProb and mVisualProb are dereferenced without a
                // null check; presumably the owner sets both before this is called - confirm.
155 0           float modelsub = mLogicalProb->GetConfidence() - mVisualProb->GetConfidence();
156 0 0         if (modelsub > MIN_MODEL_DISTANCE)
157             return LOGICAL_HEBREW_NAME;
158 0 0         if (modelsub < -(MIN_MODEL_DISTANCE))
159             return VISUAL_HEBREW_NAME;
160              
161             // Still no good, back to final letter distance, maybe it'll save the day.
162 0 0         if (finalsub < 0)
163             return VISUAL_HEBREW_NAME;
164              
165             // (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
166 0           return LOGICAL_HEBREW_NAME;
167             }
168              
169              
170 8           void nsHebrewProber::Reset(void)
171             {
                // Clears both final-letter scores and re-seeds the two look-behind
                // characters that HandleData() reads and updates.
172 8           mFinalCharLogicalScore = 0;
173 8           mFinalCharVisualScore = 0;
174              
175             // mPrev and mBeforePrev are initialized to space in order to simulate a word
176             // delimiter at the beginning of the data
177 8           mPrev = ' ';
178 8           mBeforePrev = ' ';
179 8           }
180              
181 4           nsProbingState nsHebrewProber::GetState(void)
182             {
                // Returns eNotMe only when both mLogicalProb and mVisualProb report
                // eNotMe; in every other case returns eDetecting. No other state is
                // returned from this function.
183             // Remain active as long as any of the model probers are active.
184 4 50         if ((mLogicalProb->GetState() == eNotMe) && (mVisualProb->GetState() == eNotMe))
    0          
    50          
185             return eNotMe;
186 4           return eDetecting;
187             }
188              
189             #ifdef DEBUG_chardet
                // Compiled only when DEBUG_chardet is defined.
190             void nsHebrewProber::DumpStatus()
191             {
                // Prints the logical score followed by the visual score on one line via printf.
192             printf(" HEB: %d - %d [Logical-Visual score]\r\n", mFinalCharLogicalScore, mFinalCharVisualScore);
193             }
194             #endif