/*
 * The Original Code is Mozilla Universal charset detector code.
 *
 * The Initial Developer of the Original Code is
 * Netscape Communications Corporation.
 * Portions created by the Initial Developer are Copyright (C) 2001
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *   António Afonso (antonio.afonso gmail.com) - port to JavaScript
 *   Mark Pilgrim - port to Python
 *   Shy Shalom - original C code
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
 * 02110-1301  USA
 */

// This prober doesn't actually recognize a language or a charset.
// It is a helper prober for the use of the Hebrew model probers.
//
////// General ideas of the Hebrew charset recognition //////
//
// Four main charsets exist in Hebrew:
//   "ISO-8859-8"   - Visual Hebrew
//   "windows-1255" - Logical Hebrew
//   "ISO-8859-8-I" - Logical Hebrew
//   "x-mac-hebrew" - ?? Logical Hebrew ??
//
// Both "ISO" charsets use a completely identical set of code points, whereas
// "windows-1255" and "x-mac-hebrew" are two different proper supersets of
// these code points. windows-1255 defines additional characters in the range
// 0x80-0x9F as some misc punctuation marks as well as some Hebrew-specific
// diacritics and additional 'Yiddish' ligature letters in the range 0xc0-0xd6.
// x-mac-hebrew defines similar additional code points but with a different
// mapping.
//
// As far as an average Hebrew text with no diacritics is concerned, all four
// charsets are identical with respect to code points. Meaning that for the
// main Hebrew alphabet, all four map the same values to all 27 Hebrew letters
// (including final letters).
//
// The dominant difference between these charsets is their directionality.
// "Visual" directionality means that the text is ordered as if the renderer is
// not aware of a BIDI rendering algorithm. The renderer sees the text and
// draws it from left to right. The text itself when ordered naturally is read
// backwards. A buffer of Visual Hebrew generally looks like so:
// "[last word of first line spelled backwards] [whole line ordered backwards
// and spelled backwards] [first word of first line spelled backwards]
// [end of line] [last word of second line] ... etc."
// Adding punctuation marks, numbers and English text to visual text is
// naturally also "visual" and from left to right.
//
// "Logical" directionality means the text is ordered "naturally" according to
// the order it is read. It is the responsibility of the renderer to display
// the text from right to left. A BIDI algorithm is used to place general
// punctuation marks, numbers and English text in the text.
//
// Texts in x-mac-hebrew are almost impossible to find on the Internet. From
// what little evidence I could find, it seems that its general directionality
// is Logical.
//
// To sum up all of the above, the Hebrew probing mechanism knows about two
// charsets:
// Visual Hebrew - "ISO-8859-8" - backwards text - Words and sentences are
//    backwards while line order is natural. For charset recognition purposes
//    the line order is unimportant (In fact, for this implementation, even
//    word order is unimportant).
// Logical Hebrew - "windows-1255" - normal, naturally ordered text.
//
// "ISO-8859-8-I" is a subset of windows-1255 and doesn't need to be
//    specifically identified.
// "x-mac-hebrew" is also identified as windows-1255. A text in x-mac-hebrew
//    that contains special punctuation marks or diacritics is displayed with
//    some unconverted characters showing as question marks. This problem might
//    be corrected using another model prober for x-mac-hebrew. Due to the fact
//    that x-mac-hebrew texts are so rare, writing another model prober isn't
//    worth the effort and performance hit.
//
//////// The Prober ////////
//
// The prober is divided between two SBCharSetProbers and a HebrewProber,
// all of which are managed, created, fed data, inquired and deleted by the
// SBCSGroupProber. The two SBCharSetProbers identify that the text is in
// fact some kind of Hebrew, Logical or Visual. The final decision about which
// one it is is made by the HebrewProber by combining final-letter scores
// with the scores of the two SBCharSetProbers to produce a final answer.
//
// The SBCSGroupProber is responsible for stripping the original text of HTML
// tags, English characters, numbers, low-ASCII punctuation characters, spaces
// and new lines. It reduces any sequence of such characters to a single space.
// The buffer fed to each prober in the SBCS group prober is pure text in
// high-ASCII.
// The two SBCharSetProbers (model probers) share the same language model:
// Win1255Model.
// The first SBCharSetProber uses the model normally as any other
// SBCharSetProber does, to recognize windows-1255, upon which this model was
// built. The second SBCharSetProber is told to make the pair-of-letter
// lookup in the language model backwards. This in practice exactly simulates
// a visual Hebrew model using the windows-1255 logical Hebrew model.
//
// The HebrewProber is not using any language model. All it does is look for
// final-letter evidence suggesting the text is either logical Hebrew or visual
// Hebrew. Disjointed from the model probers, the results of the HebrewProber
// alone are meaningless. HebrewProber always returns 0.00 as confidence
// since it never identifies a charset by itself. Instead, the pointer to the
// HebrewProber is passed to the model probers as a helper "Name Prober".
// When the Group prober receives a positive identification from any prober,
// it asks for the name of the charset identified. If the prober queried is a
// Hebrew model prober, the model prober forwards the call to the
// HebrewProber to make the final decision. In the HebrewProber, the
// decision is made according to the final-letter scores it maintains and both
// model probers' scores. The answer is returned in the form of the name of the
// charset identified, either "windows-1255" or "ISO-8859-8".

// Base prober whose constructor is applied to every HebrewProber instance and
// whose instance serves as HebrewProber.prototype.
var CharSetProber = require('./charsetprober');
// Shared probing-state values (constants.detecting, constants.notMe are used here).
var constants = require('./constants');

// Fallback for engines that do not provide Array.prototype.indexOf, adapted from
// https://developer.mozilla.org/En/Core_JavaScript_1.5_Reference/Objects/Array/IndexOf
//
// Called with the array as `this`. Returns the first index at or after the
// optional start position (second argument) whose element is strictly equal
// to `elt`, or -1 when there is none. Missing indices of sparse arrays are
// skipped. The start position is read from `arguments` so that the function
// keeps a declared length of 1.
function arrayIndexOfPolyfill(elt /*, from*/) {
    var len = this.length >>> 0;

    // A missing or non-numeric start position becomes 0; fractions are
    // truncated towards zero.
    var from = Number(arguments[1]) || 0;
    from = (from < 0) ? Math.ceil(from) : Math.floor(from);

    if (from < 0) {
        // A negative start counts back from the end of the array. Clamp at 0:
        // without the clamp a start below -len scans negative indices, and
        // -Infinity never terminates because incrementing it leaves it unchanged.
        from = Math.max(from + len, 0);
    }

    for (; from < len; from++) {
        if (from in this && this[from] === elt) {
            return from;
        }
    }
    return -1;
}

// Install the fallback only when the engine has no native implementation.
if (!Array.prototype.indexOf) {
    Array.prototype.indexOf = arrayIndexOfPolyfill;
}

/**
 * Helper prober that chooses between Logical Hebrew ("windows-1255") and
 * Visual Hebrew ("ISO-8859-8").
 *
 * It keeps two counters of final-letter evidence, updated by feed(), and in
 * getCharsetName() combines them with the confidences reported by the two
 * model probers attached through setModelProbers().
 *
 * getState() and feed() always dereference the model probers, and
 * getCharsetName() does so whenever the final-letter evidence is not
 * decisive, so setModelProbers() must be called before any of them is used.
 *
 * @constructor
 */
function HebrewProber() {
    CharSetProber.apply(this);

    // windows-1255 / ISO-8859-8 code points of interest
    var FINAL_KAF = '\xea';
    var NORMAL_KAF = '\xeb';
    var FINAL_MEM = '\xed';
    var NORMAL_MEM = '\xee';
    var FINAL_NUN = '\xef';
    var NORMAL_NUN = '\xf0';
    var FINAL_PE = '\xf3';
    var NORMAL_PE = '\xf4';
    var FINAL_TSADI = '\xf5';
    // Listed for completeness only; deliberately not treated as a Non-Final
    // letter (see isNonFinal).
    var NORMAL_TSADI = '\xf6';

    // Lookup lists built once per prober instead of on every isFinal() /
    // isNonFinal() call, since both run for each character fed.
    var FINAL_LETTERS = [FINAL_KAF, FINAL_MEM, FINAL_NUN, FINAL_PE, FINAL_TSADI];
    var NON_FINAL_LETTERS = [NORMAL_KAF, NORMAL_MEM, NORMAL_NUN, NORMAL_PE];

    // Minimum Visual vs Logical final letter score difference.
    // If the difference is below this, don't rely solely on the final letter score distance.
    var MIN_FINAL_CHAR_DISTANCE = 5;

    // Minimum Visual vs Logical model score difference.
    // If the difference is below this, don't rely at all on the model score distance.
    var MIN_MODEL_DISTANCE = 0.01;

    var VISUAL_HEBREW_NAME = "ISO-8859-8";
    var LOGICAL_HEBREW_NAME = "windows-1255";

    var self = this;

    // Starts with no model probers attached and a clean final-letter state.
    function init() {
        self._mLogicalProber = null;
        self._mVisualProber = null;
        self.reset();
    }

    /**
     * Clears the final-letter scores and the two-character look-behind.
     * The attached model probers are left untouched.
     */
    this.reset = function() {
        this._mFinalCharLogicalScore = 0;
        this._mFinalCharVisualScore = 0;
        // The two last characters seen in the previous buffer,
        // mPrev and mBeforePrev are initialized to space in order to simulate a word
        // delimiter at the beginning of the data
        this._mPrev = " ";
        this._mBeforePrev = " ";
        // These probers are owned by the group prober.
    };

    /**
     * Attaches the two model probers whose state and confidence this prober
     * consults.
     *
     * @param logicalProber prober for logical Hebrew; must provide getState()
     *                      and getConfidence()
     * @param visualProber  prober for visual Hebrew; same requirements
     */
    this.setModelProbers = function(logicalProber, visualProber) {
        this._mLogicalProber = logicalProber;
        this._mVisualProber = visualProber;
    };

    /**
     * @param c a single character
     * @returns {boolean} true when c is one of the five final-form letters
     *                    (Kaf, Mem, Nun, Pe, Tsadi)
     */
    this.isFinal = function(c) {
        return FINAL_LETTERS.indexOf(c) !== -1;
    };

    /**
     * @param c a single character
     * @returns {boolean} true when c is the normal (Non-Final) form of Kaf,
     *                    Mem, Nun or Pe
     */
    this.isNonFinal = function(c) {
        // The normal Tsadi is not a good Non-Final letter due to words like
        // 'lechotet' (to chat) containing an apostrophe after the tsadi. This
        // apostrophe is converted to a space in FilterWithoutEnglishLetters causing
        // the Non-Final tsadi to appear at an end of a word even though this is not
        // the case in the original text.
        // The letters Pe and Kaf rarely display a related behavior of not being a
        // good Non-Final letter. Words like 'Pop', 'Winamp' and 'Mubarak' for
        // example legally end with a Non-Final Pe or Kaf. However, the benefit of
        // these letters as Non-Final letters outweighs the damage since these words
        // are quite rare.
        return NON_FINAL_LETTERS.indexOf(c) !== -1;
    };

    /**
     * Accumulates final-letter evidence from a chunk of text. The look-behind
     * state is kept between calls, so words may span successive buffers.
     *
     * @param aBuf text to analyse; it is first passed through
     *             this.filterHighBitOnly(), which is not defined in this
     *             function (presumably supplied by CharSetProber - confirm)
     * @returns constants.notMe when both model probers have given up,
     *          otherwise constants.detecting
     */
    this.feed = function(aBuf) {
        // Final letter analysis for logical-visual decision.
        // Look for evidence that the received buffer is either logical Hebrew or
        // visual Hebrew.
        // The following cases are checked:
        // 1) A word longer than 1 letter, ending with a final letter. This is an
        //    indication that the text is laid out "naturally" since the final letter
        //    really appears at the end. +1 for logical score.
        // 2) A word longer than 1 letter, ending with a Non-Final letter. In normal
        //    Hebrew, words ending with Kaf, Mem, Nun, Pe or Tsadi, should not end with
        //    the Non-Final form of that letter. Exceptions to this rule are mentioned
        //    above in isNonFinal(). This is an indication that the text is laid out
        //    backwards. +1 for visual score
        // 3) A word longer than 1 letter, starting with a final letter. Final letters
        //    should not appear at the beginning of a word. This is an indication that
        //    the text is laid out backwards. +1 for visual score.
        //
        // The visual score and logical score are accumulated throughout the text and
        // are finally checked against each other in getCharsetName().
        // No checking for final letters in the middle of words is done since that case
        // is not an indication for either Logical or Visual text.
        //
        // We automatically filter out all 7-bit characters (replace them with spaces)
        // so the word boundary detection works properly. [MAP]

        if (this.getState() === constants.notMe) {
            // Both model probers say it's not them. No reason to continue.
            return constants.notMe;
        }

        aBuf = this.filterHighBitOnly(aBuf);

        for (var i = 0; i < aBuf.length; i++) {
            var cur = aBuf[i];
            if (cur === " ") {
                // We stand on a space - a word just ended
                if (this._mBeforePrev !== " ") {
                    // next-to-last char was not a space so _mPrev is not a 1 letter word
                    if (this.isFinal(this._mPrev)) {
                        // case (1) [-2:not space][-1:final letter][cur:space]
                        this._mFinalCharLogicalScore++;
                    } else if (this.isNonFinal(this._mPrev)) {
                        // case (2) [-2:not space][-1:Non-Final letter][cur:space]
                        this._mFinalCharVisualScore++;
                    }
                }
            } else if (this._mBeforePrev === " " && this.isFinal(this._mPrev)) {
                // Not standing on a space (cur is already known to be a non-space here)
                // case (3) [-2:space][-1:final letter][cur:not space]
                this._mFinalCharVisualScore++;
            }
            this._mBeforePrev = this._mPrev;
            this._mPrev = cur;
        }

        // Forever detecting, till the end or until both model probers return
        // constants.notMe (handled above)
        return constants.detecting;
    };

    /**
     * Makes the decision: is it Logical or Visual?
     *
     * @returns {string} "windows-1255" for logical Hebrew or "ISO-8859-8" for
     *                   visual Hebrew
     */
    this.getCharsetName = function() {
        // If the final letter score distance is dominant enough, rely on it.
        var finalsub = this._mFinalCharLogicalScore - this._mFinalCharVisualScore;
        if (finalsub >= MIN_FINAL_CHAR_DISTANCE) {
            return LOGICAL_HEBREW_NAME;
        }
        if (finalsub <= -MIN_FINAL_CHAR_DISTANCE) {
            return VISUAL_HEBREW_NAME;
        }

        // It's not dominant enough, try to rely on the model scores instead.
        var modelsub = this._mLogicalProber.getConfidence() - this._mVisualProber.getConfidence();
        if (modelsub > MIN_MODEL_DISTANCE) {
            return LOGICAL_HEBREW_NAME;
        }
        if (modelsub < -MIN_MODEL_DISTANCE) {
            return VISUAL_HEBREW_NAME;
        }

        // Still no good, back to final letter distance, maybe it'll save the day.
        if (finalsub < 0) {
            return VISUAL_HEBREW_NAME;
        }

        // (finalsub > 0 - Logical) or (don't know what to do) default to Logical.
        return LOGICAL_HEBREW_NAME;
    };

    /**
     * @returns constants.notMe only when both model probers report
     *          constants.notMe, otherwise constants.detecting
     */
    this.getState = function() {
        // Remain active as long as any of the model probers are active.
        if (this._mLogicalProber.getState() === constants.notMe &&
            this._mVisualProber.getState() === constants.notMe) {
            return constants.notMe;
        }
        return constants.detecting;
    };

    init();
}
// HebrewProber instances inherit from a CharSetProber instance through the
// prototype chain.
HebrewProber.prototype = new CharSetProber();

module.exports = HebrewProber;