codeql-action/node_modules/jschardet/src/universaldetector.js
2024-09-09 13:21:27 -07:00

293 lines
11 KiB
JavaScript

/*
* The Original Code is Mozilla Universal charset detector code.
*
* The Initial Developer of the Original Code is
* Netscape Communications Corporation.
* Portions created by the Initial Developer are Copyright (C) 2001
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* António Afonso (antonio.afonso gmail.com) - port to JavaScript
* Mark Pilgrim - port to Python
* Shy Shalom - original C code
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
* 02110-1301 USA
*/
/**
* This is a port from the python port, version "2.0.1"
*/
var constants = require('./constants');
var MBCSGroupProber = require('./mbcsgroupprober');
var SBCSGroupProber = require('./sbcsgroupprober');
var Latin1Prober = require('./latin1prober');
var EscCharSetProber = require('./escprober');
var logger = require('./logger');
/**
 * Canonical names of every encoding the detector can report: the encodings
 * recognised from a byte-order mark, followed by whatever each prober
 * advertises through getSupportedCharsetNames().
 *
 * The list is used to validate `options.detectEncodings` and is printed
 * verbatim in the "not supported" error message.
 *
 * NOTE(review): UniversalDetector#close() can report "ascii", which is only
 * part of this list if one of the probers contributes it - confirm whether
 * "ascii" should be accepted in `detectEncodings`.
 */
const supportedEncodings = (function() {
    // Encodings identified purely by their BOM. Each name appears once so the
    // error message and the derived lookup list carry no duplicates.
    const BOM_UTF = [
        "UTF-8", "UTF-32LE", "UTF-32BE", "UTF-16LE", "UTF-16BE",
        "X-ISO-10646-UCS-4-3412", "X-ISO-10646-UCS-4-2143"
    ];
    // Throw-away prober instances, created only to ask for their charset names.
    const probers = [
        new EscCharSetProber(),
        new MBCSGroupProber(),
        new SBCSGroupProber(),
        new Latin1Prober()
    ];
    const encodings = BOM_UTF.slice(0);
    for (const prober of probers) {
        Array.prototype.push.apply(encodings, prober.getSupportedCharsetNames());
    }
    return encodings;
})();
/**
 * Lookup list used to validate user-supplied encoding names.
 *
 * For every supported encoding it holds two spellings: the lowercase name and
 * the lowercase name with all hyphens removed (e.g. "utf-8" and "utf8").
 *
 * Lowercasing is done with the locale-independent toLowerCase() because the
 * validation code compares against `encoding.toLowerCase()`; a locale-aware
 * conversion could map "I" to a dotless "ı" under some default locales and
 * make valid names such as "ISO-..." fail to match.
 */
const supportedEncodingsDenormalized = (function() {
    const denormalizedEncodings = [];
    for (const encoding of supportedEncodings) {
        const lowerCased = encoding.toLowerCase();
        denormalizedEncodings.push(lowerCased, lowerCased.replace(/-/g, ""));
    }
    return denormalizedEncodings;
})();
/**
 * Incremental character-encoding detector (JavaScript port of the Mozilla
 * universal charset detector).
 *
 * Typical use: construct, call feed() with successive chunks of the input,
 * call close(), then read `result` (best guess) and `results` (all candidates
 * collected so far).
 *
 * @param {Object} [options]
 * @param {number} [options.minimumThreshold] Lowest confidence close() will
 *     accept for its best guess. Defaults to 0 when `detectEncodings` is
 *     given (any confidence is acceptable once the caller has narrowed the
 *     candidates) and to 0.20 otherwise.
 * @param {string[]} [options.detectEncodings] Restricts detection to these
 *     encodings. Names are case-insensitive and may be written with or
 *     without hyphens ("UTF-8", "utf8"). The list is read once, at
 *     construction time. The options object itself is never modified.
 * @throws {Error} If an entry of `detectEncodings` is not a supported
 *     encoding name.
 */
function UniversalDetector(options) {
    const opts = options || {};

    // Coarse classification of the bytes seen so far; decides which probers run.
    const InputState = {
        pureAscii : 0,
        escAscii  : 1,
        highbyte  : 2
    };

    // Byte-order marks in the order they must be tested: the 4-byte marks come
    // before the 2-byte UTF-16 marks that are prefixes of some of them.
    const BOMS = [
        ["\xEF\xBB\xBF",     "UTF-8"],                   // EF BB BF
        ["\xFF\xFE\x00\x00", "UTF-32LE"],                // FF FE 00 00
        ["\x00\x00\xFE\xFF", "UTF-32BE"],                // 00 00 FE FF
        ["\xFE\xFF\x00\x00", "X-ISO-10646-UCS-4-3412"],  // FE FF 00 00 (unusual octet order)
        ["\x00\x00\xFF\xFE", "X-ISO-10646-UCS-4-2143"],  // 00 00 FF FE (unusual octet order)
        ["\xFF\xFE",         "UTF-16LE"],                // FF FE
        ["\xFE\xFF",         "UTF-16BE"]                 // FE FF
    ];

    // Comparison key for encoding names: lowercase, hyphens removed. Both
    // spellings accepted by the validation below collapse to the same key.
    function normalizeEncodingName(name) {
        return name.toLowerCase().replace(/-/g, "");
    }

    // Set of normalized names the caller allows, or null when unrestricted.
    let allowedEncodings = null;
    if (opts.detectEncodings) {
        allowedEncodings = new Set();
        for (const encoding of opts.detectEncodings) {
            if (typeof encoding !== "string" ||
                !supportedEncodingsDenormalized.includes(encoding.toLowerCase())) {
                throw new Error(`Encoding ${encoding} is not supported. Supported encodings: ${supportedEncodings}.`);
            }
            allowedEncodings.add(normalizeEncodingName(encoding));
        }
    }

    let minimumThreshold;
    if (typeof opts.minimumThreshold === "number") {
        minimumThreshold = opts.minimumThreshold;
    } else {
        // If encodings are narrowed down by the user allow for
        // any threshold to be returned.
        minimumThreshold = allowedEncodings ? 0 : 0.20;
    }

    /**
     * Tells whether `encoding` may be reported given `detectEncodings`.
     * Always true when no restriction was configured; otherwise a missing
     * (non-string) name is never allowed.
     */
    function canDetectEncoding(encoding) {
        if (!allowedEncodings) {
            return true;
        }
        return typeof encoding === "string" &&
            allowedEncodings.has(normalizeEncodingName(encoding));
    }

    /**
     * Returns the detector to its initial state so it can process a new
     * input. Existing prober instances are kept and reset.
     */
    this.reset = function() {
        this.result = {"encoding": null, "confidence": 0.0};
        this.results = [];
        this.done = false;
        this._mStart = true;
        this._mGotData = false;
        this._mInputState = InputState.pureAscii;
        this._mLastChar = [];
        this._mBOM = "";
        if (this._mEscCharsetProber) {
            this._mEscCharsetProber.reset();
        }
        for (const prober of this._mCharsetProbers) {
            prober.reset();
        }
    };

    /**
     * Feeds the next chunk of input to the detector.
     *
     * Does nothing once detection is done or when the chunk is empty. Sets
     * `done`, `result` and `results` as soon as a BOM or a prober settles on
     * an allowed encoding.
     *
     * @param {string} aBuf Chunk of input (sliced, split and regex-tested as
     *     a string, one character per byte).
     */
    this.feed = function(aBuf) {
        if (this.done) return;
        if (!aBuf.length) return;

        if (!this._mGotData) {
            this._mBOM += aBuf;
            // If the data starts with a BOM, we know it is UTF.
            const match = BOMS.find(
                ([mark, name]) => this._mBOM.startsWith(mark) && canDetectEncoding(name)
            );
            if (match) {
                this.result = {"encoding": match[1], "confidence": 1.0};
            }
            if (this.result.confidence > 0) {
                this.results = [this.result];
            }
            // If we got to 4 chars without being able to detect a BOM we
            // stop trying.
            if (this._mBOM.length > 3) {
                this._mGotData = true;
            }
        }

        if (this.result.encoding && (this.result.confidence > 0.0)) {
            this.done = true;
            return;
        }

        if (this._mInputState === InputState.pureAscii) {
            if (this._highBitDetector.test(aBuf)) {
                this._mInputState = InputState.highbyte;
            } else if (this._escDetector.test(this._mLastChar.join('') + aBuf)) {
                // The previous chunk's last char is prepended so a "~{"
                // sequence split across two chunks is still seen.
                this._mInputState = InputState.escAscii;
            }
        }
        this._mLastChar = aBuf.slice(-1).split('');

        if (this._mInputState === InputState.escAscii) {
            if (!this._mEscCharsetProber) {
                this._mEscCharsetProber = new EscCharSetProber();
            }
            const escProber = this._mEscCharsetProber;
            if (escProber.feed(aBuf) == constants.foundIt &&
                canDetectEncoding(escProber.getCharsetName())) {
                this.result = {
                    "encoding": escProber.getCharsetName(),
                    "confidence": escProber.getConfidence()
                };
                this.results = [this.result];
                this.done = true;
            }
        } else if (this._mInputState === InputState.highbyte) {
            // Probers are created lazily, on the first high-byte chunk.
            if (this._mCharsetProbers.length === 0) {
                this._mCharsetProbers = [
                    new MBCSGroupProber(),
                    new SBCSGroupProber(),
                    new Latin1Prober()
                ];
            }
            for (const prober of this._mCharsetProbers) {
                if (prober.feed(aBuf) == constants.foundIt &&
                    canDetectEncoding(prober.getCharsetName())) {
                    this.result = {
                        "encoding": prober.getCharsetName(),
                        "confidence": prober.getConfidence()
                    };
                    this.results = [this.result];
                    this.done = true;
                    break;
                }
            }
        }
    };

    /**
     * Finishes detection and picks the final result.
     *
     * @returns {{encoding: string, confidence: number}|undefined} The chosen
     *     result, or undefined when detection had already finished, no data
     *     was fed, or no candidate reached the minimum threshold. In the
     *     last case `results` still holds the collected candidates.
     */
    this.close = function() {
        if (this.done) return;
        if (this._mBOM.length === 0) {
            logger.log("no data received!\n");
            return;
        }
        this.done = true;

        if (this._mInputState === InputState.pureAscii && canDetectEncoding("ascii")) {
            logger.log("pure ascii")
            this.result = {"encoding": "ascii", "confidence": 1.0};
            this.results.push(this.result);
            return this.result;
        }

        if (this._mInputState === InputState.highbyte) {
            let windows1252Confidence = 0;
            let windows1250Detected = false;
            for (const prober of this._mCharsetProbers) {
                const charsetName = prober.getCharsetName();
                const confidence = prober.getConfidence();
                // Recorded before the filter below, so the windows-1250
                // fallback still works when only windows-1250 is allowed.
                if (charsetName === "windows-1252") {
                    windows1252Confidence = confidence;
                }
                if (!charsetName || !canDetectEncoding(charsetName)) continue;
                this.results.push({
                    "encoding": charsetName,
                    "confidence": confidence
                });
                if (charsetName === "windows-1250") {
                    windows1250Detected = true;
                }
                logger.log(charsetName + " confidence " + confidence);
            }
            // HACK: When windows-1252 is detected it's almost sure that it can
            // also be windows-1250.
            // https://en.wikipedia.org/wiki/Windows-1250 (Central European)
            if (windows1252Confidence && !windows1250Detected && canDetectEncoding("windows-1250")) {
                this.results.push({
                    "encoding": "windows-1250",
                    // Report the confidence a bit under windows-1252's: the
                    // amount subtracted is 0.5 raised to (number of characters
                    // in the printed confidence, minus one).
                    "confidence": windows1252Confidence - Math.pow(5/10, (String(windows1252Confidence).length - 1)),
                });
            }
            // Best candidate first.
            this.results.sort(function(a, b) {
                return b.confidence - a.confidence;
            });
            if (this.results.length > 0) {
                const topResult = this.results[0];
                if (topResult.confidence >= minimumThreshold) {
                    this.result = topResult;
                    return topResult;
                }
            }
        }

        if (logger.enabled) {
            logger.log("no probers hit minimum threshhold\n");
            for (const prober of this._mCharsetProbers) {
                if (!canDetectEncoding(prober.getCharsetName())) continue;
                logger.log(prober.getCharsetName() + " confidence = " +
                    prober.getConfidence() + "\n");
            }
        }
    };

    // Initial state.
    this._highBitDetector = /[\x80-\xFF]/;   // any byte with the high bit set
    this._escDetector = /(\x1B|~\{)/;        // ESC, or the "~{" sequence
    this._mEscCharsetProber = null;
    this._mCharsetProbers = [];
    this.reset();
}
module.exports = UniversalDetector;