|
|
- 'use strict';
-
- const unicode = require('../common/unicode');
- const ERR = require('../common/error-codes');
-
- //Aliases: `$` is shorthand for the code point constants table exposed by ../common/unicode
- const $ = unicode.CODE_POINTS;
-
- //Const: initial value of `Preprocessor#bufferWaterline` (1 << 16 === 65536 UTF-16 code units)
- const DEFAULT_BUFFER_WATERLINE = 1 << 16;
-
//Preprocessor
//NOTE: HTML input preprocessing
//(see: https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream)
/**
 * Feeds code points from an incrementally written HTML buffer to a consumer.
 *
 * The consumer moves a cursor (`pos`) with `advance()` / `retreat()`. While advancing,
 * CR and CR+LF are normalized to a single LF, surrogate pairs are combined into one
 * code point, and problematic code points are reported through `_err()`.
 *
 * Positions that `advance()` stepped over without returning them on their own (the LF of
 * a CR+LF pair, the trailing half of a surrogate pair) are remembered as "gaps" so that
 * `retreat()` can step back over them in one move.
 */
class Preprocessor {
    constructor() {
        //NOTE: accumulated input; stays `null` until something is written.
        this.html = null;

        //NOTE: index of the most recently consumed code unit; -1 means nothing consumed yet.
        this.pos = -1;
        //NOTE: position of the most recent gap; -1 means there is none.
        this.lastGapPos = -1;
        //NOTE: index of the last code unit currently in the buffer.
        this.lastCharPos = -1;

        //NOTE: previous values of `lastGapPos`, restored one by one in `retreat()`.
        this.gapStack = [];

        //NOTE: set after a CR so that an immediately following LF is swallowed.
        this.skipNextNewLine = false;

        this.lastChunkWritten = false;
        this.endOfChunkHit = false;
        this.bufferWaterline = DEFAULT_BUFFER_WATERLINE;
    }

    /**
     * Error reporting hook.
     * Called with one of the `ERR` codes; the default implementation ignores it.
     */
    _err() {
        // NOTE: err reporting is noop by default. Enabled by mixin.
    }

    /**
     * Marks the current position as a gap and saves the previous gap position.
     */
    _addGap() {
        this.gapStack.push(this.lastGapPos);
        this.lastGapPos = this.pos;
    }

    /**
     * Resolves a surrogate code unit found at the current position.
     *
     * @param {number} cp Surrogate code unit at `this.pos`.
     * @returns {number} The combined code point when a pair is found (the cursor is moved
     *     onto the second unit), `$.EOF` when the buffer ends here and more chunks may
     *     still arrive, otherwise `cp` itself after reporting an isolated surrogate.
     */
    _processSurrogate(cp) {
        //NOTE: try to peek a surrogate pair
        if (this.pos !== this.lastCharPos) {
            const nextCp = this.html.charCodeAt(this.pos + 1);

            if (unicode.isSurrogatePair(nextCp)) {
                //NOTE: we have a surrogate pair. Peek pair character and recalculate code point.
                this.pos++;

                //NOTE: add gap that should be avoided during retreat
                this._addGap();

                return unicode.getSurrogatePairCodePoint(cp, nextCp);
            }
        }

        //NOTE: we are at the end of a chunk, therefore we can't infer surrogate pair yet.
        else if (!this.lastChunkWritten) {
            this.endOfChunkHit = true;
            return $.EOF;
        }

        //NOTE: isolated surrogate
        this._err(ERR.surrogateInInputStream);

        return cp;
    }

    /**
     * Discards the part of the buffer that precedes the current position once the cursor
     * has moved past `bufferWaterline`. The code unit at the cursor becomes index 0 and
     * all recorded gaps are forgotten.
     */
    dropParsedChunk() {
        if (this.pos > this.bufferWaterline) {
            this.lastCharPos -= this.pos;
            this.html = this.html.substring(this.pos);
            this.pos = 0;
            this.lastGapPos = -1;
            this.gapStack = [];
        }
    }

    /**
     * Appends a chunk of input to the buffer.
     *
     * @param {string} chunk Text to append.
     * @param {boolean} isLastChunk Whether no more chunks will follow.
     */
    write(chunk, isLastChunk) {
        //NOTE: a `null` (or empty) buffer is replaced rather than concatenated with.
        this.html = this.html ? this.html + chunk : chunk;

        this.lastCharPos = this.html.length - 1;
        this.endOfChunkHit = false;
        this.lastChunkWritten = isLastChunk;
    }

    /**
     * Inserts text right after the current position, so it is the next thing consumed.
     *
     * @param {string} chunk Text to insert.
     */
    insertHtmlAtCurrentPos(chunk) {
        //NOTE: nothing may have been written yet; treat the missing buffer as empty
        //instead of failing on `null.substring`.
        const html = this.html || '';
        const insertionPos = this.pos + 1;

        this.html = html.substring(0, insertionPos) + chunk + html.substring(insertionPos);

        this.lastCharPos = this.html.length - 1;
        this.endOfChunkHit = false;
    }

    /**
     * Moves the cursor forward and returns the code point found there.
     *
     * @returns {number} The next code point, with CR / CR+LF reported as a single LF, or
     *     `$.EOF` when the buffer is exhausted (`endOfChunkHit` is then set unless the
     *     last chunk has already been written).
     */
    advance() {
        this.pos++;

        if (this.pos > this.lastCharPos) {
            this.endOfChunkHit = !this.lastChunkWritten;
            return $.EOF;
        }

        let cp = this.html.charCodeAt(this.pos);

        //NOTE: any U+000A LINE FEED (LF) characters that immediately follow a U+000D CARRIAGE RETURN (CR) character
        //must be ignored.
        if (this.skipNextNewLine && cp === $.LINE_FEED) {
            this.skipNextNewLine = false;
            this._addGap();
            return this.advance();
        }

        //NOTE: all U+000D CARRIAGE RETURN (CR) characters must be converted to U+000A LINE FEED (LF) characters
        if (cp === $.CARRIAGE_RETURN) {
            this.skipNextNewLine = true;
            return $.LINE_FEED;
        }

        this.skipNextNewLine = false;

        if (unicode.isSurrogate(cp)) {
            cp = this._processSurrogate(cp);
        }

        //OPTIMIZATION: first check if code point is in the common allowed
        //range (ASCII alphanumeric, whitespaces, big chunk of BMP)
        //before going into detailed performance cost validation.
        //NOTE: CR never reaches this point (it is returned as LF above), so only LF is listed.
        const isCommonValidRange = (cp > 0x1f && cp < 0x7f) || cp === $.LINE_FEED || (cp > 0x9f && cp < 0xfdd0);

        if (!isCommonValidRange) {
            this._checkForProblematicCharacters(cp);
        }

        return cp;
    }

    /**
     * Reports `cp` through `_err()` if it is a control code point or a noncharacter.
     *
     * @param {number} cp Code point to classify.
     */
    _checkForProblematicCharacters(cp) {
        if (unicode.isControlCodePoint(cp)) {
            this._err(ERR.controlCharacterInInputStream);
        } else if (unicode.isUndefinedCodePoint(cp)) {
            this._err(ERR.noncharacterInInputStream);
        }
    }

    /**
     * Moves the cursor back by one consumed code point. If the cursor sits on a recorded
     * gap, the gap is stepped over as well and the previous gap position is restored.
     */
    retreat() {
        if (this.pos === this.lastGapPos) {
            this.lastGapPos = this.gapStack.pop();
            this.pos--;
        }

        this.pos--;
    }
}
-
- module.exports = Preprocessor;
|