调整分包,完成隐私协议(样式待优化)

This commit is contained in:
2024-07-18 14:18:05 +08:00
parent 765b3ad111
commit 00384d7382
1045 changed files with 34254 additions and 98 deletions

View File

@@ -0,0 +1,492 @@
var Tokenizer_js_1 = require("./Tokenizer.js");
var decode_js_1 = require("./entities/decode.js");
var formTags = new Set([
"input",
"option",
"optgroup",
"select",
"button",
"datalist",
"textarea",
]);
var pTag = new Set(["p"]);
var tableSectionTags = new Set(["thead", "tbody"]);
var ddtTags = new Set(["dd", "dt"]);
var rtpTags = new Set(["rt", "rp"]);
var openImpliesClose = new Map([
["tr", new Set(["tr", "th", "td"])],
["th", new Set(["th"])],
["td", new Set(["thead", "th", "td"])],
["body", new Set(["head", "link", "script"])],
["li", new Set(["li"])],
["p", pTag],
["h1", pTag],
["h2", pTag],
["h3", pTag],
["h4", pTag],
["h5", pTag],
["h6", pTag],
["select", formTags],
["input", formTags],
["output", formTags],
["button", formTags],
["datalist", formTags],
["textarea", formTags],
["option", new Set(["option"])],
["optgroup", new Set(["optgroup", "option"])],
["dd", ddtTags],
["dt", ddtTags],
["address", pTag],
["article", pTag],
["aside", pTag],
["blockquote", pTag],
["details", pTag],
["div", pTag],
["dl", pTag],
["fieldset", pTag],
["figcaption", pTag],
["figure", pTag],
["footer", pTag],
["form", pTag],
["header", pTag],
["hr", pTag],
["main", pTag],
["nav", pTag],
["ol", pTag],
["pre", pTag],
["section", pTag],
["table", pTag],
["ul", pTag],
["rt", rtpTags],
["rp", rtpTags],
["tbody", tableSectionTags],
["tfoot", tableSectionTags],
]);
var voidElements = new Set([
"area",
"base",
"basefont",
"br",
"col",
"command",
"embed",
"frame",
"hr",
"img",
"input",
"isindex",
"keygen",
"link",
"meta",
"param",
"source",
"track",
"wbr",
]);
var foreignContextElements = new Set(["math", "svg"]);
var htmlIntegrationElements = new Set([
"mi",
"mo",
"mn",
"ms",
"mtext",
"annotation-xml",
"foreignobject",
"desc",
"title",
]);
var reNameEnd = /\s|\//;
var Parser = /** @class */ (function () {
function Parser(cbs, options) {
if (options === void 0) { options = {}; }
var _a, _b, _c, _d, _e;
this.options = options;
/** The start index of the last event. */
this.startIndex = 0;
/** The end index of the last event. */
this.endIndex = 0;
/**
* Store the start index of the current open tag,
* so we can update the start index for attributes.
*/
this.openTagStart = 0;
this.tagname = "";
this.attribname = "";
this.attribvalue = "";
this.attribs = null;
this.stack = [];
this.foreignContext = [];
this.buffers = [];
this.bufferOffset = 0;
/** The index of the last written buffer. Used when resuming after a `pause()`. */
this.writeIndex = 0;
/** Indicates whether the parser has finished running / `.end` has been called. */
this.ended = false;
this.cbs = cbs !== null && cbs !== void 0 ? cbs : {};
this.lowerCaseTagNames = (_a = options.lowerCaseTags) !== null && _a !== void 0 ? _a : !options.xmlMode;
this.lowerCaseAttributeNames =
(_b = options.lowerCaseAttributeNames) !== null && _b !== void 0 ? _b : !options.xmlMode;
this.tokenizer = new ((_c = options.Tokenizer) !== null && _c !== void 0 ? _c : Tokenizer_js_1.default)(this.options, this);
(_e = (_d = this.cbs).onparserinit) === null || _e === void 0 ? void 0 : _e.call(_d, this);
}
// Tokenizer event handlers
/** @internal */
Parser.prototype.ontext = function (start, endIndex) {
var _a, _b;
var data = this.getSlice(start, endIndex);
this.endIndex = endIndex - 1;
(_b = (_a = this.cbs).ontext) === null || _b === void 0 ? void 0 : _b.call(_a, data);
this.startIndex = endIndex;
};
/** @internal */
Parser.prototype.ontextentity = function (cp) {
var _a, _b;
/*
* Entities can be emitted on the character, or directly after.
* We use the section start here to get accurate indices.
*/
var idx = this.tokenizer.getSectionStart();
this.endIndex = idx - 1;
(_b = (_a = this.cbs).ontext) === null || _b === void 0 ? void 0 : _b.call(_a, (0, decode_js_1.fromCodePoint)(cp));
this.startIndex = idx;
};
Parser.prototype.isVoidElement = function (name) {
return !this.options.xmlMode && voidElements.has(name);
};
/** @internal */
Parser.prototype.onopentagname = function (start, endIndex) {
this.endIndex = endIndex;
var name = this.getSlice(start, endIndex);
if (this.lowerCaseTagNames) {
name = name.toLowerCase();
}
this.emitOpenTag(name);
};
Parser.prototype.emitOpenTag = function (name) {
var _a, _b, _c, _d;
this.openTagStart = this.startIndex;
this.tagname = name;
var impliesClose = !this.options.xmlMode && openImpliesClose.get(name);
if (impliesClose) {
while (this.stack.length > 0 &&
impliesClose.has(this.stack[this.stack.length - 1])) {
var el = this.stack.pop();
(_b = (_a = this.cbs).onclosetag) === null || _b === void 0 ? void 0 : _b.call(_a, el, true);
}
}
if (!this.isVoidElement(name)) {
this.stack.push(name);
if (foreignContextElements.has(name)) {
this.foreignContext.push(true);
}
else if (htmlIntegrationElements.has(name)) {
this.foreignContext.push(false);
}
}
(_d = (_c = this.cbs).onopentagname) === null || _d === void 0 ? void 0 : _d.call(_c, name);
if (this.cbs.onopentag)
this.attribs = {};
};
Parser.prototype.endOpenTag = function (isImplied) {
var _a, _b;
this.startIndex = this.openTagStart;
if (this.attribs) {
(_b = (_a = this.cbs).onopentag) === null || _b === void 0 ? void 0 : _b.call(_a, this.tagname, this.attribs, isImplied);
this.attribs = null;
}
if (this.cbs.onclosetag && this.isVoidElement(this.tagname)) {
this.cbs.onclosetag(this.tagname, true);
}
this.tagname = "";
};
/** @internal */
Parser.prototype.onopentagend = function (endIndex) {
this.endIndex = endIndex;
this.endOpenTag(false);
// Set `startIndex` for next node
this.startIndex = endIndex + 1;
};
/** @internal */
Parser.prototype.onclosetag = function (start, endIndex) {
var _a, _b, _c, _d, _e, _f;
this.endIndex = endIndex;
var name = this.getSlice(start, endIndex);
if (this.lowerCaseTagNames) {
name = name.toLowerCase();
}
if (foreignContextElements.has(name) ||
htmlIntegrationElements.has(name)) {
this.foreignContext.pop();
}
if (!this.isVoidElement(name)) {
var pos = this.stack.lastIndexOf(name);
if (pos !== -1) {
if (this.cbs.onclosetag) {
var count = this.stack.length - pos;
while (count--) {
// We know the stack has sufficient elements.
this.cbs.onclosetag(this.stack.pop(), count !== 0);
}
}
else
this.stack.length = pos;
}
else if (!this.options.xmlMode && name === "p") {
// Implicit open before close
this.emitOpenTag("p");
this.closeCurrentTag(true);
}
}
else if (!this.options.xmlMode && name === "br") {
// We can't use `emitOpenTag` for implicit open, as `br` would be implicitly closed.
(_b = (_a = this.cbs).onopentagname) === null || _b === void 0 ? void 0 : _b.call(_a, "br");
(_d = (_c = this.cbs).onopentag) === null || _d === void 0 ? void 0 : _d.call(_c, "br", {}, true);
(_f = (_e = this.cbs).onclosetag) === null || _f === void 0 ? void 0 : _f.call(_e, "br", false);
}
// Set `startIndex` for next node
this.startIndex = endIndex + 1;
};
/** @internal */
Parser.prototype.onselfclosingtag = function (endIndex) {
this.endIndex = endIndex;
if (this.options.xmlMode ||
this.options.recognizeSelfClosing ||
this.foreignContext[this.foreignContext.length - 1]) {
this.closeCurrentTag(false);
// Set `startIndex` for next node
this.startIndex = endIndex + 1;
}
else {
// Ignore the fact that the tag is self-closing.
this.onopentagend(endIndex);
}
};
Parser.prototype.closeCurrentTag = function (isOpenImplied) {
var _a, _b;
var name = this.tagname;
this.endOpenTag(isOpenImplied);
// Self-closing tags will be on the top of the stack
if (this.stack[this.stack.length - 1] === name) {
// If the opening tag isn't implied, the closing tag has to be implied.
(_b = (_a = this.cbs).onclosetag) === null || _b === void 0 ? void 0 : _b.call(_a, name, !isOpenImplied);
this.stack.pop();
}
};
/** @internal */
Parser.prototype.onattribname = function (start, endIndex) {
this.startIndex = start;
var name = this.getSlice(start, endIndex);
this.attribname = this.lowerCaseAttributeNames
? name.toLowerCase()
: name;
};
/** @internal */
Parser.prototype.onattribdata = function (start, endIndex) {
this.attribvalue += this.getSlice(start, endIndex);
};
/** @internal */
Parser.prototype.onattribentity = function (cp) {
this.attribvalue += (0, decode_js_1.fromCodePoint)(cp);
};
/** @internal */
Parser.prototype.onattribend = function (quote, endIndex) {
var _a, _b;
this.endIndex = endIndex;
(_b = (_a = this.cbs).onattribute) === null || _b === void 0 ? void 0 : _b.call(_a, this.attribname, this.attribvalue, quote === Tokenizer_js_1.QuoteType.Double
? '"'
: quote === Tokenizer_js_1.QuoteType.Single
? "'"
: quote === Tokenizer_js_1.QuoteType.NoValue
? undefined
: null);
if (this.attribs &&
!Object.prototype.hasOwnProperty.call(this.attribs, this.attribname)) {
this.attribs[this.attribname] = this.attribvalue;
}
this.attribvalue = "";
};
Parser.prototype.getInstructionName = function (value) {
var idx = value.search(reNameEnd);
var name = idx < 0 ? value : value.substr(0, idx);
if (this.lowerCaseTagNames) {
name = name.toLowerCase();
}
return name;
};
/** @internal */
Parser.prototype.ondeclaration = function (start, endIndex) {
this.endIndex = endIndex;
var value = this.getSlice(start, endIndex);
if (this.cbs.onprocessinginstruction) {
var name = this.getInstructionName(value);
this.cbs.onprocessinginstruction("!".concat(name), "!".concat(value));
}
// Set `startIndex` for next node
this.startIndex = endIndex + 1;
};
/** @internal */
Parser.prototype.onprocessinginstruction = function (start, endIndex) {
this.endIndex = endIndex;
var value = this.getSlice(start, endIndex);
if (this.cbs.onprocessinginstruction) {
var name = this.getInstructionName(value);
this.cbs.onprocessinginstruction("?".concat(name), "?".concat(value));
}
// Set `startIndex` for next node
this.startIndex = endIndex + 1;
};
/** @internal */
Parser.prototype.oncomment = function (start, endIndex, offset) {
var _a, _b, _c, _d;
this.endIndex = endIndex;
(_b = (_a = this.cbs).oncomment) === null || _b === void 0 ? void 0 : _b.call(_a, this.getSlice(start, endIndex - offset));
(_d = (_c = this.cbs).oncommentend) === null || _d === void 0 ? void 0 : _d.call(_c);
// Set `startIndex` for next node
this.startIndex = endIndex + 1;
};
/** @internal */
Parser.prototype.oncdata = function (start, endIndex, offset) {
var _a, _b, _c, _d, _e, _f, _g, _h, _j, _k;
this.endIndex = endIndex;
var value = this.getSlice(start, endIndex - offset);
if (this.options.xmlMode || this.options.recognizeCDATA) {
(_b = (_a = this.cbs).oncdatastart) === null || _b === void 0 ? void 0 : _b.call(_a);
(_d = (_c = this.cbs).ontext) === null || _d === void 0 ? void 0 : _d.call(_c, value);
(_f = (_e = this.cbs).oncdataend) === null || _f === void 0 ? void 0 : _f.call(_e);
}
else {
(_h = (_g = this.cbs).oncomment) === null || _h === void 0 ? void 0 : _h.call(_g, "[CDATA[".concat(value, "]]"));
(_k = (_j = this.cbs).oncommentend) === null || _k === void 0 ? void 0 : _k.call(_j);
}
// Set `startIndex` for next node
this.startIndex = endIndex + 1;
};
/** @internal */
Parser.prototype.onend = function () {
var _a, _b;
if (this.cbs.onclosetag) {
// Set the end index for all remaining tags
this.endIndex = this.startIndex;
for (var i = this.stack.length; i > 0; this.cbs.onclosetag(this.stack[--i], true))
;
}
(_b = (_a = this.cbs).onend) === null || _b === void 0 ? void 0 : _b.call(_a);
};
/**
* Resets the parser to a blank state, ready to parse a new HTML document
*/
Parser.prototype.reset = function () {
var _a, _b, _c, _d;
(_b = (_a = this.cbs).onreset) === null || _b === void 0 ? void 0 : _b.call(_a);
this.tokenizer.reset();
this.tagname = "";
this.attribname = "";
this.attribs = null;
this.stack.length = 0;
this.startIndex = 0;
this.endIndex = 0;
(_d = (_c = this.cbs).onparserinit) === null || _d === void 0 ? void 0 : _d.call(_c, this);
this.buffers.length = 0;
this.bufferOffset = 0;
this.writeIndex = 0;
this.ended = false;
};
/**
* Resets the parser, then parses a complete document and
* pushes it to the handler.
*
* @param data Document to parse.
*/
Parser.prototype.parseComplete = function (data) {
this.reset();
this.end(data);
};
Parser.prototype.getSlice = function (start, end) {
while (start - this.bufferOffset >= this.buffers[0].length) {
this.shiftBuffer();
}
var str = this.buffers[0].slice(start - this.bufferOffset, end - this.bufferOffset);
while (end - this.bufferOffset > this.buffers[0].length) {
this.shiftBuffer();
str += this.buffers[0].slice(0, end - this.bufferOffset);
}
return str;
};
Parser.prototype.shiftBuffer = function () {
this.bufferOffset += this.buffers[0].length;
this.writeIndex--;
this.buffers.shift();
};
/**
* Parses a chunk of data and calls the corresponding callbacks.
*
* @param chunk Chunk to parse.
*/
Parser.prototype.write = function (chunk) {
var _a, _b;
if (this.ended) {
(_b = (_a = this.cbs).onerror) === null || _b === void 0 ? void 0 : _b.call(_a, new Error(".write() after done!"));
return;
}
this.buffers.push(chunk);
if (this.tokenizer.running) {
this.tokenizer.write(chunk);
this.writeIndex++;
}
};
/**
* Parses the end of the buffer and clears the stack, calls onend.
*
* @param chunk Optional final chunk to parse.
*/
Parser.prototype.end = function (chunk) {
var _a, _b;
if (this.ended) {
(_b = (_a = this.cbs).onerror) === null || _b === void 0 ? void 0 : _b.call(_a, Error(".end() after done!"));
return;
}
if (chunk)
this.write(chunk);
this.ended = true;
this.tokenizer.end();
};
/**
* Pauses parsing. The parser won't emit events until `resume` is called.
*/
Parser.prototype.pause = function () {
this.tokenizer.pause();
};
/**
* Resumes parsing after `pause` was called.
*/
Parser.prototype.resume = function () {
this.tokenizer.resume();
while (this.tokenizer.running &&
this.writeIndex < this.buffers.length) {
this.tokenizer.write(this.buffers[this.writeIndex++]);
}
if (this.ended)
this.tokenizer.end();
};
/**
* Alias of `write`, for backwards compatibility.
*
* @param chunk Chunk to parse.
* @deprecated
*/
Parser.prototype.parseChunk = function (chunk) {
this.write(chunk);
};
/**
* Alias of `end`, for backwards compatibility.
*
* @param chunk Optional final chunk to parse.
* @deprecated
*/
Parser.prototype.done = function (chunk) {
this.end(chunk);
};
return Parser;
}());
module.exports = Parser;

View File

@@ -0,0 +1,903 @@
var decode_js_1 = require("./entities/decode.js");
var CharCodes;
(function (CharCodes) {
CharCodes[CharCodes["Tab"] = 9] = "Tab";
CharCodes[CharCodes["NewLine"] = 10] = "NewLine";
CharCodes[CharCodes["FormFeed"] = 12] = "FormFeed";
CharCodes[CharCodes["CarriageReturn"] = 13] = "CarriageReturn";
CharCodes[CharCodes["Space"] = 32] = "Space";
CharCodes[CharCodes["ExclamationMark"] = 33] = "ExclamationMark";
CharCodes[CharCodes["Num"] = 35] = "Num";
CharCodes[CharCodes["Amp"] = 38] = "Amp";
CharCodes[CharCodes["SingleQuote"] = 39] = "SingleQuote";
CharCodes[CharCodes["DoubleQuote"] = 34] = "DoubleQuote";
CharCodes[CharCodes["Dash"] = 45] = "Dash";
CharCodes[CharCodes["Slash"] = 47] = "Slash";
CharCodes[CharCodes["Zero"] = 48] = "Zero";
CharCodes[CharCodes["Nine"] = 57] = "Nine";
CharCodes[CharCodes["Semi"] = 59] = "Semi";
CharCodes[CharCodes["Lt"] = 60] = "Lt";
CharCodes[CharCodes["Eq"] = 61] = "Eq";
CharCodes[CharCodes["Gt"] = 62] = "Gt";
CharCodes[CharCodes["Questionmark"] = 63] = "Questionmark";
CharCodes[CharCodes["UpperA"] = 65] = "UpperA";
CharCodes[CharCodes["LowerA"] = 97] = "LowerA";
CharCodes[CharCodes["UpperF"] = 70] = "UpperF";
CharCodes[CharCodes["LowerF"] = 102] = "LowerF";
CharCodes[CharCodes["UpperZ"] = 90] = "UpperZ";
CharCodes[CharCodes["LowerZ"] = 122] = "LowerZ";
CharCodes[CharCodes["LowerX"] = 120] = "LowerX";
CharCodes[CharCodes["OpeningSquareBracket"] = 91] = "OpeningSquareBracket";
})(CharCodes || (CharCodes = {}));
/** All the states the tokenizer can be in. */
var State;
(function (State) {
State[State["Text"] = 1] = "Text";
State[State["BeforeTagName"] = 2] = "BeforeTagName";
State[State["InTagName"] = 3] = "InTagName";
State[State["InSelfClosingTag"] = 4] = "InSelfClosingTag";
State[State["BeforeClosingTagName"] = 5] = "BeforeClosingTagName";
State[State["InClosingTagName"] = 6] = "InClosingTagName";
State[State["AfterClosingTagName"] = 7] = "AfterClosingTagName";
// Attributes
State[State["BeforeAttributeName"] = 8] = "BeforeAttributeName";
State[State["InAttributeName"] = 9] = "InAttributeName";
State[State["AfterAttributeName"] = 10] = "AfterAttributeName";
State[State["BeforeAttributeValue"] = 11] = "BeforeAttributeValue";
State[State["InAttributeValueDq"] = 12] = "InAttributeValueDq";
State[State["InAttributeValueSq"] = 13] = "InAttributeValueSq";
State[State["InAttributeValueNq"] = 14] = "InAttributeValueNq";
// Declarations
State[State["BeforeDeclaration"] = 15] = "BeforeDeclaration";
State[State["InDeclaration"] = 16] = "InDeclaration";
// Processing instructions
State[State["InProcessingInstruction"] = 17] = "InProcessingInstruction";
// Comments & CDATA
State[State["BeforeComment"] = 18] = "BeforeComment";
State[State["CDATASequence"] = 19] = "CDATASequence";
State[State["InSpecialComment"] = 20] = "InSpecialComment";
State[State["InCommentLike"] = 21] = "InCommentLike";
// Special tags
State[State["BeforeSpecialS"] = 22] = "BeforeSpecialS";
State[State["SpecialStartSequence"] = 23] = "SpecialStartSequence";
State[State["InSpecialTag"] = 24] = "InSpecialTag";
State[State["BeforeEntity"] = 25] = "BeforeEntity";
State[State["BeforeNumericEntity"] = 26] = "BeforeNumericEntity";
State[State["InNamedEntity"] = 27] = "InNamedEntity";
State[State["InNumericEntity"] = 28] = "InNumericEntity";
State[State["InHexEntity"] = 29] = "InHexEntity";
})(State || (State = {}));
function isWhitespace(c) {
return (c === CharCodes.Space ||
c === CharCodes.NewLine ||
c === CharCodes.Tab ||
c === CharCodes.FormFeed ||
c === CharCodes.CarriageReturn);
}
function isEndOfTagSection(c) {
return c === CharCodes.Slash || c === CharCodes.Gt || isWhitespace(c);
}
function isNumber(c) {
return c >= CharCodes.Zero && c <= CharCodes.Nine;
}
function isASCIIAlpha(c) {
return ((c >= CharCodes.LowerA && c <= CharCodes.LowerZ) ||
(c >= CharCodes.UpperA && c <= CharCodes.UpperZ));
}
function isHexDigit(c) {
return ((c >= CharCodes.UpperA && c <= CharCodes.UpperF) ||
(c >= CharCodes.LowerA && c <= CharCodes.LowerF));
}
var QuoteType;
(function (QuoteType) {
QuoteType[QuoteType["NoValue"] = 0] = "NoValue";
QuoteType[QuoteType["Unquoted"] = 1] = "Unquoted";
QuoteType[QuoteType["Single"] = 2] = "Single";
QuoteType[QuoteType["Double"] = 3] = "Double";
})(QuoteType = exports.QuoteType || (exports.QuoteType = {}));
/**
* Sequences used to match longer strings.
*
* We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End
* sequences with an increased offset.
*/
var Sequences = {
Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]),
CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]),
CommentEnd: new Uint8Array([0x2d, 0x2d, 0x3e]),
ScriptEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x63, 0x72, 0x69, 0x70, 0x74]),
StyleEnd: new Uint8Array([0x3c, 0x2f, 0x73, 0x74, 0x79, 0x6c, 0x65]),
TitleEnd: new Uint8Array([0x3c, 0x2f, 0x74, 0x69, 0x74, 0x6c, 0x65]), // `</title`
};
var Tokenizer = /** @class */ (function () {
function Tokenizer(_a, cbs) {
var _b = _a.xmlMode, xmlMode = _b === void 0 ? false : _b, _c = _a.decodeEntities, decodeEntities = _c === void 0 ? true : _c;
this.cbs = cbs;
/** The current state the tokenizer is in. */
this.state = State.Text;
/** The read buffer. */
this.buffer = "";
/** The beginning of the section that is currently being read. */
this.sectionStart = 0;
/** The index within the buffer that we are currently looking at. */
this.index = 0;
/** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */
this.baseState = State.Text;
/** For special parsing behavior inside of script and style tags. */
this.isSpecial = false;
/** Indicates whether the tokenizer has been paused. */
this.running = true;
/** The offset of the current buffer. */
this.offset = 0;
this.sequenceIndex = 0;
this.trieIndex = 0;
this.trieCurrent = 0;
/** For named entities, the index of the value. For numeric entities, the code point. */
this.entityResult = 0;
this.entityExcess = 0;
this.xmlMode = xmlMode;
this.decodeEntities = decodeEntities;
this.entityTrie = xmlMode ? decode_js_1.xmlDecodeTree : decode_js_1.htmlDecodeTree;
}
Tokenizer.prototype.reset = function () {
this.state = State.Text;
this.buffer = "";
this.sectionStart = 0;
this.index = 0;
this.baseState = State.Text;
this.currentSequence = undefined;
this.running = true;
this.offset = 0;
};
Tokenizer.prototype.write = function (chunk) {
this.offset += this.buffer.length;
this.buffer = chunk;
this.parse();
};
Tokenizer.prototype.end = function () {
if (this.running)
this.finish();
};
Tokenizer.prototype.pause = function () {
this.running = false;
};
Tokenizer.prototype.resume = function () {
this.running = true;
if (this.index < this.buffer.length + this.offset) {
this.parse();
}
};
/**
* The current index within all of the written data.
*/
Tokenizer.prototype.getIndex = function () {
return this.index;
};
/**
* The start of the current section.
*/
Tokenizer.prototype.getSectionStart = function () {
return this.sectionStart;
};
Tokenizer.prototype.stateText = function (c) {
if (c === CharCodes.Lt ||
(!this.decodeEntities && this.fastForwardTo(CharCodes.Lt))) {
if (this.index > this.sectionStart) {
this.cbs.ontext(this.sectionStart, this.index);
}
this.state = State.BeforeTagName;
this.sectionStart = this.index;
}
else if (this.decodeEntities && c === CharCodes.Amp) {
this.state = State.BeforeEntity;
}
};
Tokenizer.prototype.stateSpecialStartSequence = function (c) {
var isEnd = this.sequenceIndex === this.currentSequence.length;
var isMatch = isEnd
? // If we are at the end of the sequence, make sure the tag name has ended
isEndOfTagSection(c)
: // Otherwise, do a case-insensitive comparison
(c | 0x20) === this.currentSequence[this.sequenceIndex];
if (!isMatch) {
this.isSpecial = false;
}
else if (!isEnd) {
this.sequenceIndex++;
return;
}
this.sequenceIndex = 0;
this.state = State.InTagName;
this.stateInTagName(c);
};
/** Look for an end tag. For <title> tags, also decode entities. */
Tokenizer.prototype.stateInSpecialTag = function (c) {
if (this.sequenceIndex === this.currentSequence.length) {
if (c === CharCodes.Gt || isWhitespace(c)) {
var endOfText = this.index - this.currentSequence.length;
if (this.sectionStart < endOfText) {
// Spoof the index so that reported locations match up.
var actualIndex = this.index;
this.index = endOfText;
this.cbs.ontext(this.sectionStart, endOfText);
this.index = actualIndex;
}
this.isSpecial = false;
this.sectionStart = endOfText + 2; // Skip over the `</`
this.stateInClosingTagName(c);
return; // We are done; skip the rest of the function.
}
this.sequenceIndex = 0;
}
if ((c | 0x20) === this.currentSequence[this.sequenceIndex]) {
this.sequenceIndex += 1;
}
else if (this.sequenceIndex === 0) {
if (this.currentSequence === Sequences.TitleEnd) {
// We have to parse entities in <title> tags.
if (this.decodeEntities && c === CharCodes.Amp) {
this.state = State.BeforeEntity;
}
}
else if (this.fastForwardTo(CharCodes.Lt)) {
// Outside of <title> tags, we can fast-forward.
this.sequenceIndex = 1;
}
}
else {
// If we see a `<`, set the sequence index to 1; useful for eg. `<</script>`.
this.sequenceIndex = Number(c === CharCodes.Lt);
}
};
Tokenizer.prototype.stateCDATASequence = function (c) {
if (c === Sequences.Cdata[this.sequenceIndex]) {
if (++this.sequenceIndex === Sequences.Cdata.length) {
this.state = State.InCommentLike;
this.currentSequence = Sequences.CdataEnd;
this.sequenceIndex = 0;
this.sectionStart = this.index + 1;
}
}
else {
this.sequenceIndex = 0;
this.state = State.InDeclaration;
this.stateInDeclaration(c); // Reconsume the character
}
};
/**
* When we wait for one specific character, we can speed things up
* by skipping through the buffer until we find it.
*
* @returns Whether the character was found.
*/
Tokenizer.prototype.fastForwardTo = function (c) {
while (++this.index < this.buffer.length + this.offset) {
if (this.buffer.charCodeAt(this.index - this.offset) === c) {
return true;
}
}
/*
* We increment the index at the end of the `parse` loop,
* so set it to `buffer.length - 1` here.
*
* TODO: Refactor `parse` to increment index before calling states.
*/
this.index = this.buffer.length + this.offset - 1;
return false;
};
/**
* Comments and CDATA end with `-->` and `]]>`.
*
* Their common qualities are:
* - Their end sequences have a distinct character they start with.
* - That character is then repeated, so we have to check multiple repeats.
* - All characters but the start character of the sequence can be skipped.
*/
Tokenizer.prototype.stateInCommentLike = function (c) {
if (c === this.currentSequence[this.sequenceIndex]) {
if (++this.sequenceIndex === this.currentSequence.length) {
if (this.currentSequence === Sequences.CdataEnd) {
this.cbs.oncdata(this.sectionStart, this.index, 2);
}
else {
this.cbs.oncomment(this.sectionStart, this.index, 2);
}
this.sequenceIndex = 0;
this.sectionStart = this.index + 1;
this.state = State.Text;
}
}
else if (this.sequenceIndex === 0) {
// Fast-forward to the first character of the sequence
if (this.fastForwardTo(this.currentSequence[0])) {
this.sequenceIndex = 1;
}
}
else if (c !== this.currentSequence[this.sequenceIndex - 1]) {
// Allow long sequences, eg. --->, ]]]>
this.sequenceIndex = 0;
}
};
/**
* HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name.
*
* XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar).
* We allow anything that wouldn't end the tag.
*/
Tokenizer.prototype.isTagStartChar = function (c) {
return this.xmlMode ? !isEndOfTagSection(c) : isASCIIAlpha(c);
};
Tokenizer.prototype.startSpecial = function (sequence, offset) {
this.isSpecial = true;
this.currentSequence = sequence;
this.sequenceIndex = offset;
this.state = State.SpecialStartSequence;
};
Tokenizer.prototype.stateBeforeTagName = function (c) {
if (c === CharCodes.ExclamationMark) {
this.state = State.BeforeDeclaration;
this.sectionStart = this.index + 1;
}
else if (c === CharCodes.Questionmark) {
this.state = State.InProcessingInstruction;
this.sectionStart = this.index + 1;
}
else if (this.isTagStartChar(c)) {
var lower = c | 0x20;
this.sectionStart = this.index;
if (!this.xmlMode && lower === Sequences.TitleEnd[2]) {
this.startSpecial(Sequences.TitleEnd, 3);
}
else {
this.state =
!this.xmlMode && lower === Sequences.ScriptEnd[2]
? State.BeforeSpecialS
: State.InTagName;
}
}
else if (c === CharCodes.Slash) {
this.state = State.BeforeClosingTagName;
}
else {
this.state = State.Text;
this.stateText(c);
}
};
Tokenizer.prototype.stateInTagName = function (c) {
if (isEndOfTagSection(c)) {
this.cbs.onopentagname(this.sectionStart, this.index);
this.sectionStart = -1;
this.state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
}
};
Tokenizer.prototype.stateBeforeClosingTagName = function (c) {
if (isWhitespace(c)) {
// Ignore
}
else if (c === CharCodes.Gt) {
this.state = State.Text;
}
else {
this.state = this.isTagStartChar(c)
? State.InClosingTagName
: State.InSpecialComment;
this.sectionStart = this.index;
}
};
Tokenizer.prototype.stateInClosingTagName = function (c) {
if (c === CharCodes.Gt || isWhitespace(c)) {
this.cbs.onclosetag(this.sectionStart, this.index);
this.sectionStart = -1;
this.state = State.AfterClosingTagName;
this.stateAfterClosingTagName(c);
}
};
Tokenizer.prototype.stateAfterClosingTagName = function (c) {
// Skip everything until ">"
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.state = State.Text;
this.sectionStart = this.index + 1;
}
};
Tokenizer.prototype.stateBeforeAttributeName = function (c) {
if (c === CharCodes.Gt) {
this.cbs.onopentagend(this.index);
if (this.isSpecial) {
this.state = State.InSpecialTag;
this.sequenceIndex = 0;
}
else {
this.state = State.Text;
}
this.baseState = this.state;
this.sectionStart = this.index + 1;
}
else if (c === CharCodes.Slash) {
this.state = State.InSelfClosingTag;
}
else if (!isWhitespace(c)) {
this.state = State.InAttributeName;
this.sectionStart = this.index;
}
};
Tokenizer.prototype.stateInSelfClosingTag = function (c) {
if (c === CharCodes.Gt) {
this.cbs.onselfclosingtag(this.index);
this.state = State.Text;
this.baseState = State.Text;
this.sectionStart = this.index + 1;
this.isSpecial = false; // Reset special state, in case of self-closing special tags
}
else if (!isWhitespace(c)) {
this.state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
}
};
Tokenizer.prototype.stateInAttributeName = function (c) {
if (c === CharCodes.Eq || isEndOfTagSection(c)) {
this.cbs.onattribname(this.sectionStart, this.index);
this.sectionStart = -1;
this.state = State.AfterAttributeName;
this.stateAfterAttributeName(c);
}
};
Tokenizer.prototype.stateAfterAttributeName = function (c) {
if (c === CharCodes.Eq) {
this.state = State.BeforeAttributeValue;
}
else if (c === CharCodes.Slash || c === CharCodes.Gt) {
this.cbs.onattribend(QuoteType.NoValue, this.index);
this.state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
}
else if (!isWhitespace(c)) {
this.cbs.onattribend(QuoteType.NoValue, this.index);
this.state = State.InAttributeName;
this.sectionStart = this.index;
}
};
Tokenizer.prototype.stateBeforeAttributeValue = function (c) {
if (c === CharCodes.DoubleQuote) {
this.state = State.InAttributeValueDq;
this.sectionStart = this.index + 1;
}
else if (c === CharCodes.SingleQuote) {
this.state = State.InAttributeValueSq;
this.sectionStart = this.index + 1;
}
else if (!isWhitespace(c)) {
this.sectionStart = this.index;
this.state = State.InAttributeValueNq;
this.stateInAttributeValueNoQuotes(c); // Reconsume token
}
};
Tokenizer.prototype.handleInAttributeValue = function (c, quote) {
if (c === quote ||
(!this.decodeEntities && this.fastForwardTo(quote))) {
this.cbs.onattribdata(this.sectionStart, this.index);
this.sectionStart = -1;
this.cbs.onattribend(quote === CharCodes.DoubleQuote
? QuoteType.Double
: QuoteType.Single, this.index);
this.state = State.BeforeAttributeName;
}
else if (this.decodeEntities && c === CharCodes.Amp) {
this.baseState = this.state;
this.state = State.BeforeEntity;
}
};
Tokenizer.prototype.stateInAttributeValueDoubleQuotes = function (c) {
this.handleInAttributeValue(c, CharCodes.DoubleQuote);
};
Tokenizer.prototype.stateInAttributeValueSingleQuotes = function (c) {
this.handleInAttributeValue(c, CharCodes.SingleQuote);
};
Tokenizer.prototype.stateInAttributeValueNoQuotes = function (c) {
if (isWhitespace(c) || c === CharCodes.Gt) {
this.cbs.onattribdata(this.sectionStart, this.index);
this.sectionStart = -1;
this.cbs.onattribend(QuoteType.Unquoted, this.index);
this.state = State.BeforeAttributeName;
this.stateBeforeAttributeName(c);
}
else if (this.decodeEntities && c === CharCodes.Amp) {
this.baseState = this.state;
this.state = State.BeforeEntity;
}
};
Tokenizer.prototype.stateBeforeDeclaration = function (c) {
if (c === CharCodes.OpeningSquareBracket) {
this.state = State.CDATASequence;
this.sequenceIndex = 0;
}
else {
this.state =
c === CharCodes.Dash
? State.BeforeComment
: State.InDeclaration;
}
};
Tokenizer.prototype.stateInDeclaration = function (c) {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.cbs.ondeclaration(this.sectionStart, this.index);
this.state = State.Text;
this.sectionStart = this.index + 1;
}
};
Tokenizer.prototype.stateInProcessingInstruction = function (c) {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.cbs.onprocessinginstruction(this.sectionStart, this.index);
this.state = State.Text;
this.sectionStart = this.index + 1;
}
};
Tokenizer.prototype.stateBeforeComment = function (c) {
if (c === CharCodes.Dash) {
this.state = State.InCommentLike;
this.currentSequence = Sequences.CommentEnd;
// Allow short comments (eg. <!-->)
this.sequenceIndex = 2;
this.sectionStart = this.index + 1;
}
else {
this.state = State.InDeclaration;
}
};
Tokenizer.prototype.stateInSpecialComment = function (c) {
if (c === CharCodes.Gt || this.fastForwardTo(CharCodes.Gt)) {
this.cbs.oncomment(this.sectionStart, this.index, 0);
this.state = State.Text;
this.sectionStart = this.index + 1;
}
};
Tokenizer.prototype.stateBeforeSpecialS = function (c) {
var lower = c | 0x20;
if (lower === Sequences.ScriptEnd[3]) {
this.startSpecial(Sequences.ScriptEnd, 4);
}
else if (lower === Sequences.StyleEnd[3]) {
this.startSpecial(Sequences.StyleEnd, 4);
}
else {
this.state = State.InTagName;
this.stateInTagName(c); // Consume the token again
}
};
Tokenizer.prototype.stateBeforeEntity = function (c) {
// Start excess with 1 to include the '&'
this.entityExcess = 1;
this.entityResult = 0;
if (c === CharCodes.Num) {
this.state = State.BeforeNumericEntity;
}
else if (c === CharCodes.Amp) {
// We have two `&` characters in a row. Stay in the current state.
}
else {
this.trieIndex = 0;
this.trieCurrent = this.entityTrie[0];
this.state = State.InNamedEntity;
this.stateInNamedEntity(c);
}
};
Tokenizer.prototype.stateInNamedEntity = function (c) {
this.entityExcess += 1;
this.trieIndex = (0, decode_js_1.determineBranch)(this.entityTrie, this.trieCurrent, this.trieIndex + 1, c);
if (this.trieIndex < 0) {
this.emitNamedEntity();
this.index--;
return;
}
this.trieCurrent = this.entityTrie[this.trieIndex];
var masked = this.trieCurrent & decode_js_1.BinTrieFlags.VALUE_LENGTH;
// If the branch is a value, store it and continue
if (masked) {
// The mask is the number of bytes of the value, including the current byte.
var valueLength = (masked >> 14) - 1;
// If we have a legacy entity while parsing strictly, just skip the number of bytes
if (!this.allowLegacyEntity() && c !== CharCodes.Semi) {
this.trieIndex += valueLength;
}
else {
// Add 1 as we have already incremented the excess
var entityStart = this.index - this.entityExcess + 1;
if (entityStart > this.sectionStart) {
this.emitPartial(this.sectionStart, entityStart);
}
// If this is a surrogate pair, consume the next two bytes
this.entityResult = this.trieIndex;
this.trieIndex += valueLength;
this.entityExcess = 0;
this.sectionStart = this.index + 1;
if (valueLength === 0) {
this.emitNamedEntity();
}
}
}
};
Tokenizer.prototype.emitNamedEntity = function () {
this.state = this.baseState;
if (this.entityResult === 0) {
return;
}
var valueLength = (this.entityTrie[this.entityResult] & decode_js_1.BinTrieFlags.VALUE_LENGTH) >>
14;
switch (valueLength) {
case 1:
this.emitCodePoint(this.entityTrie[this.entityResult] &
~decode_js_1.BinTrieFlags.VALUE_LENGTH);
break;
case 2:
this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
break;
case 3: {
this.emitCodePoint(this.entityTrie[this.entityResult + 1]);
this.emitCodePoint(this.entityTrie[this.entityResult + 2]);
}
}
};
Tokenizer.prototype.stateBeforeNumericEntity = function (c) {
if ((c | 0x20) === CharCodes.LowerX) {
this.entityExcess++;
this.state = State.InHexEntity;
}
else {
this.state = State.InNumericEntity;
this.stateInNumericEntity(c);
}
};
Tokenizer.prototype.emitNumericEntity = function (strict) {
var entityStart = this.index - this.entityExcess - 1;
var numberStart = entityStart + 2 + Number(this.state === State.InHexEntity);
if (numberStart !== this.index) {
// Emit leading data if any
if (entityStart > this.sectionStart) {
this.emitPartial(this.sectionStart, entityStart);
}
this.sectionStart = this.index + Number(strict);
this.emitCodePoint((0, decode_js_1.replaceCodePoint)(this.entityResult));
}
this.state = this.baseState;
};
Tokenizer.prototype.stateInNumericEntity = function (c) {
if (c === CharCodes.Semi) {
this.emitNumericEntity(true);
}
else if (isNumber(c)) {
this.entityResult = this.entityResult * 10 + (c - CharCodes.Zero);
this.entityExcess++;
}
else {
if (this.allowLegacyEntity()) {
this.emitNumericEntity(false);
}
else {
this.state = this.baseState;
}
this.index--;
}
};
Tokenizer.prototype.stateInHexEntity = function (c) {
if (c === CharCodes.Semi) {
this.emitNumericEntity(true);
}
else if (isNumber(c)) {
this.entityResult = this.entityResult * 16 + (c - CharCodes.Zero);
this.entityExcess++;
}
else if (isHexDigit(c)) {
this.entityResult =
this.entityResult * 16 + ((c | 0x20) - CharCodes.LowerA + 10);
this.entityExcess++;
}
else {
if (this.allowLegacyEntity()) {
this.emitNumericEntity(false);
}
else {
this.state = this.baseState;
}
this.index--;
}
};
Tokenizer.prototype.allowLegacyEntity = function () {
return (!this.xmlMode &&
(this.baseState === State.Text ||
this.baseState === State.InSpecialTag));
};
/**
* Remove data that has already been consumed from the buffer.
*/
Tokenizer.prototype.cleanup = function () {
// If we are inside of text or attributes, emit what we already have.
if (this.running && this.sectionStart !== this.index) {
if (this.state === State.Text ||
(this.state === State.InSpecialTag && this.sequenceIndex === 0)) {
this.cbs.ontext(this.sectionStart, this.index);
this.sectionStart = this.index;
}
else if (this.state === State.InAttributeValueDq ||
this.state === State.InAttributeValueSq ||
this.state === State.InAttributeValueNq) {
this.cbs.onattribdata(this.sectionStart, this.index);
this.sectionStart = this.index;
}
}
};
Tokenizer.prototype.shouldContinue = function () {
return this.index < this.buffer.length + this.offset && this.running;
};
/**
* Iterates through the buffer, calling the function corresponding to the current state.
*
* States that are more likely to be hit are higher up, as a performance improvement.
*/
Tokenizer.prototype.parse = function () {
while (this.shouldContinue()) {
var c = this.buffer.charCodeAt(this.index - this.offset);
if (this.state === State.Text) {
this.stateText(c);
}
else if (this.state === State.SpecialStartSequence) {
this.stateSpecialStartSequence(c);
}
else if (this.state === State.InSpecialTag) {
this.stateInSpecialTag(c);
}
else if (this.state === State.CDATASequence) {
this.stateCDATASequence(c);
}
else if (this.state === State.InAttributeValueDq) {
this.stateInAttributeValueDoubleQuotes(c);
}
else if (this.state === State.InAttributeName) {
this.stateInAttributeName(c);
}
else if (this.state === State.InCommentLike) {
this.stateInCommentLike(c);
}
else if (this.state === State.InSpecialComment) {
this.stateInSpecialComment(c);
}
else if (this.state === State.BeforeAttributeName) {
this.stateBeforeAttributeName(c);
}
else if (this.state === State.InTagName) {
this.stateInTagName(c);
}
else if (this.state === State.InClosingTagName) {
this.stateInClosingTagName(c);
}
else if (this.state === State.BeforeTagName) {
this.stateBeforeTagName(c);
}
else if (this.state === State.AfterAttributeName) {
this.stateAfterAttributeName(c);
}
else if (this.state === State.InAttributeValueSq) {
this.stateInAttributeValueSingleQuotes(c);
}
else if (this.state === State.BeforeAttributeValue) {
this.stateBeforeAttributeValue(c);
}
else if (this.state === State.BeforeClosingTagName) {
this.stateBeforeClosingTagName(c);
}
else if (this.state === State.AfterClosingTagName) {
this.stateAfterClosingTagName(c);
}
else if (this.state === State.BeforeSpecialS) {
this.stateBeforeSpecialS(c);
}
else if (this.state === State.InAttributeValueNq) {
this.stateInAttributeValueNoQuotes(c);
}
else if (this.state === State.InSelfClosingTag) {
this.stateInSelfClosingTag(c);
}
else if (this.state === State.InDeclaration) {
this.stateInDeclaration(c);
}
else if (this.state === State.BeforeDeclaration) {
this.stateBeforeDeclaration(c);
}
else if (this.state === State.BeforeComment) {
this.stateBeforeComment(c);
}
else if (this.state === State.InProcessingInstruction) {
this.stateInProcessingInstruction(c);
}
else if (this.state === State.InNamedEntity) {
this.stateInNamedEntity(c);
}
else if (this.state === State.BeforeEntity) {
this.stateBeforeEntity(c);
}
else if (this.state === State.InHexEntity) {
this.stateInHexEntity(c);
}
else if (this.state === State.InNumericEntity) {
this.stateInNumericEntity(c);
}
else {
// `this._state === State.BeforeNumericEntity`
this.stateBeforeNumericEntity(c);
}
this.index++;
}
this.cleanup();
};
Tokenizer.prototype.finish = function () {
if (this.state === State.InNamedEntity) {
this.emitNamedEntity();
}
// If there is remaining data, emit it in a reasonable way
if (this.sectionStart < this.index) {
this.handleTrailingData();
}
this.cbs.onend();
};
/** Handle any trailing data. */
Tokenizer.prototype.handleTrailingData = function () {
var endIndex = this.buffer.length + this.offset;
if (this.state === State.InCommentLike) {
if (this.currentSequence === Sequences.CdataEnd) {
this.cbs.oncdata(this.sectionStart, endIndex, 0);
}
else {
this.cbs.oncomment(this.sectionStart, endIndex, 0);
}
}
else if (this.state === State.InNumericEntity &&
this.allowLegacyEntity()) {
this.emitNumericEntity(false);
// All trailing data will have been consumed
}
else if (this.state === State.InHexEntity &&
this.allowLegacyEntity()) {
this.emitNumericEntity(false);
// All trailing data will have been consumed
}
else if (this.state === State.InTagName ||
this.state === State.BeforeAttributeName ||
this.state === State.BeforeAttributeValue ||
this.state === State.AfterAttributeName ||
this.state === State.InAttributeName ||
this.state === State.InAttributeValueSq ||
this.state === State.InAttributeValueDq ||
this.state === State.InAttributeValueNq ||
this.state === State.InClosingTagName) {
/*
* If we are currently in an opening or closing tag, us not calling the
* respective callback signals that the tag should be ignored.
*/
}
else {
this.cbs.ontext(this.sectionStart, endIndex);
}
};
Tokenizer.prototype.emitPartial = function (start, endIndex) {
if (this.baseState !== State.Text &&
this.baseState !== State.InSpecialTag) {
this.cbs.onattribdata(start, endIndex);
}
else {
this.cbs.ontext(start, endIndex);
}
};
Tokenizer.prototype.emitCodePoint = function (cp) {
if (this.baseState !== State.Text &&
this.baseState !== State.InSpecialTag) {
this.cbs.onattribentity(cp);
}
else {
this.cbs.ontextentity(cp);
}
};
return Tokenizer;
}());
module.exports = {
default:Tokenizer,
QuoteType
}

View File

@@ -0,0 +1,55 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.Doctype = exports.CDATA = exports.Tag = exports.Style = exports.Script = exports.Comment = exports.Directive = exports.Text = exports.Root = exports.isTag = exports.ElementType = void 0;
/** Types of elements found in htmlparser2's DOM */
var ElementType;
(function (ElementType) {
/** Type for the root element of a document */
ElementType["Root"] = "root";
/** Type for Text */
ElementType["Text"] = "text";
/** Type for <? ... ?> */
ElementType["Directive"] = "directive";
/** Type for <!-- ... --> */
ElementType["Comment"] = "comment";
/** Type for <script> tags */
ElementType["Script"] = "script";
/** Type for <style> tags */
ElementType["Style"] = "style";
/** Type for Any tag */
ElementType["Tag"] = "tag";
/** Type for <![CDATA[ ... ]]> */
ElementType["CDATA"] = "cdata";
/** Type for <!doctype ...> */
ElementType["Doctype"] = "doctype";
})(ElementType = exports.ElementType || (exports.ElementType = {}));
/**
* Tests whether an element is a tag or not.
*
* @param elem Element to test
*/
function isTag(elem) {
return (elem.type === ElementType.Tag ||
elem.type === ElementType.Script ||
elem.type === ElementType.Style);
}
exports.isTag = isTag;
// Exports for backwards compatibility
/** Type for the root element of a document */
exports.Root = ElementType.Root;
/** Type for Text */
exports.Text = ElementType.Text;
/** Type for <? ... ?> */
exports.Directive = ElementType.Directive;
/** Type for <!-- ... --> */
exports.Comment = ElementType.Comment;
/** Type for <script> tags */
exports.Script = ElementType.Script;
/** Type for <style> tags */
exports.Style = ElementType.Style;
/** Type for Any tag */
exports.Tag = ElementType.Tag;
/** Type for <![CDATA[ ... ]]> */
exports.CDATA = ElementType.CDATA;
/** Type for <!doctype ...> */
exports.Doctype = ElementType.Doctype;

View File

@@ -0,0 +1,145 @@
var domelementtype_1 = require("../domelementtype/index.js");
var node_js_1 = require("./node.js");
var defaultOpts = {
withStartIndices: false,
withEndIndices: false,
xmlMode: false,
};
var DomHandler = /** @class */ (function () {
/**
* @param callback Called once parsing has completed.
* @param options Settings for the handler.
* @param elementCB Callback whenever a tag is closed.
*/
function DomHandler(callback, options, elementCB) {
/** The elements of the DOM */
this.dom = [];
/** The root element for the DOM */
this.root = new node_js_1.Document(this.dom);
/** Indicated whether parsing has been completed. */
this.done = false;
/** Stack of open tags. */
this.tagStack = [this.root];
/** A data node that is still being written to. */
this.lastNode = null;
/** Reference to the parser instance. Used for location information. */
this.parser = null;
// Make it possible to skip arguments, for backwards-compatibility
if (typeof options === "function") {
elementCB = options;
options = defaultOpts;
}
if (typeof callback === "object") {
options = callback;
callback = undefined;
}
this.callback = callback !== null && callback !== void 0 ? callback : null;
this.options = options !== null && options !== void 0 ? options : defaultOpts;
this.elementCB = elementCB !== null && elementCB !== void 0 ? elementCB : null;
}
DomHandler.prototype.onparserinit = function (parser) {
this.parser = parser;
};
// Resets the handler back to starting state
DomHandler.prototype.onreset = function () {
this.dom = [];
this.root = new node_js_1.Document(this.dom);
this.done = false;
this.tagStack = [this.root];
this.lastNode = null;
this.parser = null;
};
// Signals the handler that parsing is done
DomHandler.prototype.onend = function () {
if (this.done)
return;
this.done = true;
this.parser = null;
this.handleCallback(null);
};
DomHandler.prototype.onerror = function (error) {
this.handleCallback(error);
};
DomHandler.prototype.onclosetag = function () {
this.lastNode = null;
var elem = this.tagStack.pop();
if (this.options.withEndIndices) {
elem.endIndex = this.parser.endIndex;
}
if (this.elementCB)
this.elementCB(elem);
};
DomHandler.prototype.onopentag = function (name, attribs) {
var type = this.options.xmlMode ? domelementtype_1.ElementType.Tag : undefined;
var element = new node_js_1.Element(name, attribs, undefined, type);
this.addNode(element);
this.tagStack.push(element);
};
DomHandler.prototype.ontext = function (data) {
var lastNode = this.lastNode;
if (lastNode && lastNode.type === domelementtype_1.ElementType.Text) {
lastNode.data += data;
if (this.options.withEndIndices) {
lastNode.endIndex = this.parser.endIndex;
}
}
else {
var node = new node_js_1.Text(data);
this.addNode(node);
this.lastNode = node;
}
};
DomHandler.prototype.oncomment = function (data) {
if (this.lastNode && this.lastNode.type === domelementtype_1.ElementType.Comment) {
this.lastNode.data += data;
return;
}
var node = new node_js_1.Comment(data);
this.addNode(node);
this.lastNode = node;
};
DomHandler.prototype.oncommentend = function () {
this.lastNode = null;
};
DomHandler.prototype.oncdatastart = function () {
var text = new node_js_1.Text("");
var node = new node_js_1.CDATA([text]);
this.addNode(node);
text.parent = node;
this.lastNode = text;
};
DomHandler.prototype.oncdataend = function () {
this.lastNode = null;
};
DomHandler.prototype.onprocessinginstruction = function (name, data) {
var node = new node_js_1.ProcessingInstruction(name, data);
this.addNode(node);
};
DomHandler.prototype.handleCallback = function (error) {
if (typeof this.callback === "function") {
this.callback(error, this.dom);
}
else if (error) {
throw error;
}
};
DomHandler.prototype.addNode = function (node) {
var parent = this.tagStack[this.tagStack.length - 1];
var previousSibling = parent.children[parent.children.length - 1];
if (this.options.withStartIndices) {
node.startIndex = this.parser.startIndex;
}
if (this.options.withEndIndices) {
node.endIndex = this.parser.endIndex;
}
parent.children.push(node);
if (previousSibling) {
node.prev = previousSibling;
previousSibling.next = node;
}
node.parent = parent;
this.lastNode = null;
};
return DomHandler;
}());
module.exports = DomHandler;

View File

@@ -0,0 +1,474 @@
"use strict";
var __extends = (this && this.__extends) || (function () {
var extendStatics = function (d, b) {
extendStatics = Object.setPrototypeOf ||
({ __proto__: [] } instanceof Array && function (d, b) { d.__proto__ = b; }) ||
function (d, b) { for (var p in b) if (Object.prototype.hasOwnProperty.call(b, p)) d[p] = b[p]; };
return extendStatics(d, b);
};
return function (d, b) {
if (typeof b !== "function" && b !== null)
throw new TypeError("Class extends value " + String(b) + " is not a constructor or null");
extendStatics(d, b);
function __() { this.constructor = d; }
d.prototype = b === null ? Object.create(b) : (__.prototype = b.prototype, new __());
};
})();
var __assign = (this && this.__assign) || function () {
__assign = Object.assign || function(t) {
for (var s, i = 1, n = arguments.length; i < n; i++) {
s = arguments[i];
for (var p in s) if (Object.prototype.hasOwnProperty.call(s, p))
t[p] = s[p];
}
return t;
};
return __assign.apply(this, arguments);
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.cloneNode = exports.hasChildren = exports.isDocument = exports.isDirective = exports.isComment = exports.isText = exports.isCDATA = exports.isTag = exports.Element = exports.Document = exports.CDATA = exports.NodeWithChildren = exports.ProcessingInstruction = exports.Comment = exports.Text = exports.DataNode = exports.Node = void 0;
var domelementtype_1 = require("../domelementtype/index.js");
/**
* This object will be used as the prototype for Nodes when creating a
* DOM-Level-1-compliant structure.
*/
var Node = /** @class */ (function () {
function Node() {
/** Parent of the node */
this.parent = null;
/** Previous sibling */
this.prev = null;
/** Next sibling */
this.next = null;
/** The start index of the node. Requires `withStartIndices` on the handler to be `true. */
this.startIndex = null;
/** The end index of the node. Requires `withEndIndices` on the handler to be `true. */
this.endIndex = null;
}
Object.defineProperty(Node.prototype, "parentNode", {
// Read-write aliases for properties
/**
* Same as {@link parent}.
* [DOM spec](https://dom.spec.whatwg.org)-compatible alias.
*/
get: function () {
return this.parent;
},
set: function (parent) {
this.parent = parent;
},
enumerable: false,
configurable: true
});
Object.defineProperty(Node.prototype, "previousSibling", {
/**
* Same as {@link prev}.
* [DOM spec](https://dom.spec.whatwg.org)-compatible alias.
*/
get: function () {
return this.prev;
},
set: function (prev) {
this.prev = prev;
},
enumerable: false,
configurable: true
});
Object.defineProperty(Node.prototype, "nextSibling", {
/**
* Same as {@link next}.
* [DOM spec](https://dom.spec.whatwg.org)-compatible alias.
*/
get: function () {
return this.next;
},
set: function (next) {
this.next = next;
},
enumerable: false,
configurable: true
});
/**
* Clone this node, and optionally its children.
*
* @param recursive Clone child nodes as well.
* @returns A clone of the node.
*/
Node.prototype.cloneNode = function (recursive) {
if (recursive === void 0) { recursive = false; }
return cloneNode(this, recursive);
};
return Node;
}());
exports.Node = Node;
/**
* A node that contains some data.
*/
var DataNode = /** @class */ (function (_super) {
__extends(DataNode, _super);
/**
* @param data The content of the data node
*/
function DataNode(data) {
var _this = _super.call(this) || this;
_this.data = data;
return _this;
}
Object.defineProperty(DataNode.prototype, "nodeValue", {
/**
* Same as {@link data}.
* [DOM spec](https://dom.spec.whatwg.org)-compatible alias.
*/
get: function () {
return this.data;
},
set: function (data) {
this.data = data;
},
enumerable: false,
configurable: true
});
return DataNode;
}(Node));
exports.DataNode = DataNode;
/**
* Text within the document.
*/
var Text = /** @class */ (function (_super) {
__extends(Text, _super);
function Text() {
var _this = _super !== null && _super.apply(this, arguments) || this;
_this.type = domelementtype_1.ElementType.Text;
return _this;
}
Object.defineProperty(Text.prototype, "nodeType", {
get: function () {
return 3;
},
enumerable: false,
configurable: true
});
return Text;
}(DataNode));
exports.Text = Text;
/**
* Comments within the document.
*/
var Comment = /** @class */ (function (_super) {
__extends(Comment, _super);
function Comment() {
var _this = _super !== null && _super.apply(this, arguments) || this;
_this.type = domelementtype_1.ElementType.Comment;
return _this;
}
Object.defineProperty(Comment.prototype, "nodeType", {
get: function () {
return 8;
},
enumerable: false,
configurable: true
});
return Comment;
}(DataNode));
exports.Comment = Comment;
/**
* Processing instructions, including doc types.
*/
var ProcessingInstruction = /** @class */ (function (_super) {
__extends(ProcessingInstruction, _super);
function ProcessingInstruction(name, data) {
var _this = _super.call(this, data) || this;
_this.name = name;
_this.type = domelementtype_1.ElementType.Directive;
return _this;
}
Object.defineProperty(ProcessingInstruction.prototype, "nodeType", {
get: function () {
return 1;
},
enumerable: false,
configurable: true
});
return ProcessingInstruction;
}(DataNode));
exports.ProcessingInstruction = ProcessingInstruction;
/**
* A `Node` that can have children.
*/
var NodeWithChildren = /** @class */ (function (_super) {
__extends(NodeWithChildren, _super);
/**
* @param children Children of the node. Only certain node types can have children.
*/
function NodeWithChildren(children) {
var _this = _super.call(this) || this;
_this.children = children;
return _this;
}
Object.defineProperty(NodeWithChildren.prototype, "firstChild", {
// Aliases
/** First child of the node. */
get: function () {
var _a;
return (_a = this.children[0]) !== null && _a !== void 0 ? _a : null;
},
enumerable: false,
configurable: true
});
Object.defineProperty(NodeWithChildren.prototype, "lastChild", {
/** Last child of the node. */
get: function () {
return this.children.length > 0
? this.children[this.children.length - 1]
: null;
},
enumerable: false,
configurable: true
});
Object.defineProperty(NodeWithChildren.prototype, "childNodes", {
/**
* Same as {@link children}.
* [DOM spec](https://dom.spec.whatwg.org)-compatible alias.
*/
get: function () {
return this.children;
},
set: function (children) {
this.children = children;
},
enumerable: false,
configurable: true
});
return NodeWithChildren;
}(Node));
exports.NodeWithChildren = NodeWithChildren;
var CDATA = /** @class */ (function (_super) {
__extends(CDATA, _super);
function CDATA() {
var _this = _super !== null && _super.apply(this, arguments) || this;
_this.type = domelementtype_1.ElementType.CDATA;
return _this;
}
Object.defineProperty(CDATA.prototype, "nodeType", {
get: function () {
return 4;
},
enumerable: false,
configurable: true
});
return CDATA;
}(NodeWithChildren));
exports.CDATA = CDATA;
/**
* The root node of the document.
*/
var Document = /** @class */ (function (_super) {
__extends(Document, _super);
function Document() {
var _this = _super !== null && _super.apply(this, arguments) || this;
_this.type = domelementtype_1.ElementType.Root;
return _this;
}
Object.defineProperty(Document.prototype, "nodeType", {
get: function () {
return 9;
},
enumerable: false,
configurable: true
});
return Document;
}(NodeWithChildren));
exports.Document = Document;
/**
* An element within the DOM.
*/
var Element = /** @class */ (function (_super) {
__extends(Element, _super);
/**
* @param name Name of the tag, eg. `div`, `span`.
* @param attribs Object mapping attribute names to attribute values.
* @param children Children of the node.
*/
function Element(name, attribs, children, type) {
if (children === void 0) { children = []; }
if (type === void 0) { type = name === "script"
? domelementtype_1.ElementType.Script
: name === "style"
? domelementtype_1.ElementType.Style
: domelementtype_1.ElementType.Tag; }
var _this = _super.call(this, children) || this;
_this.name = name;
_this.attribs = attribs;
_this.type = type;
return _this;
}
Object.defineProperty(Element.prototype, "nodeType", {
get: function () {
return 1;
},
enumerable: false,
configurable: true
});
Object.defineProperty(Element.prototype, "tagName", {
// DOM Level 1 aliases
/**
* Same as {@link name}.
* [DOM spec](https://dom.spec.whatwg.org)-compatible alias.
*/
get: function () {
return this.name;
},
set: function (name) {
this.name = name;
},
enumerable: false,
configurable: true
});
Object.defineProperty(Element.prototype, "attributes", {
get: function () {
var _this = this;
return Object.keys(this.attribs).map(function (name) {
var _a, _b;
return ({
name: name,
value: _this.attribs[name],
namespace: (_a = _this["x-attribsNamespace"]) === null || _a === void 0 ? void 0 : _a[name],
prefix: (_b = _this["x-attribsPrefix"]) === null || _b === void 0 ? void 0 : _b[name],
});
});
},
enumerable: false,
configurable: true
});
return Element;
}(NodeWithChildren));
exports.Element = Element;
/**
* @param node Node to check.
* @returns `true` if the node is a `Element`, `false` otherwise.
*/
function isTag(node) {
return (0, domelementtype_1.isTag)(node);
}
exports.isTag = isTag;
/**
* @param node Node to check.
* @returns `true` if the node has the type `CDATA`, `false` otherwise.
*/
function isCDATA(node) {
return node.type === domelementtype_1.ElementType.CDATA;
}
exports.isCDATA = isCDATA;
/**
* @param node Node to check.
* @returns `true` if the node has the type `Text`, `false` otherwise.
*/
function isText(node) {
return node.type === domelementtype_1.ElementType.Text;
}
exports.isText = isText;
/**
* @param node Node to check.
* @returns `true` if the node has the type `Comment`, `false` otherwise.
*/
function isComment(node) {
return node.type === domelementtype_1.ElementType.Comment;
}
exports.isComment = isComment;
/**
* @param node Node to check.
* @returns `true` if the node has the type `ProcessingInstruction`, `false` otherwise.
*/
function isDirective(node) {
return node.type === domelementtype_1.ElementType.Directive;
}
exports.isDirective = isDirective;
/**
* @param node Node to check.
* @returns `true` if the node has the type `ProcessingInstruction`, `false` otherwise.
*/
function isDocument(node) {
return node.type === domelementtype_1.ElementType.Root;
}
exports.isDocument = isDocument;
/**
* @param node Node to check.
* @returns `true` if the node has children, `false` otherwise.
*/
function hasChildren(node) {
return Object.prototype.hasOwnProperty.call(node, "children");
}
exports.hasChildren = hasChildren;
/**
* Clone a node, and optionally its children.
*
* @param recursive Clone child nodes as well.
* @returns A clone of the node.
*/
function cloneNode(node, recursive) {
if (recursive === void 0) { recursive = false; }
var result;
if (isText(node)) {
result = new Text(node.data);
}
else if (isComment(node)) {
result = new Comment(node.data);
}
else if (isTag(node)) {
var children = recursive ? cloneChildren(node.children) : [];
var clone_1 = new Element(node.name, __assign({}, node.attribs), children);
children.forEach(function (child) { return (child.parent = clone_1); });
if (node.namespace != null) {
clone_1.namespace = node.namespace;
}
if (node["x-attribsNamespace"]) {
clone_1["x-attribsNamespace"] = __assign({}, node["x-attribsNamespace"]);
}
if (node["x-attribsPrefix"]) {
clone_1["x-attribsPrefix"] = __assign({}, node["x-attribsPrefix"]);
}
result = clone_1;
}
else if (isCDATA(node)) {
var children = recursive ? cloneChildren(node.children) : [];
var clone_2 = new CDATA(children);
children.forEach(function (child) { return (child.parent = clone_2); });
result = clone_2;
}
else if (isDocument(node)) {
var children = recursive ? cloneChildren(node.children) : [];
var clone_3 = new Document(children);
children.forEach(function (child) { return (child.parent = clone_3); });
if (node["x-mode"]) {
clone_3["x-mode"] = node["x-mode"];
}
result = clone_3;
}
else if (isDirective(node)) {
var instruction = new ProcessingInstruction(node.name, node.data);
if (node["x-name"] != null) {
instruction["x-name"] = node["x-name"];
instruction["x-publicId"] = node["x-publicId"];
instruction["x-systemId"] = node["x-systemId"];
}
result = instruction;
}
else {
throw new Error("Not implemented yet: ".concat(node.type));
}
result.startIndex = node.startIndex;
result.endIndex = node.endIndex;
if (node.sourceCodeLocation != null) {
result.sourceCodeLocation = node.sourceCodeLocation;
}
return result;
}
exports.cloneNode = cloneNode;
function cloneChildren(childs) {
var children = childs.map(function (child) { return cloneNode(child, true); });
for (var i = 1; i < children.length; i++) {
children[i].prev = children[i - 1];
children[i - 1].next = children[i];
}
return children;
}

View File

@@ -0,0 +1,179 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.decodeXML = exports.decodeHTMLStrict = exports.decodeHTML = exports.determineBranch = exports.BinTrieFlags = exports.fromCodePoint = exports.replaceCodePoint = exports.decodeCodePoint = exports.xmlDecodeTree = exports.htmlDecodeTree = void 0;
var decode_data_html_js_1 = __importDefault(require("./generated/decode-data-html.js"));
exports.htmlDecodeTree = decode_data_html_js_1.default;
var decode_data_xml_js_1 = __importDefault(require("./generated/decode-data-xml.js"));
exports.xmlDecodeTree = decode_data_xml_js_1.default;
var decode_codepoint_js_1 = __importDefault(require("./decode_codepoint.js"));
exports.decodeCodePoint = decode_codepoint_js_1.default;
var decode_codepoint_js_2 = require("./decode_codepoint.js");
Object.defineProperty(exports, "replaceCodePoint", { enumerable: true, get: function () { return decode_codepoint_js_2.replaceCodePoint; } });
Object.defineProperty(exports, "fromCodePoint", { enumerable: true, get: function () { return decode_codepoint_js_2.fromCodePoint; } });
var CharCodes;
(function (CharCodes) {
CharCodes[CharCodes["NUM"] = 35] = "NUM";
CharCodes[CharCodes["SEMI"] = 59] = "SEMI";
CharCodes[CharCodes["ZERO"] = 48] = "ZERO";
CharCodes[CharCodes["NINE"] = 57] = "NINE";
CharCodes[CharCodes["LOWER_A"] = 97] = "LOWER_A";
CharCodes[CharCodes["LOWER_F"] = 102] = "LOWER_F";
CharCodes[CharCodes["LOWER_X"] = 120] = "LOWER_X";
/** Bit that needs to be set to convert an upper case ASCII character to lower case */
CharCodes[CharCodes["To_LOWER_BIT"] = 32] = "To_LOWER_BIT";
})(CharCodes || (CharCodes = {}));
var BinTrieFlags;
(function (BinTrieFlags) {
BinTrieFlags[BinTrieFlags["VALUE_LENGTH"] = 49152] = "VALUE_LENGTH";
BinTrieFlags[BinTrieFlags["BRANCH_LENGTH"] = 16256] = "BRANCH_LENGTH";
BinTrieFlags[BinTrieFlags["JUMP_TABLE"] = 127] = "JUMP_TABLE";
})(BinTrieFlags = exports.BinTrieFlags || (exports.BinTrieFlags = {}));
function getDecoder(decodeTree) {
return function decodeHTMLBinary(str, strict) {
var ret = "";
var lastIdx = 0;
var strIdx = 0;
while ((strIdx = str.indexOf("&", strIdx)) >= 0) {
ret += str.slice(lastIdx, strIdx);
lastIdx = strIdx;
// Skip the "&"
strIdx += 1;
// If we have a numeric entity, handle this separately.
if (str.charCodeAt(strIdx) === CharCodes.NUM) {
// Skip the leading "&#". For hex entities, also skip the leading "x".
var start = strIdx + 1;
var base = 10;
var cp = str.charCodeAt(start);
if ((cp | CharCodes.To_LOWER_BIT) === CharCodes.LOWER_X) {
base = 16;
strIdx += 1;
start += 1;
}
do
cp = str.charCodeAt(++strIdx);
while ((cp >= CharCodes.ZERO && cp <= CharCodes.NINE) ||
(base === 16 &&
(cp | CharCodes.To_LOWER_BIT) >= CharCodes.LOWER_A &&
(cp | CharCodes.To_LOWER_BIT) <= CharCodes.LOWER_F));
if (start !== strIdx) {
var entity = str.substring(start, strIdx);
var parsed = parseInt(entity, base);
if (str.charCodeAt(strIdx) === CharCodes.SEMI) {
strIdx += 1;
}
else if (strict) {
continue;
}
ret += (0, decode_codepoint_js_1.default)(parsed);
lastIdx = strIdx;
}
continue;
}
var resultIdx = 0;
var excess = 1;
var treeIdx = 0;
var current = decodeTree[treeIdx];
for (; strIdx < str.length; strIdx++, excess++) {
treeIdx = determineBranch(decodeTree, current, treeIdx + 1, str.charCodeAt(strIdx));
if (treeIdx < 0)
break;
current = decodeTree[treeIdx];
var masked = current & BinTrieFlags.VALUE_LENGTH;
// If the branch is a value, store it and continue
if (masked) {
// If we have a legacy entity while parsing strictly, just skip the number of bytes
if (!strict || str.charCodeAt(strIdx) === CharCodes.SEMI) {
resultIdx = treeIdx;
excess = 0;
}
// The mask is the number of bytes of the value, including the current byte.
var valueLength = (masked >> 14) - 1;
if (valueLength === 0)
break;
treeIdx += valueLength;
}
}
if (resultIdx !== 0) {
var valueLength = (decodeTree[resultIdx] & BinTrieFlags.VALUE_LENGTH) >> 14;
ret +=
valueLength === 1
? String.fromCharCode(decodeTree[resultIdx] & ~BinTrieFlags.VALUE_LENGTH)
: valueLength === 2
? String.fromCharCode(decodeTree[resultIdx + 1])
: String.fromCharCode(decodeTree[resultIdx + 1], decodeTree[resultIdx + 2]);
lastIdx = strIdx - excess + 1;
}
}
return ret + str.slice(lastIdx);
};
}
function determineBranch(decodeTree, current, nodeIdx, char) {
var branchCount = (current & BinTrieFlags.BRANCH_LENGTH) >> 7;
var jumpOffset = current & BinTrieFlags.JUMP_TABLE;
// Case 1: Single branch encoded in jump offset
if (branchCount === 0) {
return jumpOffset !== 0 && char === jumpOffset ? nodeIdx : -1;
}
// Case 2: Multiple branches encoded in jump table
if (jumpOffset) {
var value = char - jumpOffset;
return value < 0 || value >= branchCount
? -1
: decodeTree[nodeIdx + value] - 1;
}
// Case 3: Multiple branches encoded in dictionary
// Binary search for the character.
var lo = nodeIdx;
var hi = lo + branchCount - 1;
while (lo <= hi) {
var mid = (lo + hi) >>> 1;
var midVal = decodeTree[mid];
if (midVal < char) {
lo = mid + 1;
}
else if (midVal > char) {
hi = mid - 1;
}
else {
return decodeTree[mid + branchCount];
}
}
return -1;
}
exports.determineBranch = determineBranch;
var htmlDecoder = getDecoder(decode_data_html_js_1.default);
var xmlDecoder = getDecoder(decode_data_xml_js_1.default);
/**
* Decodes an HTML string, allowing for entities not terminated by a semi-colon.
*
* @param str The string to decode.
* @returns The decoded string.
*/
function decodeHTML(str) {
return htmlDecoder(str, false);
}
exports.decodeHTML = decodeHTML;
/**
* Decodes an HTML string, requiring all entities to be terminated by a semi-colon.
*
* @param str The string to decode.
* @returns The decoded string.
*/
function decodeHTMLStrict(str) {
return htmlDecoder(str, true);
}
exports.decodeHTMLStrict = decodeHTMLStrict;
/**
* Decodes an XML string, requiring all entities to be terminated by a semi-colon.
*
* @param str The string to decode.
* @returns The decoded string.
*/
function decodeXML(str) {
return xmlDecoder(str, true);
}
exports.decodeXML = decodeXML;
//# sourceMappingURL=decode.js.map

View File

@@ -0,0 +1,60 @@
"use strict";
// Adapted from https://github.com/mathiasbynens/he/blob/36afe179392226cf1b6ccdb16ebbb7a5a844d93a/src/he.js#L106-L134
var _a;
Object.defineProperty(exports, "__esModule", { value: true });
exports.replaceCodePoint = exports.fromCodePoint = void 0;
var decodeMap = new Map([
[0, 65533],
[128, 8364],
[130, 8218],
[131, 402],
[132, 8222],
[133, 8230],
[134, 8224],
[135, 8225],
[136, 710],
[137, 8240],
[138, 352],
[139, 8249],
[140, 338],
[142, 381],
[145, 8216],
[146, 8217],
[147, 8220],
[148, 8221],
[149, 8226],
[150, 8211],
[151, 8212],
[152, 732],
[153, 8482],
[154, 353],
[155, 8250],
[156, 339],
[158, 382],
[159, 376],
]);
exports.fromCodePoint =
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition, node/no-unsupported-features/es-builtins
(_a = String.fromCodePoint) !== null && _a !== void 0 ? _a : function (codePoint) {
var output = "";
if (codePoint > 0xffff) {
codePoint -= 0x10000;
output += String.fromCharCode(((codePoint >>> 10) & 0x3ff) | 0xd800);
codePoint = 0xdc00 | (codePoint & 0x3ff);
}
output += String.fromCharCode(codePoint);
return output;
};
function replaceCodePoint(codePoint) {
var _a;
if ((codePoint >= 0xd800 && codePoint <= 0xdfff) || codePoint > 0x10ffff) {
return 0xfffd;
}
return (_a = decodeMap.get(codePoint)) !== null && _a !== void 0 ? _a : codePoint;
}
exports.replaceCodePoint = replaceCodePoint;
function decodeCodePoint(codePoint) {
return (0, exports.fromCodePoint)(replaceCodePoint(codePoint));
}
exports.default = decodeCodePoint;
//# sourceMappingURL=decode_codepoint.js.map

View File

@@ -0,0 +1,77 @@
"use strict";
var __importDefault = (this && this.__importDefault) || function (mod) {
return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.encodeNonAsciiHTML = exports.encodeHTML = void 0;
var encode_html_js_1 = __importDefault(require("./generated/encode-html.js"));
var escape_js_1 = require("./escape.js");
var htmlReplacer = /[\t\n!-,./:-@[-`\f{-}$\x80-\uFFFF]/g;
/**
* Encodes all characters in the input using HTML entities. This includes
* characters that are valid ASCII characters in HTML documents, such as `#`.
*
* To get a more compact output, consider using the `encodeNonAsciiHTML`
* function, which will only encode characters that are not valid in HTML
* documents, as well as non-ASCII characters.
*
* If a character has no equivalent entity, a numeric hexadecimal reference
* (eg. `&#xfc;`) will be used.
*/
function encodeHTML(data) {
return encodeHTMLTrieRe(htmlReplacer, data);
}
exports.encodeHTML = encodeHTML;
/**
* Encodes all non-ASCII characters, as well as characters not valid in HTML
* documents using HTML entities. This function will not encode characters that
* are valid in HTML documents, such as `#`.
*
* If a character has no equivalent entity, a numeric hexadecimal reference
* (eg. `&#xfc;`) will be used.
*/
function encodeNonAsciiHTML(data) {
return encodeHTMLTrieRe(escape_js_1.xmlReplacer, data);
}
exports.encodeNonAsciiHTML = encodeNonAsciiHTML;
function encodeHTMLTrieRe(regExp, str) {
var ret = "";
var lastIdx = 0;
var match;
while ((match = regExp.exec(str)) !== null) {
var i = match.index;
ret += str.substring(lastIdx, i);
var char = str.charCodeAt(i);
var next = encode_html_js_1.default.get(char);
if (typeof next === "object") {
// We are in a branch. Try to match the next char.
if (i + 1 < str.length) {
var nextChar = str.charCodeAt(i + 1);
var value = typeof next.n === "number"
? next.n === nextChar
? next.o
: undefined
: next.n.get(nextChar);
if (value !== undefined) {
ret += value;
lastIdx = regExp.lastIndex += 1;
continue;
}
}
next = next.v;
}
// We might have a tree node without a value; skip and use a numeric entitiy.
if (next !== undefined) {
ret += next;
lastIdx = i + 1;
}
else {
var cp = (0, escape_js_1.getCodePoint)(str, i);
ret += "&#x".concat(cp.toString(16), ";");
// Increase by 1 if we have a surrogate pair
lastIdx = regExp.lastIndex += Number(cp !== char);
}
}
return ret + str.substr(lastIdx);
}
//# sourceMappingURL=encode.js.map

View File

@@ -0,0 +1,112 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.escapeText = exports.escapeAttribute = exports.escapeUTF8 = exports.escape = exports.encodeXML = exports.getCodePoint = exports.xmlReplacer = void 0;
exports.xmlReplacer = /["&'<>$\x80-\uFFFF]/g;
var xmlCodeMap = new Map([
[34, "&quot;"],
[38, "&amp;"],
[39, "&apos;"],
[60, "&lt;"],
[62, "&gt;"],
]);
// For compatibility with node < 4, we wrap `codePointAt`
exports.getCodePoint =
// eslint-disable-next-line @typescript-eslint/no-unnecessary-condition
String.prototype.codePointAt != null
? function (str, index) { return str.codePointAt(index); }
: // http://mathiasbynens.be/notes/javascript-encoding#surrogate-formulae
function (c, index) {
return (c.charCodeAt(index) & 0xfc00) === 0xd800
? (c.charCodeAt(index) - 0xd800) * 0x400 +
c.charCodeAt(index + 1) -
0xdc00 +
0x10000
: c.charCodeAt(index);
};
/**
* Encodes all non-ASCII characters, as well as characters not valid in XML
* documents using XML entities.
*
* If a character has no equivalent entity, a
* numeric hexadecimal reference (eg. `&#xfc;`) will be used.
*/
function encodeXML(str) {
var ret = "";
var lastIdx = 0;
var match;
while ((match = exports.xmlReplacer.exec(str)) !== null) {
var i = match.index;
var char = str.charCodeAt(i);
var next = xmlCodeMap.get(char);
if (next !== undefined) {
ret += str.substring(lastIdx, i) + next;
lastIdx = i + 1;
}
else {
ret += "".concat(str.substring(lastIdx, i), "&#x").concat((0, exports.getCodePoint)(str, i).toString(16), ";");
// Increase by 1 if we have a surrogate pair
lastIdx = exports.xmlReplacer.lastIndex += Number((char & 0xfc00) === 0xd800);
}
}
return ret + str.substr(lastIdx);
}
exports.encodeXML = encodeXML;
/**
* Encodes all non-ASCII characters, as well as characters not valid in XML
* documents using numeric hexadecimal reference (eg. `&#xfc;`).
*
* Have a look at `escapeUTF8` if you want a more concise output at the expense
* of reduced transportability.
*
* @param data String to escape.
*/
exports.escape = encodeXML;
function getEscaper(regex, map) {
return function escape(data) {
var match;
var lastIdx = 0;
var result = "";
while ((match = regex.exec(data))) {
if (lastIdx !== match.index) {
result += data.substring(lastIdx, match.index);
}
// We know that this chararcter will be in the map.
result += map.get(match[0].charCodeAt(0));
// Every match will be of length 1
lastIdx = match.index + 1;
}
return result + data.substring(lastIdx);
};
}
/**
* Encodes all characters not valid in XML documents using XML entities.
*
* Note that the output will be character-set dependent.
*
* @param data String to escape.
*/
exports.escapeUTF8 = getEscaper(/[&<>'"]/g, xmlCodeMap);
/**
* Encodes all characters that have to be escaped in HTML attributes,
* following {@link https://html.spec.whatwg.org/multipage/parsing.html#escapingString}.
*
* @param data String to escape.
*/
exports.escapeAttribute = getEscaper(/["&\u00A0]/g, new Map([
[34, "&quot;"],
[38, "&amp;"],
[160, "&nbsp;"],
]));
/**
* Encodes all characters that have to be escaped in HTML text,
* following {@link https://html.spec.whatwg.org/multipage/parsing.html#escapingString}.
*
* @param data String to escape.
*/
exports.escapeText = getEscaper(/[&<>\u00A0]/g, new Map([
[38, "&amp;"],
[60, "&lt;"],
[62, "&gt;"],
[160, "&nbsp;"],
]));
//# sourceMappingURL=escape.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,9 @@
"use strict";
// Generated using scripts/write-decode-map.ts
Object.defineProperty(exports, "__esModule", { value: true });
exports.default = new Uint16Array(
// prettier-ignore
"\u0200aglq\t\x15\x18\x1b\u026d\x0f\0\0\x12p;\u4026os;\u4027t;\u403et;\u403cuot;\u4022"
.split("")
.map(function (c) { return c.charCodeAt(0); }));
//# sourceMappingURL=decode-data-xml.js.map

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,137 @@
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.decodeXMLStrict = exports.decodeHTML5Strict = exports.decodeHTML4Strict = exports.decodeHTML5 = exports.decodeHTML4 = exports.decodeHTMLStrict = exports.decodeHTML = exports.decodeXML = exports.encodeHTML5 = exports.encodeHTML4 = exports.encodeNonAsciiHTML = exports.encodeHTML = exports.escapeText = exports.escapeAttribute = exports.escapeUTF8 = exports.escape = exports.encodeXML = exports.encode = exports.decodeStrict = exports.decode = exports.EncodingMode = exports.DecodingMode = exports.EntityLevel = void 0;
var decode_js_1 = require("./decode.js");
var encode_js_1 = require("./encode.js");
var escape_js_1 = require("./escape.js");
/** The level of entities to support. */
var EntityLevel;
(function (EntityLevel) {
/** Support only XML entities. */
EntityLevel[EntityLevel["XML"] = 0] = "XML";
/** Support HTML entities, which are a superset of XML entities. */
EntityLevel[EntityLevel["HTML"] = 1] = "HTML";
})(EntityLevel = exports.EntityLevel || (exports.EntityLevel = {}));
/** Determines whether some entities are allowed to be written without a trailing `;`. */
var DecodingMode;
(function (DecodingMode) {
/** Support legacy HTML entities. */
DecodingMode[DecodingMode["Legacy"] = 0] = "Legacy";
/** Do not support legacy HTML entities. */
DecodingMode[DecodingMode["Strict"] = 1] = "Strict";
})(DecodingMode = exports.DecodingMode || (exports.DecodingMode = {}));
var EncodingMode;
(function (EncodingMode) {
/**
* The output is UTF-8 encoded. Only characters that need escaping within
* XML will be escaped.
*/
EncodingMode[EncodingMode["UTF8"] = 0] = "UTF8";
/**
* The output consists only of ASCII characters. Characters that need
* escaping within HTML, and characters that aren't ASCII characters will
* be escaped.
*/
EncodingMode[EncodingMode["ASCII"] = 1] = "ASCII";
/**
* Encode all characters that have an equivalent entity, as well as all
* characters that are not ASCII characters.
*/
EncodingMode[EncodingMode["Extensive"] = 2] = "Extensive";
/**
* Encode all characters that have to be escaped in HTML attributes,
* following {@link https://html.spec.whatwg.org/multipage/parsing.html#escapingString}.
*/
EncodingMode[EncodingMode["Attribute"] = 3] = "Attribute";
/**
* Encode all characters that have to be escaped in HTML text,
* following {@link https://html.spec.whatwg.org/multipage/parsing.html#escapingString}.
*/
EncodingMode[EncodingMode["Text"] = 4] = "Text";
})(EncodingMode = exports.EncodingMode || (exports.EncodingMode = {}));
/**
* Decodes a string with entities.
*
* @param data String to decode.
* @param options Decoding options.
*/
function decode(data, options) {
if (options === void 0) { options = EntityLevel.XML; }
var opts = typeof options === "number" ? { level: options } : options;
if (opts.level === EntityLevel.HTML) {
if (opts.mode === DecodingMode.Strict) {
return (0, decode_js_1.decodeHTMLStrict)(data);
}
return (0, decode_js_1.decodeHTML)(data);
}
return (0, decode_js_1.decodeXML)(data);
}
exports.decode = decode;
/**
* Decodes a string with entities. Does not allow missing trailing semicolons for entities.
*
* @param data String to decode.
* @param options Decoding options.
* @deprecated Use `decode` with the `mode` set to `Strict`.
*/
function decodeStrict(data, options) {
if (options === void 0) { options = EntityLevel.XML; }
var opts = typeof options === "number" ? { level: options } : options;
if (opts.level === EntityLevel.HTML) {
if (opts.mode === DecodingMode.Legacy) {
return (0, decode_js_1.decodeHTML)(data);
}
return (0, decode_js_1.decodeHTMLStrict)(data);
}
return (0, decode_js_1.decodeXML)(data);
}
exports.decodeStrict = decodeStrict;
/**
* Encodes a string with entities.
*
* @param data String to encode.
* @param options Encoding options.
*/
function encode(data, options) {
if (options === void 0) { options = EntityLevel.XML; }
var opts = typeof options === "number" ? { level: options } : options;
// Mode `UTF8` just escapes XML entities
if (opts.mode === EncodingMode.UTF8)
return (0, escape_js_1.escapeUTF8)(data);
if (opts.mode === EncodingMode.Attribute)
return (0, escape_js_1.escapeAttribute)(data);
if (opts.mode === EncodingMode.Text)
return (0, escape_js_1.escapeText)(data);
if (opts.level === EntityLevel.HTML) {
if (opts.mode === EncodingMode.ASCII) {
return (0, encode_js_1.encodeNonAsciiHTML)(data);
}
return (0, encode_js_1.encodeHTML)(data);
}
// ASCII and Extensive are equivalent
return (0, escape_js_1.encodeXML)(data);
}
exports.encode = encode;
var escape_js_2 = require("./escape.js");
Object.defineProperty(exports, "encodeXML", { enumerable: true, get: function () { return escape_js_2.encodeXML; } });
Object.defineProperty(exports, "escape", { enumerable: true, get: function () { return escape_js_2.escape; } });
Object.defineProperty(exports, "escapeUTF8", { enumerable: true, get: function () { return escape_js_2.escapeUTF8; } });
Object.defineProperty(exports, "escapeAttribute", { enumerable: true, get: function () { return escape_js_2.escapeAttribute; } });
Object.defineProperty(exports, "escapeText", { enumerable: true, get: function () { return escape_js_2.escapeText; } });
var encode_js_2 = require("./encode.js");
Object.defineProperty(exports, "encodeHTML", { enumerable: true, get: function () { return encode_js_2.encodeHTML; } });
Object.defineProperty(exports, "encodeNonAsciiHTML", { enumerable: true, get: function () { return encode_js_2.encodeNonAsciiHTML; } });
// Legacy aliases (deprecated)
Object.defineProperty(exports, "encodeHTML4", { enumerable: true, get: function () { return encode_js_2.encodeHTML; } });
Object.defineProperty(exports, "encodeHTML5", { enumerable: true, get: function () { return encode_js_2.encodeHTML; } });
var decode_js_2 = require("./decode.js");
Object.defineProperty(exports, "decodeXML", { enumerable: true, get: function () { return decode_js_2.decodeXML; } });
Object.defineProperty(exports, "decodeHTML", { enumerable: true, get: function () { return decode_js_2.decodeHTML; } });
Object.defineProperty(exports, "decodeHTMLStrict", { enumerable: true, get: function () { return decode_js_2.decodeHTMLStrict; } });
// Legacy aliases (deprecated)
Object.defineProperty(exports, "decodeHTML4", { enumerable: true, get: function () { return decode_js_2.decodeHTML; } });
Object.defineProperty(exports, "decodeHTML5", { enumerable: true, get: function () { return decode_js_2.decodeHTML; } });
Object.defineProperty(exports, "decodeHTML4Strict", { enumerable: true, get: function () { return decode_js_2.decodeHTMLStrict; } });
Object.defineProperty(exports, "decodeHTML5Strict", { enumerable: true, get: function () { return decode_js_2.decodeHTMLStrict; } });
Object.defineProperty(exports, "decodeXMLStrict", { enumerable: true, get: function () { return decode_js_2.decodeXML; } });
//# sourceMappingURL=index.js.map

View File

@@ -0,0 +1,8 @@
var DomHandler = require("./domhandler/index.js");
var Parser = require("./Parser.js");
function parseDocument(data, options) {
var handler = new DomHandler(undefined, options);
new Parser(handler, options).end(data);
return handler.root.children;
}
module.exports = parseDocument;