'use strict';

const generate = require('regjsgen').generate;
const parse = require('regjsparser').parse;
const regenerate = require('regenerate');
const unicodeMatchProperty = require('unicode-match-property-ecmascript');
const unicodeMatchPropertyValue = require('unicode-match-property-value-ecmascript');
const iuMappings = require('./data/iu-mappings.js');
const ESCAPE_SETS = require('./data/character-class-escape-sets.js');

// Prepare a Regenerate set containing all code points, used for negative
// character classes (if any).
const UNICODE_SET = regenerate().addRange(0x0, 0x10FFFF);
// Without the `u` flag, the range stops at 0xFFFF.
// https://mths.be/es6#sec-pattern-semantics
const BMP_SET = regenerate().addRange(0x0, 0xFFFF);

// Prepare a Regenerate set containing all code points that are supposed to be
// matched by `/./u`. https://mths.be/es6#sec-atom
const DOT_SET_UNICODE = UNICODE_SET.clone() // all Unicode code points
	.remove(
		// minus `LineTerminator`s (https://mths.be/es6#sec-line-terminators):
		0x000A, // Line Feed <LF>
		0x000D, // Carriage Return <CR>
		0x2028, // Line Separator <LS>
		0x2029  // Paragraph Separator <PS>
	);

/**
 * Looks up the precomputed set for a character class escape.
 *
 * @param {string} character - Key used to look up the escape in `ESCAPE_SETS`.
 * @param {boolean} unicode - Whether the pattern has the `u` flag.
 * @param {boolean} ignoreCase - Whether the pattern has the `i` flag.
 * @returns {*} The entry stored for `character` in the `UNICODE_IGNORE_CASE`,
 *   `UNICODE` or `REGULAR` map, depending on the flags.
 */
const getCharacterClassEscapeSet = (character, unicode, ignoreCase) => {
	if (unicode) {
		if (ignoreCase) {
			return ESCAPE_SETS.UNICODE_IGNORE_CASE.get(character);
		}
		return ESCAPE_SETS.UNICODE.get(character);
	}
	return ESCAPE_SETS.REGULAR.get(character);
};

/**
 * Returns the set that `.` matches in Unicode mode.
 *
 * @param {boolean} dotAll - When truthy, line terminators are included.
 * @returns {Object} `UNICODE_SET` or `DOT_SET_UNICODE`. The shared instance is
 *   returned, not a copy.
 */
const getUnicodeDotSet = (dotAll) => {
	return dotAll ? UNICODE_SET : DOT_SET_UNICODE;
};

/**
 * Loads the set for a Unicode property from `regenerate-unicode-properties`.
 *
 * @param {string} property - Property name.
 * @param {string} [value] - Property value; when omitted, `property` is
 *   looked up under `Binary_Property/`.
 * @returns {*} Whatever the required data module exports.
 * @throws {Error} If the data module cannot be loaded.
 */
const getUnicodePropertyValueSet = (property, value) => {
	const path = value ?
		`${ property }/${ value }` :
		`Binary_Property/${ property }`;
	try {
		return require(`regenerate-unicode-properties/${ path }.js`);
	} catch (exception) {
		throw new Error(
			`Failed to recognize value \`${ value }\` for property ` +
			`\`${ property }\`.`
		);
	}
};

/**
 * Resolves a lone `\p{…}` operand (one without `=`).
 *
 * @param {string} value - The text between the braces.
 * @returns {*} The set for the matching `General_Category` value or, failing
 *   that, for the matching binary property.
 * @throws {Error} If `value` is neither.
 */
const handleLoneUnicodePropertyNameOrValue = (value) => {
	// It could be a `General_Category` value or a binary property.
	// Note: `unicodeMatchPropertyValue` throws on invalid values.
	try {
		const property = 'General_Category';
		const category = unicodeMatchPropertyValue(property, value);
		return getUnicodePropertyValueSet(property, category);
	} catch (exception) {
		// Deliberately ignored: fall back to the binary property lookup below.
	}
	// It’s not a `General_Category` value, so check if it’s a binary
	// property. Note: `unicodeMatchProperty` throws on invalid properties.
	const property = unicodeMatchProperty(value);
	return getUnicodePropertyValueSet(property);
};

/**
 * Builds the set for a Unicode property escape.
 *
 * @param {string} value - Either `Name` or `Property=Value`.
 * @param {boolean} isNegative - Whether to return the complement within
 *   `UNICODE_SET`.
 * @returns {Object} A fresh set that the caller may mutate.
 */
const getUnicodePropertyEscapeSet = (value, isNegative) => {
	const parts = value.split('=');
	const firstPart = parts[0];
	let set;
	if (parts.length === 1) {
		set = handleLoneUnicodePropertyNameOrValue(firstPart);
	} else {
		// The pattern consists of two parts, i.e. `Property=Value`.
		const property = unicodeMatchProperty(firstPart);
		const value = unicodeMatchPropertyValue(property, parts[1]);
		set = getUnicodePropertyValueSet(property, value);
	}
	if (isNegative) {
		return UNICODE_SET.clone().remove(set);
	}
	return set.clone();
};

// Given a range of code points, add any case-folded code points in that range
// to a set.
regenerate.prototype.iuAddRange = function(min, max) {
	do {
		const folded = caseFold(min);
		if (folded) {
			this.add(folded);
		}
	} while (++min <= max);
	return this;
};

/**
 * Replaces `item` in place with the parse tree of `pattern`.
 *
 * @param {Object} item - AST node to overwrite (mutated via `Object.assign`).
 * @param {string} pattern - Replacement pattern source.
 */
const update = (item, pattern) => {
	let tree = parse(pattern, config.useUnicodeFlag ? 'u' : '');
	switch (tree.type) {
		case 'characterClass':
		case 'group':
		case 'value':
			// No wrapping needed.
			break;
		default:
			// Wrap the pattern in a non-capturing group.
			tree = wrap(tree, pattern);
	}
	Object.assign(item, tree);
};

/**
 * Wraps a parse tree in a non-capturing group node.
 *
 * @param {Object} tree - The node to wrap.
 * @param {string} pattern - Source text of `tree`, used to build `raw`.
 * @returns {Object} A new `group` node with `behavior: 'ignore'`.
 */
const wrap = (tree, pattern) => {
	// Wrap the pattern in a non-capturing group.
	return {
		'type': 'group',
		'behavior': 'ignore',
		'body': [tree],
		'raw': `(?:${ pattern })`
	};
};

/**
 * Looks up the case-folding mapping for a code point.
 *
 * @param {number} codePoint
 * @returns {*} The entry from `iuMappings`, or `false` when there is none.
 */
const caseFold = (codePoint) => {
	return iuMappings.get(codePoint) || false;
};

/**
 * Rewrites a character class node into an equivalent pattern built from the
 * union of its members.
 *
 * @param {Object} characterClassItem - `characterClass` node (mutated).
 * @param {Object} regenerateOptions - Options passed to `set.toString()`.
 * @returns {Object} The same, updated node.
 * @throws {Error} On an unknown member type.
 */
const processCharacterClass = (characterClassItem, regenerateOptions) => {
	const set = regenerate();
	for (const item of characterClassItem.body) {
		switch (item.type) {
			case 'value':
				set.add(item.codePoint);
				if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
					const folded = caseFold(item.codePoint);
					if (folded) {
						set.add(folded);
					}
				}
				break;
			case 'characterClassRange': {
				const min = item.min.codePoint;
				const max = item.max.codePoint;
				set.addRange(min, max);
				if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
					set.iuAddRange(min, max);
				}
				break;
			}
			case 'characterClassEscape':
				set.add(getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				));
				break;
			case 'unicodePropertyEscape':
				set.add(getUnicodePropertyEscapeSet(item.value, item.negative));
				break;
			// The `default` clause is only here as a safeguard; it should never be
			// reached. Code coverage tools should ignore it.
			/* istanbul ignore next */
			default:
				throw new Error(`Unknown term type: ${ item.type }`);
		}
	}
	if (characterClassItem.negative) {
		// A negated class is emitted as a negative lookahead for the positive
		// set, followed by `[\s\S]`.
		update(characterClassItem, `(?!${set.toString(regenerateOptions)})[\\s\\S]`);
	} else {
		update(characterClassItem, set.toString(regenerateOptions));
	}
	return characterClassItem;
};

/**
 * Turns a named back-reference into a numbered one.
 *
 * @param {Object} item - `reference` node (mutated).
 * @param {number} index - Index of the capturing group it refers to.
 */
const updateNamedReference = (item, index) => {
	delete item.name;
	item.matchIndex = index;
};

/**
 * @param {Object} groups - Group bookkeeping created by `rewritePattern`.
 * @throws {Error} If any tracked named reference never found its group.
 */
const assertNoUnmatchedReferences = (groups) => {
	const unmatchedReferencesNames = Object.keys(groups.unmatchedReferences);
	if (unmatchedReferencesNames.length > 0) {
		throw new Error(`Unknown group names: ${unmatchedReferencesNames}`);
	}
};

/**
 * Recursively rewrites one AST node according to the current `config`.
 *
 * @param {Object} item - AST node (mutated).
 * @param {Object} regenerateOptions - Options passed to `set.toString()`.
 * @param {Object} groups - Group bookkeeping (mutated): `lastIndex`, `names`,
 *   `unmatchedReferences` and the optional `onNamedGroup` callback.
 * @returns {Object} The processed node.
 * @throws {Error} On duplicate group names or an unknown node type.
 */
const processTerm = (item, regenerateOptions, groups) => {
	switch (item.type) {
		case 'dot':
			if (config.useDotAllFlag) {
				break;
			} else if (config.unicode) {
				update(
					item,
					getUnicodeDotSet(config.dotAll).toString(regenerateOptions)
				);
			} else if (config.dotAll) {
				// TODO: consider changing this at the regenerate level.
				update(item, '[\\s\\S]');
			}
			break;
		case 'characterClass':
			item = processCharacterClass(item, regenerateOptions);
			break;
		case 'unicodePropertyEscape':
			if (config.unicodePropertyEscape) {
				update(
					item,
					getUnicodePropertyEscapeSet(item.value, item.negative)
						.toString(regenerateOptions)
				);
			}
			break;
		case 'characterClassEscape':
			update(
				item,
				getCharacterClassEscapeSet(
					item.value,
					config.unicode,
					config.ignoreCase
				).toString(regenerateOptions)
			);
			break;
		case 'group':
			if (item.behavior === 'normal') {
				groups.lastIndex++;
			}
			if (item.name && config.namedGroup) {
				const name = item.name.value;

				if (groups.names[name]) {
					throw new Error(
						`Multiple groups with the same name (${ name }) are not allowed.`
					);
				}

				const index = groups.lastIndex;
				delete item.name;

				groups.names[name] = index;
				if (groups.onNamedGroup) {
					groups.onNamedGroup.call(null, name, index);
				}

				// Resolve references that appeared before this group.
				if (groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name].forEach(reference => {
						updateNamedReference(reference, index);
					});
					delete groups.unmatchedReferences[name];
				}
			}
			/* falls through */
		case 'alternative':
		case 'disjunction':
		case 'quantifier':
			item.body = item.body.map(term => {
				return processTerm(term, regenerateOptions, groups);
			});
			break;
		case 'value': {
			const codePoint = item.codePoint;
			const set = regenerate(codePoint);
			if (config.ignoreCase && config.unicode && !config.useUnicodeFlag) {
				const folded = caseFold(codePoint);
				if (folded) {
					set.add(folded);
				}
			}
			update(item, set.toString(regenerateOptions));
			break;
		}
		case 'reference':
			// Named references are only rewritten when named groups are being
			// transpiled. Group names are only registered in that mode, so
			// tracking references otherwise would make every named reference
			// look unmatched and trigger a bogus "Unknown group names" error.
			if (item.name && config.namedGroup) {
				const name = item.name.value;
				const index = groups.names[name];
				if (index) {
					updateNamedReference(item, index);
					break;
				}

				if (!groups.unmatchedReferences[name]) {
					groups.unmatchedReferences[name] = [];
				}
				// Keep track of references used before the corresponding group.
				groups.unmatchedReferences[name].push(item);
			}
			break;
		case 'anchor':
		case 'empty':
			// Nothing to do here.
			break;
		// The `default` clause is only here as a safeguard; it should never be
		// reached. Code coverage tools should ignore it.
		/* istanbul ignore next */
		default:
			throw new Error(`Unknown term type: ${ item.type }`);
	}
	return item;
};

// Module-level settings, overwritten at the start of every `rewritePattern`
// call and read by the helpers above.
const config = {
	'ignoreCase': false,
	'unicode': false,
	'dotAll': false,
	'useDotAllFlag': false,
	'useUnicodeFlag': false,
	'unicodePropertyEscape': false,
	'namedGroup': false
};

/**
 * Rewrites a regular expression pattern according to `flags` and `options`.
 *
 * @param {string} pattern - Regular expression source.
 * @param {string} [flags] - Flags; `u`, `i` and `s` are inspected here.
 * @param {Object} [options]
 * @param {boolean} [options.dotAllFlag] - Honor the `s` flag by rewriting `.`.
 * @param {boolean} [options.useDotAllFlag] - Leave `.` untouched. Cannot be
 *   combined with `dotAllFlag`.
 * @param {boolean} [options.useUnicodeFlag] - Generate output that itself
 *   relies on the `u` flag.
 * @param {boolean} [options.unicodePropertyEscape] - Rewrite `\p{…}`/`\P{…}`
 *   outside character classes.
 * @param {boolean} [options.namedGroup] - Convert named groups and named
 *   references to numbered ones.
 * @param {boolean} [options.lookbehind] - Passed to the parser as its
 *   `lookbehind` feature.
 * @param {Function} [options.onNamedGroup] - Called with `(name, index)` for
 *   every converted named group.
 * @returns {string} The rewritten pattern.
 * @throws {Error} If both `useDotAllFlag` and `dotAllFlag` are set, or if
 *   processing the pattern fails.
 */
const rewritePattern = (pattern, flags, options) => {
	config.unicode = flags && flags.includes('u');
	const regjsparserFeatures = {
		'unicodePropertyEscape': config.unicode,
		'namedGroups': true,
		'lookbehind': options && options.lookbehind
	};
	config.ignoreCase = flags && flags.includes('i');
	const supportDotAllFlag = options && options.dotAllFlag;
	config.dotAll = supportDotAllFlag && flags && flags.includes('s');
	config.namedGroup = options && options.namedGroup;
	config.useDotAllFlag = options && options.useDotAllFlag;
	config.useUnicodeFlag = options && options.useUnicodeFlag;
	config.unicodePropertyEscape = options && options.unicodePropertyEscape;
	if (supportDotAllFlag && config.useDotAllFlag) {
		throw new Error('`useDotAllFlag` and `dotAllFlag` cannot both be true!');
	}
	const regenerateOptions = {
		'hasUnicodeFlag': config.useUnicodeFlag,
		'bmpOnly': !config.unicode
	};
	const groups = {
		'onNamedGroup': options && options.onNamedGroup,
		'lastIndex': 0,
		'names': Object.create(null), // { [name]: index }
		'unmatchedReferences': Object.create(null) // { [name]: Array<reference> }
	};
	const tree = parse(pattern, flags, regjsparserFeatures);
	// Note: `processTerm` mutates `tree` and `groups`.
	processTerm(tree, regenerateOptions, groups);
	assertNoUnmatchedReferences(groups);
	return generate(tree);
};

module.exports = rewritePattern;