// Module to tokenize a string into CFG symbols /** * Module Dependencies */ var util = require('./util') var symbol = require('./symbol') var maps = require('./maps.json') /** * regexes for Subnormal forms */ var re = { // 12/20 - 12/21, 2012/12 - 2013/12 MMsDDdMMsDD: /(?!\d{1,4}\/\d{1,4}\s*-\s*\d{1,4}\/\d{1,4}\/)(\d{1,4})\/(\d{1,4})\s*-\s*(\d{1,4})\/(\d{1,4})/g, // 12/22 - 23, 2012/10 - 12 MMsDDdDD: /(?!\d{1,4}\/\d{1,4}\s*-\s*\d{1,4}\/)(\d{1,4})\/(\d{1,4})\s*-\s*(\d{1,4})/g, // 12/24, 2012/12 MMsDD: /(?!\d{1,4}\/\d{1,4}\/)(\d{1,4})\/(\d{1,4})/g, // 05:30pm, 0530pm, 1730, 1730pm, 1730[re:h], remove the [re:h] hhcmm: /(\s+\d{1,2}|^\d{1,2})\:?(\d{2})\s*(\S+)*/g } /** * Export `tokenize` */ module.exports = tokenize /** * Parse and tokenize a string into array of valid CFG symbols, in these steps: * 1. parse normal forms * 2. parse subnormal forms * 3. parse english forms * @param {string} str The input string. * @return {JSON} {str, tokensIn, tokensOut, symbols} */ function tokenize (str) { // split num from alphabets str = (' ' + str) .replace(/\s+(\d+)([a-zA-Z]+)/g, ' $1 $2') .replace(/\s+([a-zA-Z]+)(\d+)/g, ' $1 $2') .replace(/\s+/g, ' ') .replace(/^\s+/, '') // 1. 2. parse normal and subnormal forms var p = parseNormal12(str), pStr = p.str, tokens = pStr.split(' '), symbols = [] // clean the non-normal tokens a bit, allow to be wrapped by words only for (var i = 0; i < tokens.length; i++) { if (!tokens[i].match(util.reT)) { tokens[i] = tokens[i].replace(/^\W+/, '').replace(/\W+$/, '') } } // 3. parse english forms for (var i = 0; i < tokens.length; i++) { var tok = tokens[i] var oneGram = tok, twoGram = tok + ' ' + (tokens[i + 1] || ''), oneSym = symbol(oneGram), twoSym = symbol(twoGram) if (twoSym && twoSym.value == oneSym.value) { // if lemmatization must happen for both, // pick the longer, skip next token // skip this once, reset skip i++ symbols.push(symbol(twoGram)) } else { symbols.push(symbol(oneGram)) } } return { str: pStr, tokensOut: p.tokensOut, tokensIn: p.tokensIn, symbols: symbols } } /** * Run 1. parseNormal then 2. parseNormal2, return the parsed string with T-format tokens. * @private * @param {string} str The input string * @return {JSON} Parsed string */ function parseNormal12 (str) { var p1 = parseNormal1(str) // find tokens that are purely normal, and reinject into string var p1TokensOut = p1.tokensOut.filter(notSubnormal) var p1Str = injectNormal(str, p1TokensOut) // now parse the subnormal var p2 = parseNormal2(p1Str, [], []) // the tokens that taken out, and their replacements, in order var pTokensOut = p1.tokensOut.concat(p2.tokensOut) var pTokensIn = p1.tokensIn.concat(p2.tokensIn) return { str: p2.str, tokensOut: pTokensOut, tokensIn: pTokensIn } } /** * 1. Parse normal forms. Try to parse and return a normal Date, parseable from new Date(str), by continuously trimming off its tail and retry until either get a valid date, or string runs out. * Doesn't parse string with length <5 * @private * @param {string} str The input string. * @return {string} A Date in stdT string, or null. */ function parseNormal1 (str) { // keep chopping off tail until either get a valid date, or string runs out // array of parsed date and the string consumed var tokensIn = [], tokensOut = [] // ensure single spacing str = str.replace(/\s+/g, ' ') // tokenize by space var strArr = str.split(/\s+/g) // init the normalDate and head string used var normalDate = null, head = '' // do while there's still string to go while (strArr.length) { head = (head + ' ' + strArr.shift()).trim() try { normalDate = util.stdT(new Date(head)) // Extend head: if parse successful, extend continuously until failure, then that's the longest parseable head string, ... var advanceHead = head + ' ' + strArr[0] while (1) { try { var advanceDate = util.stdT(new Date(advanceHead)) if (advanceDate != 'Invalid Date') { // if advanceDate is parseable, set to current, update heads var normalDate = advanceDate head = head + ' ' + strArr.shift() advanceHead = advanceHead + ' ' + strArr[0] } else { break } } catch (e) { // when fail, just break break } } // Shrink head: from the whole parseable head ..., trim front till we get while (1) { try { if (util.stdT(new Date(head.replace(/^\s*\S+\s*/, ''))) != normalDate) { // front token eaten causes change, dont update head break } else { // update head head = head.replace(/^\s*\S+\s*/, '') } } catch (e) { break } } // only consider a valid parse if the parsed str is long enough if (head.length > 6) { tokensIn.push(normalDate) // get head = only, then reset tokensOut.push(head) } head = '' } catch (e) {} } return { tokensIn: tokensIn, tokensOut: tokensOut } } /** * 2. Parse subnormal forms after parseNormal. Gradually replace tokens of the input string while parseable. * @private */ function parseNormal2 (str, tokensIn, tokensOut) { var m, res if (m = re.MMsDDdMMsDD.exec(str)) { // 12/20 - 12/21 var yMd1 = yMdParse(m[1], m[2]) var yMd2 = yMdParse(m[3], m[4]) res = ' t:' + yMd1 + ',dt: - t:' + yMd2 + ',dt: ' } else if (m = re.MMsDDdDD.exec(str)) { // 12/22 - 23 var yMd1 = yMdParse(m[1], m[2]) var yMd2 = yMdParse(m[1], m[3]) res = ' t:' + yMd1 + ',dt: - t:' + yMd2 + ',dt: ' } else if (m = re.MMsDD.exec(str)) { // if year var yMd = yMdParse(m[1], m[2]) // 12/24 res = ' t:' + yMd + ',dt: ' } else if (m = re.hhcmm.exec(str)) { // 05:30pm, 0530pm, 1730, 1730pm, 1730[re:h], remove the [re:h] res = ' t:' + m[1].trim() + 'h' + m[2] + 'm' + ',dt: ' + (m[3] || '') } else { // exit recursion if hits here return { str: str, tokensIn: tokensIn, tokensOut: tokensOut } } // recurse down till no more substitution (CFG is not cyclic, so ok) tokensOut.push(m[0]) tokensIn.push(res) str = parseNormal2(str.replace(m[0], res), tokensIn, tokensOut).str return { str: str, tokensIn: tokensIn, tokensOut: tokensOut } } // //////////////////// // Helper functions // // //////////////////// /** * Try to parse two tokens for T form into MM/dd, or MM/yyyy if either token hsa length 4. * @private * @param {string} token1 * @param {string} token2 * @return {string} in the form */ function yMdParse (token1, token2) { var part0 = [token1, token2].filter(function (token) { return token.length == 4 }) var part1 = [token1, token2].filter(function (token) { return token.length != 4 }) var y = part0[0] ? part0[0] + 'y' : '' var M = part1[0] + 'M' var d = part1[1] ? part1[1] + 'd' : '' return y + M + d } /** * Check if the dateStr is strictly normal and not subnormal. Used to extract parseNormal2 overrides. * @private * @param {string} dateStr * @return {Boolean} */ function notSubnormal (dateStr) { var subnormalStr = parseNormal2(dateStr, [], []).str // remove T and see if still has words var noT = subnormalStr.replace(/t\:\S*,dt\:\S*(\s*-\s*t\:\S*,dt\:\S*)?/, '') return /\w+/g.exec(noT) != null } /** * Given a string and array of its parsed phrases, convert them into T stdT then T format, and inject into the original string, return. * @private * @param {string} str The original string. * @param {Array} parsedArr The parsed phrases from the string. * @return {string} The string with parsed phrases replaced in T format. * * @example * injectNormal('05 October 2011 14:48 UTC 08/11 2020', [ '05 October 2011 14:48 UTC', '08/11 2020' ]) * // => 't:2011y10M05d14h48m00.000s,dt: t:2020y08M11d04h00m00.000s,dt: ' */ function injectNormal (str, parsedArr) { for (var i = 0; i < parsedArr.length; i++) { var parsed = parsedArr[i] var T = util.stdTtoT(util.stdT(new Date(parsed))) str = str.replace(parsed, T) } return str }