/*
 * ====== Javascript useful libs ======
 *
 * ===== Converting English number sentences to numeric digits =====
 *
 * This piece of code converts English number sentences like
 * "one hundred forty two point three" to their numeric representation: "142.3".
 *
 * This is just a copy & paste from a
 * [[https://gist.github.com/getify/83e6fae8f54c1d38f89132a130493f98|Github gist]].
 * Saving it here for archiving purposes.
 * All credits go to [[https://gist.github.com/getify|Kyle Simpson]]
 */

"use strict";

// single-digit words ("o" is the spoken "oh" for zero)
var digits = {
	"o": "0",
	"zero": "0",
	"one": "1",
	"two": "2",
	"three": "3",
	"four": "4",
	"five": "5",
	"six": "6",
	"seven": "7",
	"eight": "8",
	"nine": "9",
};

// words for 10..19; these can never be extended by a following digit word
var tens = {
	"ten": "10",
	"eleven": "11",
	"twelve": "12",
	"thirteen": "13",
	"fourteen": "14",
	"fifteen": "15",
	"sixteen": "16",
	"seventeen": "17",
	"eighteen": "18",
	"nineteen": "19",
};

// multiples of ten; a following digit word may replace the trailing zero
var doubles = {
	"twenty": "20",
	"thirty": "30",
	"forty": "40",
	"fifty": "50",
	"sixty": "60",
	"seventy": "70",
	"eighty": "80",
	"ninety": "90",
};

// unit-places, ordered from lowest to highest; "hundred" doubles as the
// name of the lowest three-digit group
var units = [
	"hundred",
	"thousand",
	"million",
	"billion",
	"trillion",
	"quadrillion",
];

/**
 * Convert an English number sentence to a string of digits.
 *
 * @param {string} numstr - sentence such as "one hundred forty two point three"
 * @param {string} [separator=""] - text inserted between three-digit groups
 *   of the whole-number part (e.g. "," to get "65,536")
 * @returns {string} the digits, without leading zeros; "0" is kept in front
 *   of a decimal point when the whole-number part is zero
 * @throws {Error} "Invalid! ..." when `parse()` rejects the sentence
 */
function convert(numstr,separator = "") {
	var ast = parse(numstr);

	// collect the whole-number groups and the decimal part separately, so
	// leading zeros can be normalized BEFORE the separator is inserted
	// (otherwise "zero thousand five" with "," would come out as ",005")
	var groups = [];
	var fraction = "";
	for (let node = ast; node; node = node.and) {
		if (node.unit == "decimal") {
			fraction += "." + (node.value || "0");
		}
		else {
			// a placeholder node without a value is an all-zero group
			groups.push(node.value || "000");
		}
	}

	// normalize leading zeros: drop leading all-zero groups, then strip the
	// leading zeros of the first remaining group
	while (groups.length > 1 && /^0+$/.test(groups[0])) {
		groups.shift();
	}
	if (groups.length > 0) {
		groups[0] = groups[0].replace(/^0+/,"");
	}

	return (groups.join(separator) || "0") + fraction;
}

/**
 * Own-property membership test for the word tables.
 *
 * The `in` operator also matches inherited names such as "constructor",
 * which would let a non-number word through the tokenizer.
 *
 * @param {Object} table - one of `digits`, `tens`, `doubles`
 * @param {string} word - lower-cased input word
 * @returns {boolean} true when `word` is a key defined on `table` itself
 */
function hasOwnWord(table,word) {
	return Object.prototype.hasOwnProperty.call(table,word);
}

/**
 * Parse an English number sentence into a linked list ("AST") of unit-place
 * nodes, ordered from the highest unit-place down to "hundred", optionally
 * followed by one "decimal" node.
 *
 * Each node has the shape `{ unit, value, and }`: `unit` is an entry of
 * `units` or "decimal", `value` is a digit string (absent on placeholder
 * nodes), and `and` links to the next lower node.
 *
 * @param {string} numstr - the sentence; punctuation is stripped, and
 *   whitespace and hyphens both separate words
 * @returns {Object} the head node of the list
 * @throws {Error} "Invalid! ..." on an unrecognized word, a second decimal
 *   point, a unit word after the decimal point, or an inconsistent ordering
 *   of unit-places (this includes empty input)
 */
function parse(numstr) {
	var words = numstr.trim().replace(/[^\-0-9a-z\s]+/ig,"").toLowerCase().split(/[\s\-]+/).filter(Boolean);

	// (STEP 1) tokenize the string
	//
	// a token is `{ type, value, complete, unit? }`; `complete: false` means
	// a following word may still be merged into this token's value
	var tokens = [];
	var inDecimal = false;
	for (let word of words) {
		let curToken = tokens[tokens.length - 1];

		if (word == "point" || word == "dot") {
			if (curToken && !curToken.complete) {
				if (!curToken.unit) {
					curToken.unit = "hundred";
				}
				curToken.complete = true;
			}
			if (!inDecimal) {
				inDecimal = true;
				tokens.push({
					type: "point",
					value: ".",
					complete: true,
				});
			}
			else {
				throw new Error("Invalid! " + word);
			}
		}
		else if (word == "o" || word == "zero") {
			// a zero never merges into the previous token; it only closes it
			if (curToken && !curToken.complete) {
				tokens.push({
					type: "digit",
					value: "0",
					complete: true,
				});
				curToken.complete = true;
			}
			else {
				tokens.push({
					type: "digit",
					value: "0",
					complete: true,
				});
			}
		}
		else if (hasOwnWord(digits,word)) {
			if (curToken && !curToken.complete) {
				// replace a trailing zero (from a double or hundred)?
				if (curToken.value.endsWith("0")) {
					curToken.value = curToken.value.slice(0,-1) + digits[word];
					curToken.complete = true;
				}
				else {
					tokens.push({
						type: "digit",
						value: digits[word],
						complete: true,
					});
					curToken.complete = true;
				}
			}
			else {
				tokens.push({
					type: "digit",
					value: digits[word],
					complete: true,
				});
			}
		}
		else if (hasOwnWord(tens,word)) {
			if (curToken && !curToken.complete) {
				// replace two trailing zeros (from a hundred)?
				if (curToken.value.endsWith("00")) {
					curToken.value = curToken.value.slice(0,1) + tens[word];
					curToken.complete = true;
				}
				else {
					tokens.push({
						type: "ten",
						value: tens[word],
						complete: true,
					});
					curToken.complete = true;
				}
			}
			// promote a single digit to a complete triple?
			else if (curToken && !curToken.unit && curToken.type == "digit") {
				curToken.type = "triple";
				curToken.value = curToken.value.slice(0,1) + tens[word];
			}
			else {
				tokens.push({
					type: "ten",
					value: tens[word],
					complete: true,
				});
			}
		}
		else if (hasOwnWord(doubles,word)) {
			if (curToken && !curToken.complete) {
				// replace two trailing zeros (from a triple)?
				if (curToken.value.endsWith("00")) {
					curToken.value = curToken.value.slice(0,1) + doubles[word];
					// NOTE: leave complete:false since a digit can complete a double
				}
				else {
					tokens.push({
						type: "double",
						value: doubles[word],
						complete: false,
					});
					curToken.complete = true;
				}
			}
			// promote a single digit to an incomplete triple?
			else if (curToken && !curToken.unit && curToken.type == "digit") {
				curToken.type = "triple";
				curToken.value = curToken.value.slice(0,1) + doubles[word];
				curToken.complete = false;
			}
			else {
				tokens.push({
					type: "double",
					value: doubles[word],
					complete: false,
				});
			}
		}
		else if (!inDecimal) {
			if (word == "hundred") {
				if (curToken && !curToken.complete) {
					curToken.complete = true;
					tokens.push({
						type: "triple",
						value: "100",
						complete: false,
					});
				}
				// promote a single digit to an incomplete triple?
				else if (curToken && !curToken.unit && curToken.type == "digit") {
					curToken.type = "triple";
					curToken.value = curToken.value.slice(0,1) + "00";
					curToken.complete = false;
				}
				else {
					tokens.push({
						type: "triple",
						value: "100",
						complete: false,
					});
				}
			}
			// thousand, million, etc
			else if (units.includes(word)) {
				if (curToken) {
					curToken.unit = word;
					curToken.complete = true;
				}
				else {
					// a bare unit word at the start counts as "one <unit>"
					tokens.push({
						type: "digit",
						unit: word,
						value: "1",
						complete: true,
					});
				}
			}
			// harmless conjunction word?
			else if (word == "and") {
				continue;
			}
			// unrecognized/invalid word
			else {
				throw new Error("Invalid! " + word);
			}
		}
		// word not allowed while tokenizing decimal values
		else {
			throw new Error("Invalid! " + word);
		}
	}

	// (STEP 2) parse the token list into an AST
	var ast = {};
	var curNode = ast;
	for (let tokenIdx = 0; tokenIdx < tokens.length; tokenIdx++) {
		let token = tokens[tokenIdx];
		let nextToken = tokens[tokenIdx + 1];

		// token indicates an assigned unit-place?
		if (token.unit) {
			// current node has no assigned unit-place?
			if (!curNode.unit) {
				curNode.unit = token.unit;
				// only the head group keeps its natural width; every lower
				// group must be exactly three digits wide
				curNode.value = (
					curNode == ast ?
						token.value :
						token.value.padStart(3,"0")
				);
				let unit = nextUnit(token.unit);
				if (unit) {
					// create next placeholder node
					curNode = curNode.and = {
						unit,
					};
				}
			}
			// token unit same as current node?
			else if (token.unit == curNode.unit) {
				// current node is a placeholder that has not yet
				// been assigned a value from token?
				if (!curNode.value) {
					curNode.value = (
						curNode == ast ?
							token.value :
							token.value.padStart(3,"0")
					);
					let unit = nextUnit(token.unit);
					if (unit) {
						// create next placeholder node
						curNode = curNode.and = {
							unit,
						};
					}
				}
				else {
					throw new Error("Invalid! " + token.unit);
				}
			}
			// current node is different (higher?) unit place
			// than token?
			else {
				// attempt to generate missing unit node(s)
				let [ tree, leaf,] = generateMissingUnitNodes(curNode.unit,token.unit);
				if (tree) {
					curNode.and = tree.and;
					curNode = leaf;
					// the leaf is never the head node here, so its group
					// must be zero-padded to three digits; without this,
					// "five billion three thousand" yields "50003000"
					curNode.value = token.value.padStart(3,"0");
				}
				else {
					throw new Error("Invalid! " + token.unit);
				}
			}
		}
		// decimal point?
		else if (token.type == "point") {
			// current node has no unit-place assigned yet?
			if (!curNode.unit) {
				curNode.unit = "hundred";
				curNode = curNode.and = {
					unit: "decimal",
					value: "",
				};
			}
			else if (curNode.unit == "hundred") {
				curNode = curNode.and = {
					unit: "decimal",
					value: "",
				};
			}
			else {
				// attempt to generate missing unit-place node(s)
				let [ tree, leaf,] = generateMissingUnitNodes(curNode.unit,"hundred");
				if (tree) {
					curNode.and = tree.and;
					curNode = leaf;
					curNode = curNode.and = {
						unit: "decimal",
						value: "",
					};
				}
				else {
					throw new Error("Invalid! " + token.type);
				}
			}
		}
		// separate digit?
		else if (token.type == "digit") {
			// append digit to the decimal node?
			if (curNode.unit == "decimal") {
				// look-ahead to collect all consecutive digits, if any
				let digitTokens = collectConsecutiveDigits(tokens,tokenIdx);
				tokenIdx += (digitTokens.length - 1);

				// add digit token(s) to current node
				for (let digit of digitTokens) {
					curNode.value = (curNode.value || "") + digit.value;
				}
			}
			// multiple adjacent (non-decimal) digits?
			else if (
				nextToken &&
				nextToken.type == "digit"
			) {
				// current node is "empty", so we can implicitly
				// create arbitrary unit-place segment(s) from multiple
				// digits?
				if (!curNode.unit) {
					// look-ahead to collect all consecutive digits
					let digitTokens = collectConsecutiveDigits(tokens,tokenIdx);
					tokenIdx += (digitTokens.length - 1);

					// skip any leading zeros (since we're at the
					// start of the number)
					let firstNonZeroDigitIdx = digitTokens.findIndex(digit => digit.value != "0");
					if (firstNonZeroDigitIdx > 0) {
						digitTokens = digitTokens.slice(firstNonZeroDigitIdx);
					}

					// any digits remain to be added to the AST?
					if (digitTokens.length > 0) {
						// determine how many unit-place groups are needed
						let numGroups = Math.ceil(
							Math.min(digitTokens.length,units.length * 3) / 3
						);
						// determine number of digits in first group
						// (digits beyond the highest unit-place all go
						// into the first group)
						let groupSize = (
							digitTokens.length > (units.length * 3) ?
								digitTokens.length - (units.length * 3) + 3 :
								digitTokens.length % 3 || 3
						);

						// create the necessary unit-place nodes in the AST
						let [ tree, leaf ] = generateMissingUnitNodes(
							units[
								Math.min(units.length - 1,numGroups - 1)
							],
							"hundred"
						);
						if (tree) {
							curNode.unit = tree.unit;
							curNode.value = "";
							if (tree.and) {
								curNode.and = tree.and;
							}

							// fill in the unit-place groups to the AST
							do {
								// collect a group of digits into current node
								let digitGroup = digitTokens.slice(0,groupSize);
								digitTokens = digitTokens.slice(groupSize);
								curNode.value = digitGroup.reduce((val,digit) => val + digit.value,"");

								// more digits to add as a unit-place group?
								if (curNode.and && digitTokens.length > 0) {
									curNode = curNode.and;
									// from here forward, all digit groups are
									// fixed size of 3
									groupSize = 3;
								}
							}
							// keep going while digits remain to be grouped
							while (digitTokens.length > 0);
						}
					}
					else {
						// NOTE: should never get here
						throw new Error("Invalid! " + token.value);
					}
				}
				else {
					// look-ahead to collect up to 3 consecutive digits
					let digitTokens = collectConsecutiveDigits(tokens,tokenIdx,/*limit=*/3);
					tokenIdx += (digitTokens.length - 1);

					// combine digits into a single value
					let val = digitTokens.reduce((val,digit) => val + digit.value,"");

					// assign combined-digits to "hundred" unit-place node
					curNode = assignHundredUnitPlaceNode(
						curNode,
						// zero-pad the value
						val.padStart(3,"0")
					);
				}
			}
			else {
				// assign single digit to "hundred" unit-place node
				curNode = assignHundredUnitPlaceNode(
					curNode,
					// zero-pad the value
					token.value.padStart(3,"0")
				);
			}
		}
		// stand-alone ten or double token?
		else if (token.type == "ten" || token.type == "double") {
			// append numbers to the decimal node?
			if (curNode.unit == "decimal") {
				curNode.value += token.value;
			}
			// literal/year form:
			//   * "seventeen nineteen"
			//   * "seventeen thirty"
			//   * "twenty fourteen"
			//   * "twenty fifty"
			else if (
				nextToken &&
				(nextToken.type == "ten" || nextToken.type == "double")
			) {
				if (!curNode.unit) {
					curNode.unit = "thousand";
					curNode.value = token.value.slice(0,1);
					curNode = curNode.and = {
						unit: "hundred",
						value: token.value.slice(1) + nextToken.value,
					};
					tokenIdx += 1; // lookahead: 1 spot
				}
				else {
					throw new Error("Invalid! " + token.value);
				}
			}
			// ten/double followed by:
			//   * any 3 digits
			//   * '0' plus another digit
			else if (
				!curNode.unit &&
				nextToken &&
				nextToken.type == "digit" &&
				!nextToken.unit
			) {
				let tokenN2 = tokens[tokenIdx + 2];
				let tokenN3 = tokens[tokenIdx + 3];

				// any 3 digits
				if (
					tokenN2 &&
					tokenN2.type == "digit" &&
					tokenN3 &&
					tokenN3.type == "digit"
				) {
					curNode.unit = "thousand";
					curNode.value = token.value;
					curNode = curNode.and = {
						unit: "hundred",
						value: nextToken.value + tokenN2.value + tokenN3.value,
					};
					tokenIdx += 3; // lookahead: 3 spots
				}
				// '0' plus another digit
				else if (
					nextToken.value == "0" &&
					tokenN2 &&
					tokenN2.type == "digit"
				) {
					curNode.unit = "thousand";
					curNode.value = token.value.slice(0,1);
					curNode = curNode.and = {
						unit: "hundred",
						value: token.value.slice(1) + nextToken.value + tokenN2.value,
					};
					tokenIdx += 2; // lookahead: 2 spots
				}
				else {
					throw new Error("Invalid! " + token.value);
				}
			}
			// assumed "thousand" unit:
			//   * "thirteen nine forty two"
			//   * "thirty nine two o six"
			else if (
				!curNode.unit &&
				nextToken &&
				nextToken.type == "triple" &&
				!nextToken.unit
			) {
				curNode.unit = "thousand";
				curNode.value = token.value;
				curNode = curNode.and = {
					unit: "hundred",
					value: nextToken.value.padStart(3,"0"),
				};
				tokenIdx += 1; // lookahead: 1 spot
			}
			else {
				// assign ten/double value to "hundred" unit-place node
				curNode = assignHundredUnitPlaceNode(
					curNode,
					// zero-pad the value
					token.value.padStart(3,"0")
				);
			}
		}
		else if (token.type == "triple") {
			if (curNode.unit == "decimal") {
				curNode.value += token.value;
			}
			else {
				// assign triple value to "hundred" unit-place node
				curNode = assignHundredUnitPlaceNode(
					curNode,
					// zero-pad the value
					token.value.padStart(3,"0")
				);
			}
		}
		else {
			// NOTE: should never get here
			throw new Error("Invalid! " + token.type);
		}
	}

	// append missing AST nodes (if any)
	if (![ "hundred", "decimal" ].includes(curNode.unit)) {
		let [ tree ] = generateMissingUnitNodes(curNode.unit,"hundred");
		if (tree) {
			curNode.and = tree.and;
		}
		else {
			throw new Error("Invalid! " + curNode.value);
		}
	}

	return ast;
}

/**
 * Store `val` in the "hundred" unit-place node, creating the placeholder
 * nodes between `curNode` and "hundred" when needed.
 *
 * @param {Object} curNode - current tail node of the AST (mutated)
 * @param {string} val - zero-padded digit string for the "hundred" group
 * @returns {Object} the "hundred" node that received `val`
 * @throws {Error} "Invalid! <val>" when the "hundred" node already has a
 *   value, or cannot be reached from `curNode`
 */
function assignHundredUnitPlaceNode(curNode,val) {
	if (curNode.unit != "hundred") {
		// current node is "empty", so we can assign it
		// as the "hundred" unit-place node
		if (!curNode.unit && !curNode.value) {
			curNode.unit = "hundred";
			curNode.value = val;
			return curNode;
		}
		else {
			// attempt to generate missing unit node(s)
			let [ tree, leaf,] = generateMissingUnitNodes(curNode.unit,"hundred");
			if (tree) {
				curNode.and = tree.and;
				curNode = leaf;
			}
			else {
				throw new Error("Invalid! " + val);
			}
		}
	}

	// current node is a placeholder in the "hundred"
	// unit-place, that has not yet been assigned any
	// value from a token?
	if (!curNode.value) {
		curNode.value = val;
		return curNode;
	}
	else {
		throw new Error("Invalid! " + val);
	}
}

/**
 * Collect the token at `tokenIdx` plus the run of directly following
 * "digit" tokens that carry no unit.
 *
 * @param {Object[]} tokens - the full token list
 * @param {number} tokenIdx - index of the first token (always included)
 * @param {number} [limit=Number.MAX_SAFE_INTEGER] - maximum tokens returned
 * @returns {Object[]} the collected tokens, in order
 */
function collectConsecutiveDigits(tokens,tokenIdx,limit = Number.MAX_SAFE_INTEGER) {
	var digitTokens = [ tokens[tokenIdx], ];
	for (
		let adjIdx = tokenIdx + 1;
		(
			adjIdx < tokens.length &&
			tokens[adjIdx].type == "digit" &&
			!tokens[adjIdx].unit &&
			digitTokens.length < limit
		);
		adjIdx++
	) {
		digitTokens.push(tokens[adjIdx]);
	}
	return digitTokens;
}

/**
 * Build a fresh chain of value-less nodes from `curUnit` down to
 * `targetUnit` (both inclusive).
 *
 * @param {string} curUnit - unit-place to start from
 * @param {string} targetUnit - lower unit-place to reach
 * @returns {Array} `[ tree, leaf ]` (first and last node of the chain), or
 *   `[]` when `targetUnit` cannot be reached by stepping down from `curUnit`
 */
function generateMissingUnitNodes(curUnit,targetUnit) {
	var unit = curUnit;
	var tree = {
		unit: curUnit,
	};
	var leaf = tree;
	while (unit && unit != targetUnit) {
		unit = nextUnit(unit);
		if (unit) {
			leaf = leaf.and = {
				unit,
			};
		}
	}
	if (unit && unit == targetUnit) {
		return [ tree, leaf ];
	}
	return [];
}

/**
 * Get the unit-place directly below `unit`.
 *
 * @param {string} unit - a unit-place name
 * @returns {string|undefined} the next lower entry of `units`; undefined for
 *   "hundred" and for names not in `units`
 */
function nextUnit(unit) {
	var unitIdx = units.indexOf(unit);
	if (unitIdx > 0) {
		return units[unitIdx - 1];
	}
}

// To run:

convert("one hundred five");                    // "105"
convert("six hundred and fifty three");         // "653"
convert("zero zero one two three");             // "123"
convert("twelve o three");                      // "1203"
convert("thirteen zero nine");                  // "1309"
convert("fifteen sixteen");                     // "1516"
convert("fourteen ninety two");                 // "1492"
convert("nineteen ten");                        // "1910"
convert("twenty twenty");                       // "2020" <---- ugh!
convert("twenty twenty one");                   // "2021" <---- ehhh...
convert("twenty twenty two");                   // "2022" <---- let's hope!
convert("four five two three eight");           // "45238"
convert("sixteen thousand three eighty four");  // "16384"
convert("seven billion six hundred eighty-one million");  // "7681000000"
convert("twenty three trillion and nine");      // "23000000000009"
convert("four billion two hundred nine thousand");        // "4000209000"
convert("nine hundred ninety nine quadrillion nine ninety nine trillion nine hundred and ninety nine billion nine ninety-nine million nine hundred ninety-nine thousand nine ninety nine");  // "999999999999999999"
convert("one two three four five six seven eight nine eight seven six five four three two one two three four five");  // "123456789876543212345"
convert("forty two point zero");                // "42.0"
convert("three point one four one five nine two six");    // "3.1415926"
convert("point");                               // "0.0"
convert("four point zero o o o zero");          // "4.00000"
convert("sixty five thousand five thirty six",",");       // "65,536"
convert("four billion two hundred nine thousand",",");    // "4,000,209,000"
convert("forty two",",");                       // "42"
convert("twenty one twenty three",",");         // "2,123"
convert("one two three four five six seven eight nine eight seven six five four three two one two three four five",",");  // "123456,789,876,543,212,345" <---- not a mistake, quadrillion is the highest supported "place"