javascript:libs
This is an old revision of the document!
Javascript useful libs
Converting English number sentences to numeric digits
This piece of code converts English number sentences like “one hundred forty two point three” to their numeric representation: “142.3”.
This is just a copy & paste from a GitHub gist, saved here for archiving. All credit goes to Kyle Simpson.
"use strict";

// Word tables. Values are digit *strings* because the parser assembles
// numbers by string slicing/concatenation, never by arithmetic.
const digits = {
  "o": "0",
  "zero": "0",
  "one": "1",
  "two": "2",
  "three": "3",
  "four": "4",
  "five": "5",
  "six": "6",
  "seven": "7",
  "eight": "8",
  "nine": "9",
};

const tens = {
  "ten": "10",
  "eleven": "11",
  "twelve": "12",
  "thirteen": "13",
  "fourteen": "14",
  "fifteen": "15",
  "sixteen": "16",
  "seventeen": "17",
  "eighteen": "18",
  "nineteen": "19",
};

const doubles = {
  "twenty": "20",
  "thirty": "30",
  "forty": "40",
  "fifty": "50",
  "sixty": "60",
  "seventy": "70",
  "eighty": "80",
  "ninety": "90",
};

// Unit-places ordered from lowest to highest; `nextUnit()` walks this
// list downwards.
const units = [
  "hundred",
  "thousand",
  "million",
  "billion",
  "trillion",
  "quadrillion",
];

/**
 * Own-property lookup for the word tables.
 *
 * The `in` operator also matches inherited keys, so a word such as
 * "constructor" would be treated as a number word; this check does not.
 *
 * @param {Object} table - one of the word tables above
 * @param {string} word - lower-cased input word
 * @returns {boolean} true when `word` is a key defined directly on `table`
 */
function isTableWord(table, word) {
  return Object.prototype.hasOwnProperty.call(table, word);
}

/**
 * Converts an English number phrase to a string of digits.
 *
 * @param {string} numstr - phrase such as "one hundred forty two point three"
 * @param {string} [separator=""] - text inserted between unit-place groups
 *   (for example "," to get "65,536")
 * @returns {string} the digits, with leading zeros removed
 * @throws {Error} "Invalid! ..." when the phrase cannot be parsed
 */
function convert(numstr, separator = "") {
  let numberDigits = "";

  // the AST is a linked list of unit-place nodes chained through `.and`
  for (let node = parse(numstr); node; node = node.and) {
    if (node.unit === "decimal") {
      numberDigits += "." + (node.value || "0");
    } else {
      // placeholder nodes carry no value and render as "000"
      numberDigits += (numberDigits !== "" ? separator : "") + (node.value || "000");
    }
  }

  // normalize leading zeros
  return numberDigits.replace(/^0+/, "").replace(/^\./, "0.") || "0";
}

/**
 * Splits the phrase into words and folds them into tokens.
 *
 * Each token is `{ type, value, complete, unit? }` where `type` is one of
 * "digit", "ten", "double", "triple" or "point". `complete: false` marks a
 * token whose trailing zero(s) may still be replaced by a following word.
 *
 * @param {string} numstr - raw input phrase
 * @returns {Array<Object>} token list
 * @throws {Error} on an unrecognized word, a second decimal point, or a
 *   unit word after the decimal point
 */
function tokenize(numstr) {
  const words = numstr
    .trim()
    .replace(/[^\-0-9a-z\s]+/ig, "")
    .toLowerCase()
    .split(/[\s\-]+/)
    .filter(Boolean);

  const tokens = [];
  let inDecimal = false;

  for (const word of words) {
    const curToken = tokens[tokens.length - 1];
    // previous token can still absorb this word?
    const isOpen = Boolean(curToken && !curToken.complete);
    // previous token is a lone digit that can be promoted to a triple?
    const isBareDigit = Boolean(
      curToken && !curToken.unit && curToken.type === "digit"
    );

    if (word === "point" || word === "dot") {
      if (isOpen) {
        if (!curToken.unit) {
          curToken.unit = "hundred";
        }
        curToken.complete = true;
      }
      if (inDecimal) {
        throw new Error(`Invalid! ${word}`);
      }
      inDecimal = true;
      tokens.push({ type: "point", value: ".", complete: true });
    }
    else if (word === "o" || word === "zero") {
      // a zero never merges into the previous token; it only closes it
      tokens.push({ type: "digit", value: "0", complete: true });
      if (isOpen) {
        curToken.complete = true;
      }
    }
    else if (isTableWord(digits, word)) {
      // replace a trailing zero (from a double or hundred)?
      if (isOpen && curToken.value.endsWith("0")) {
        curToken.value = curToken.value.slice(0, -1) + digits[word];
      } else {
        tokens.push({ type: "digit", value: digits[word], complete: true });
      }
      if (isOpen) {
        curToken.complete = true;
      }
    }
    else if (isTableWord(tens, word)) {
      if (isOpen) {
        // replace two trailing zeros (from a hundred)?
        if (curToken.value.endsWith("00")) {
          curToken.value = curToken.value.slice(0, 1) + tens[word];
        } else {
          tokens.push({ type: "ten", value: tens[word], complete: true });
        }
        curToken.complete = true;
      }
      // promote a single digit to a complete triple?
      else if (isBareDigit) {
        curToken.type = "triple";
        curToken.value = curToken.value.slice(0, 1) + tens[word];
      }
      else {
        tokens.push({ type: "ten", value: tens[word], complete: true });
      }
    }
    else if (isTableWord(doubles, word)) {
      if (isOpen) {
        // replace two trailing zeros (from a triple)?
        if (curToken.value.endsWith("00")) {
          curToken.value = curToken.value.slice(0, 1) + doubles[word];
          // NOTE: leave complete:false since a digit can complete a double
        } else {
          tokens.push({ type: "double", value: doubles[word], complete: false });
          curToken.complete = true;
        }
      }
      // promote a single digit to an incomplete triple?
      else if (isBareDigit) {
        curToken.type = "triple";
        curToken.value = curToken.value.slice(0, 1) + doubles[word];
        curToken.complete = false;
      }
      else {
        tokens.push({ type: "double", value: doubles[word], complete: false });
      }
    }
    else if (!inDecimal && word === "hundred") {
      if (isOpen) {
        curToken.complete = true;
        tokens.push({ type: "triple", value: "100", complete: false });
      }
      // promote a single digit to an incomplete triple?
      else if (isBareDigit) {
        curToken.type = "triple";
        curToken.value = curToken.value.slice(0, 1) + "00";
        curToken.complete = false;
      }
      else {
        tokens.push({ type: "triple", value: "100", complete: false });
      }
    }
    // thousand, million, etc
    else if (!inDecimal && units.includes(word)) {
      if (curToken) {
        curToken.unit = word;
        curToken.complete = true;
      } else {
        // a bare unit word ("thousand") means one of that unit
        tokens.push({ type: "digit", unit: word, value: "1", complete: true });
      }
    }
    // "and" is a harmless conjunction before the decimal point; every
    // other word here (and any such word after the point) is invalid
    else if (inDecimal || word !== "and") {
      throw new Error(`Invalid! ${word}`);
    }
  }

  return tokens;
}

/**
 * Parses an English number phrase into an AST.
 *
 * The AST is a singly linked list of nodes `{ unit, value?, and? }` running
 * from the highest unit-place down to "hundred", optionally followed by a
 * "decimal" node. Nodes without a `value` are placeholders.
 *
 * @param {string} numstr - raw input phrase
 * @returns {Object} head node of the AST
 * @throws {Error} "Invalid! ..." when the tokens do not form a number
 */
function parse(numstr) {
  // (STEP 1) tokenize the string
  const tokens = tokenize(numstr);

  // (STEP 2) parse the token list into an AST
  const ast = {};
  let curNode = ast;

  for (let tokenIdx = 0; tokenIdx < tokens.length; tokenIdx++) {
    const token = tokens[tokenIdx];
    const nextToken = tokens[tokenIdx + 1];

    // token indicates an assigned unit-place?
    if (token.unit) {
      // current node has no assigned unit-place?
      if (!curNode.unit) {
        curNode.unit = token.unit;
        // only the leading group is left unpadded
        curNode.value = (
          curNode === ast ? token.value : token.value.padStart(3, "0")
        );
        const unit = nextUnit(token.unit);
        if (unit) {
          // create next placeholder node
          curNode = curNode.and = { unit };
        }
      }
      // token unit same as current node?
      else if (token.unit === curNode.unit) {
        // current node is a placeholder that has not yet
        // been assigned a value from token?
        if (!curNode.value) {
          curNode.value = (
            curNode === ast ? token.value : token.value.padStart(3, "0")
          );
          const unit = nextUnit(token.unit);
          if (unit) {
            // create next placeholder node
            curNode = curNode.and = { unit };
          }
        } else {
          throw new Error(`Invalid! ${token.unit}`);
        }
      }
      // current node is different (higher?) unit place than token?
      else {
        // attempt to generate missing unit node(s)
        const [tree, leaf] = generateMissingUnitNodes(curNode.unit, token.unit);
        if (tree) {
          curNode.and = tree.and;
          curNode = leaf;
          // `leaf` is never the head node here, so the group must be
          // zero-padded ("one billion five thousand" -> ...,005,...)
          curNode.value = token.value.padStart(3, "0");
        } else {
          throw new Error(`Invalid! ${token.unit}`);
        }
      }
    }
    // decimal point?
    else if (token.type === "point") {
      // current node has no unit-place assigned yet?
      if (!curNode.unit) {
        curNode.unit = "hundred";
        curNode = curNode.and = { unit: "decimal", value: "" };
      }
      else if (curNode.unit === "hundred") {
        curNode = curNode.and = { unit: "decimal", value: "" };
      }
      else {
        // attempt to generate missing unit-place node(s)
        const [tree, leaf] = generateMissingUnitNodes(curNode.unit, "hundred");
        if (tree) {
          curNode.and = tree.and;
          curNode = leaf;
          curNode = curNode.and = { unit: "decimal", value: "" };
        } else {
          throw new Error(`Invalid! ${token.type}`);
        }
      }
    }
    // separate digit?
    else if (token.type === "digit") {
      // append digit to the decimal node?
      if (curNode.unit === "decimal") {
        // look-ahead to collect all consecutive digits, if any
        const digitTokens = collectConsecutiveDigits(tokens, tokenIdx);
        tokenIdx += (digitTokens.length - 1);

        // add digit token(s) to current node
        for (const digit of digitTokens) {
          curNode.value = (curNode.value || "") + digit.value;
        }
      }
      // multiple adjacent (non-decimal) digits?
      else if (nextToken && nextToken.type === "digit") {
        // current node is "empty", so we can implicitly create arbitrary
        // unit-place segment(s) from multiple digits?
        if (!curNode.unit) {
          // look-ahead to collect all consecutive digits
          let digitTokens = collectConsecutiveDigits(tokens, tokenIdx);
          tokenIdx += (digitTokens.length - 1);

          // skip any leading zeros (since we're at the start of the number)
          const firstNonZeroDigitIdx = digitTokens.findIndex(
            (digit) => digit.value !== "0"
          );
          if (firstNonZeroDigitIdx > 0) {
            digitTokens = digitTokens.slice(firstNonZeroDigitIdx);
          }

          // any digits remain to be added to the AST?
          if (digitTokens.length > 0) {
            const maxGroupedDigits = units.length * 3;

            // determine how many unit-place groups are needed
            const numGroups = Math.ceil(
              Math.min(digitTokens.length, maxGroupedDigits) / 3
            );

            // determine number of digits in first group; overflow beyond
            // the highest unit-place is packed into that first group
            let groupSize = (
              digitTokens.length > maxGroupedDigits ?
                digitTokens.length - maxGroupedDigits + 3 :
                (digitTokens.length % 3 || 3)
            );

            // create the necessary unit-place nodes in the AST
            const [tree] = generateMissingUnitNodes(
              units[Math.min(units.length - 1, numGroups - 1)],
              "hundred"
            );
            if (tree) {
              curNode.unit = tree.unit;
              curNode.value = "";
              if (tree.and) {
                curNode.and = tree.and;
              }

              // fill in the unit-place groups to the AST
              do {
                // collect a group of digits into current node
                const digitGroup = digitTokens.slice(0, groupSize);
                digitTokens = digitTokens.slice(groupSize);
                curNode.value = digitGroup.reduce(
                  (val, digit) => val + digit.value,
                  ""
                );

                // more digits to add as a unit-place group?
                if (curNode.and && digitTokens.length > 0) {
                  curNode = curNode.and;
                  // from here forward, all digit groups are fixed size of 3
                  groupSize = 3;
                }
              }
              // keep going while digits remain to be grouped
              while (digitTokens.length > 0);
            }
          }
          else {
            // NOTE: should never get here
            throw new Error(`Invalid! ${token.value}`);
          }
        }
        else {
          // look-ahead to collect up to 3 consecutive digits
          const digitTokens = collectConsecutiveDigits(tokens, tokenIdx, /*limit=*/3);
          tokenIdx += (digitTokens.length - 1);

          // combine digits into a single value
          const combined = digitTokens.reduce(
            (val, digit) => val + digit.value,
            ""
          );

          // assign combined-digits (zero-padded) to "hundred" unit-place node
          curNode = assignHundredUnitPlaceNode(curNode, combined.padStart(3, "0"));
        }
      }
      else {
        // assign single digit (zero-padded) to "hundred" unit-place node
        curNode = assignHundredUnitPlaceNode(curNode, token.value.padStart(3, "0"));
      }
    }
    // stand-alone ten or double token?
    else if (token.type === "ten" || token.type === "double") {
      // append numbers to the decimal node?
      if (curNode.unit === "decimal") {
        curNode.value += token.value;
      }
      // literal/year form:
      //   * "seventeen nineteen"
      //   * "seventeen thirty"
      //   * "twenty fourteen"
      //   * "twenty fifty"
      else if (
        nextToken &&
        (nextToken.type === "ten" || nextToken.type === "double")
      ) {
        if (!curNode.unit) {
          curNode.unit = "thousand";
          curNode.value = token.value.slice(0, 1);
          curNode = curNode.and = {
            unit: "hundred",
            value: token.value.slice(1) + nextToken.value,
          };
          tokenIdx += 1; // lookahead: 1 spot
        } else {
          throw new Error(`Invalid! ${token.value}`);
        }
      }
      // ten/double followed by:
      //   * any 3 digits
      //   * '0' plus another digit
      else if (
        !curNode.unit &&
        nextToken &&
        nextToken.type === "digit" &&
        !nextToken.unit
      ) {
        const tokenN2 = tokens[tokenIdx + 2];
        const tokenN3 = tokens[tokenIdx + 3];

        // any 3 digits
        if (
          tokenN2 && tokenN2.type === "digit" &&
          tokenN3 && tokenN3.type === "digit"
        ) {
          curNode.unit = "thousand";
          curNode.value = token.value;
          curNode = curNode.and = {
            unit: "hundred",
            value: nextToken.value + tokenN2.value + tokenN3.value,
          };
          tokenIdx += 3; // lookahead: 3 spots
        }
        // '0' plus another digit
        else if (
          nextToken.value === "0" &&
          tokenN2 && tokenN2.type === "digit"
        ) {
          curNode.unit = "thousand";
          curNode.value = token.value.slice(0, 1);
          curNode = curNode.and = {
            unit: "hundred",
            value: token.value.slice(1) + nextToken.value + tokenN2.value,
          };
          tokenIdx += 2; // lookahead: 2 spots
        }
        else {
          throw new Error(`Invalid! ${token.value}`);
        }
      }
      // assumed "thousand" unit:
      //   * "thirteen nine forty two"
      //   * "thirty nine two o six"
      else if (
        !curNode.unit &&
        nextToken &&
        nextToken.type === "triple" &&
        !nextToken.unit
      ) {
        curNode.unit = "thousand";
        curNode.value = token.value;
        curNode = curNode.and = {
          unit: "hundred",
          value: nextToken.value.padStart(3, "0"),
        };
        tokenIdx += 1; // lookahead: 1 spot
      }
      else {
        // assign ten/double value (zero-padded) to "hundred" unit-place node
        curNode = assignHundredUnitPlaceNode(curNode, token.value.padStart(3, "0"));
      }
    }
    else if (token.type === "triple") {
      if (curNode.unit === "decimal") {
        curNode.value += token.value;
      } else {
        // assign triple value (zero-padded) to "hundred" unit-place node
        curNode = assignHundredUnitPlaceNode(curNode, token.value.padStart(3, "0"));
      }
    }
    else {
      // NOTE: should never get here
      throw new Error(`Invalid! ${token.type}`);
    }
  }

  // append missing AST nodes (if any)
  if (!["hundred", "decimal"].includes(curNode.unit)) {
    const [tree] = generateMissingUnitNodes(curNode.unit, "hundred");
    if (tree) {
      curNode.and = tree.and;
    } else {
      throw new Error(`Invalid! ${curNode.value}`);
    }
  }

  return ast;
}

/**
 * Stores `val` in the "hundred" unit-place node reachable from `curNode`,
 * creating intermediate placeholder nodes when needed.
 *
 * @param {Object} curNode - node the parser is currently positioned on
 * @param {string} val - zero-padded value for the "hundred" node
 * @returns {Object} the "hundred" node that received the value
 * @throws {Error} when "hundred" is unreachable or already has a value
 */
function assignHundredUnitPlaceNode(curNode, val) {
  let target = curNode;

  if (target.unit !== "hundred") {
    // current node is "empty", so we can assign it as the "hundred"
    // unit-place node
    if (!target.unit && !target.value) {
      target.unit = "hundred";
      target.value = val;
      return target;
    }

    // attempt to generate missing unit node(s)
    const [tree, leaf] = generateMissingUnitNodes(target.unit, "hundred");
    if (!tree) {
      throw new Error(`Invalid! ${val}`);
    }
    target.and = tree.and;
    target = leaf;
  }

  // the "hundred" node must still be an unfilled placeholder
  if (target.value) {
    throw new Error(`Invalid! ${val}`);
  }
  target.value = val;
  return target;
}

/**
 * Collects the token at `tokenIdx` plus the unit-less "digit" tokens that
 * immediately follow it.
 *
 * @param {Array<Object>} tokens - full token list
 * @param {number} tokenIdx - index of the first token to collect
 * @param {number} [limit=Number.MAX_SAFE_INTEGER] - maximum tokens returned
 * @returns {Array<Object>} collected tokens; always includes `tokens[tokenIdx]`
 */
function collectConsecutiveDigits(tokens, tokenIdx, limit = Number.MAX_SAFE_INTEGER) {
  const digitTokens = [tokens[tokenIdx]];

  for (
    let adjIdx = tokenIdx + 1;
    (
      adjIdx < tokens.length &&
      tokens[adjIdx].type === "digit" &&
      !tokens[adjIdx].unit &&
      digitTokens.length < limit
    );
    adjIdx++
  ) {
    digitTokens.push(tokens[adjIdx]);
  }

  return digitTokens;
}

/**
 * Builds a fresh chain of placeholder nodes from `curUnit` down to
 * `targetUnit` (both inclusive).
 *
 * @param {string} curUnit - unit-place to start from
 * @param {string} targetUnit - unit-place to end on
 * @returns {Array<Object>} `[tree, leaf]` (first and last node of the chain),
 *   or `[]` when `targetUnit` cannot be reached by walking down from `curUnit`
 */
function generateMissingUnitNodes(curUnit, targetUnit) {
  const tree = { unit: curUnit };
  let leaf = tree;
  let unit = curUnit;

  while (unit && unit !== targetUnit) {
    unit = nextUnit(unit);
    if (unit) {
      leaf = leaf.and = { unit };
    }
  }

  if (unit && unit === targetUnit) {
    return [tree, leaf];
  }
  return [];
}

/**
 * Returns the unit-place directly below `unit`.
 *
 * @param {string} unit - a unit-place name
 * @returns {string|undefined} the next lower unit-place, or `undefined` for
 *   "hundred" and for names that are not in `units`
 */
function nextUnit(unit) {
  const unitIdx = units.indexOf(unit);
  if (unitIdx > 0) {
    return units[unitIdx - 1];
  }
  return undefined;
}
To run:
// Usage examples. The trailing comment on each line is the expected result.
// One call per line: on a single line the first `//` would comment out
// every call after it.
convert("one hundred five"); // "105"
convert("six hundred and fifty three"); // "653"
convert("zero zero one two three"); // "123"
convert("twelve o three"); // "1203"
convert("thirteen zero nine"); // "1309"
convert("fifteen sixteen"); // "1516"
convert("fourteen ninety two"); // "1492"
convert("nineteen ten"); // "1910"
convert("twenty twenty"); // "2020" <---- ugh!
convert("twenty twenty one"); // "2021" <---- ehhh...
convert("twenty twenty two"); // "2022" <---- let's hope!
convert("four five two three eight"); // "45238"
convert("sixteen thousand three eighty four"); // "16384"
convert("seven billion six hundred eighty-one million"); // "7681000000"
convert("twenty three trillion and nine"); // "23000000000009"
convert("four billion two hundred nine thousand"); // "4000209000"
convert("nine hundred ninety nine quadrillion nine ninety nine trillion nine hundred and ninety nine billion nine ninety-nine million nine hundred ninety-nine thousand nine ninety nine"); // "999999999999999999"
convert("one two three four five six seven eight nine eight seven six five four three two one two three four five"); // "123456789876543212345"
convert("forty two point zero"); // "42.0"
convert("three point one four one five nine two six"); // "3.1415926"
convert("point"); // "0.0"
convert("four point zero o o o zero"); // "4.00000"

// With a group separator:
convert("sixty five thousand five thirty six",","); // "65,536"
convert("four billion two hundred nine thousand",","); // "4,000,209,000"
convert("forty two",","); // "42"
convert("twenty one twenty three",","); // "2,123"
convert("one two three four five six seven eight nine eight seven six five four three two one two three four five",","); // "123456,789,876,543,212,345" <---- not a mistake, quadrillion is the highest supported "place"
javascript/libs.1614180031.txt.gz · Last modified: 2024/11/17 12:59 (external edit)