rogs wiki

my wiki, quick and useful snippets of code

User Tools

Site Tools


Sidebar

javascript:libs

Useful JavaScript libs

Converting English number sentences to numeric digits

This piece of code converts English number sentences like “one hundred forty two point three” to their numeric representation: “142.3”.

This is just a copy & paste from a GitHub gist, saved here for archiving purposes. All credit goes to Kyle Simpson.

"use strict";

// Word -> digit-string lookup tables used by the tokenizer.
//
// The tables are created WITHOUT a prototype: the tokenizer tests membership
// with `word in table`, and on a plain object literal that test also succeeds
// for inherited keys such as "constructor", which would then be treated as a
// number word. A prototype-less object only ever contains the words listed.

// single digits ("o" is the spoken form of zero, as in "twelve o three")
var digits = Object.assign(Object.create(null),{
  "o": "0",
  "zero": "0",
  "one": "1",
  "two": "2",
  "three": "3",
  "four": "4",
  "five": "5",
  "six": "6",
  "seven": "7",
  "eight": "8",
  "nine": "9",
});

// ten through nineteen: complete two-digit values
var tens = Object.assign(Object.create(null),{
  "ten": "10",
  "eleven": "11",
  "twelve": "12",
  "thirteen": "13",
  "fourteen": "14",
  "fifteen": "15",
  "sixteen": "16",
  "seventeen": "17",
  "eighteen": "18",
  "nineteen": "19",
});

// multiples of ten: two-digit values whose trailing zero may still be
// replaced by a following single digit ("forty" + "two" -> "42")
var doubles = Object.assign(Object.create(null),{
  "twenty": "20",
  "thirty": "30",
  "forty": "40",
  "fifty": "50",
  "sixty": "60",
  "seventy": "70",
  "eighty": "80",
  "ninety": "90",
});

// unit-place names in ascending order; index 0 ("hundred") is the lowest
// place and the last entry is the highest place supported
var units = [
  "hundred",
  "thousand",
  "million",
  "billion",
  "trillion",
  "quadrillion",
];
 
/**
 * Converts an English number phrase (e.g. "one hundred forty two point three")
 * into its digit-string form (e.g. "142.3").
 *
 * @param {string} numstr - the phrase to convert; handed to `parse()` as-is
 * @param {string} [separator=""] - text inserted between unit-place groups
 *   (e.g. "," yields "65,536"); never inserted before the decimal part
 * @returns {string} the digits, without leading zeros; "0" is used when the
 *   integer part is empty (so a bare "point" yields "0.0")
 * @throws {Error} whatever `parse()` throws for an invalid phrase
 */
function convert(numstr,separator = "") {
  var groups = [];
  var fraction = "";

  // walk the linked list of unit-place nodes, highest place first
  for (let node = parse(numstr); node; node = node.and) {
    if (node.unit == "decimal") {
      // an empty decimal node (e.g. "four point") still prints one zero
      fraction = "." + (node.value || "0");
    }
    else {
      // placeholder nodes carry no value and stand for an all-zero group
      groups.push(node.value || "000");
    }
  }

  // normalize leading zeros BEFORE joining, so that an all-zero leading
  // group (e.g. "zero thousand five") cannot leave a dangling separator
  // at the start of the result
  var firstSignificantIdx = groups.findIndex(group => /[^0]/.test(group));
  var significant = (
    firstSignificantIdx == -1 ? [] : groups.slice(firstSignificantIdx)
  );
  if (significant.length > 0) {
    significant[0] = significant[0].replace(/^0+/,"");
  }

  return (significant.join(separator) || "0") + fraction;
}
 
/**
 * Parses an English number phrase into a singly linked list (called the
 * "AST" here) of unit-place nodes, ordered from the highest place to the
 * lowest.
 *
 * Each node is a plain object with:
 *   - `unit`:  an entry of `units` ("hundred", "thousand", ...) or "decimal"
 *   - `value`: a string of digits; absent on placeholder nodes that were
 *              created only to keep the chain of unit-places contiguous
 *   - `and`:   the next (lower) node, if any
 *
 * The work is done in two passes: STEP 1 turns the words into tokens
 * (`type` is one of "point", "digit", "ten", "double", "triple"; `complete`
 * records whether a later word may still be merged into the token; `unit`
 * is set once a unit word such as "thousand" is attached). STEP 2 walks the
 * tokens and assigns them to unit-place nodes.
 *
 * @param {string} numstr - the phrase; `.trim()` is called on it directly,
 *   so it must be a string
 * @returns {object} the root (highest unit-place) node
 * @throws {Error} with a message starting "Invalid! " when a word is not
 *   recognized, is not allowed at its position, or a unit-place would be
 *   assigned twice
 */
function parse(numstr) {
  // normalize: trim, drop every character that is not a hyphen, digit,
  // letter or whitespace, lowercase, then split on whitespace/hyphens and
  // discard empty pieces
  var words = numstr.trim().replace(/[^\-0-9a-z\s]+/ig,"").toLowerCase().split(/[\s\-]+/).filter(Boolean);

  // (STEP 1) tokenize the string
  var tokens = [];
  // set once "point"/"dot" has been seen; from then on unit words
  // ("hundred", "thousand", ...) and "and" are rejected
  var inDecimal = false;
  for (let word of words) {
    // most recently pushed token (undefined for the first word)
    let curToken = tokens[tokens.length - 1];

    if (word == "point" || word == "dot") {
      // close a still-open token; if it has no unit yet it is treated
      // as belonging to the "hundred" unit-place
      if (curToken && !curToken.complete) {
        if (!curToken.unit) {
          curToken.unit = "hundred";
        }
        curToken.complete = true;
      }

      // only one decimal point is allowed per phrase
      if (!inDecimal) {
        inDecimal = true;
        tokens.push({ type: "point", value: ".", complete: true, });
      }
      else {
        throw new Error("Invalid! " + word);
      }
    }
    // a zero is always its own digit token; it is never merged into a
    // preceding double/triple, it only closes that token
    else if (word == "o" || word == "zero") {
      if (curToken && !curToken.complete) {
        tokens.push({ type: "digit", value: "0", complete: true, });
        curToken.complete = true;
      }
      else {
        tokens.push({ type: "digit", value: "0", complete: true, });
      }
    }
    // NOTE(review): `in` also matches inherited keys when the lookup table
    // is a plain object literal (e.g. the word "constructor")
    else if (word in digits) {
      if (curToken && !curToken.complete) {
        // replace a trailing zero (from a double or hundred)?
        if (curToken.value.endsWith("0")) {
          curToken.value = curToken.value.slice(0,-1) + digits[word];
          curToken.complete = true;
        }
        else {
          tokens.push({ type: "digit", value: digits[word], complete: true, });
          curToken.complete = true;
        }
      }
      else {
        tokens.push({ type: "digit", value: digits[word], complete: true, });
      }
    }
    else if (word in tens) {
      if (curToken && !curToken.complete) {
        // replace two trailing zeros (from a hundred)?
        if (curToken.value.endsWith("00")) {
          curToken.value = curToken.value.slice(0,1) + tens[word];
          curToken.complete = true;
        }
        else {
          tokens.push({ type: "ten", value: tens[word], complete: true, });
          curToken.complete = true;
        }
      }
      // promote a single digit to a complete triple?
      else if (curToken && !curToken.unit && curToken.type == "digit") {
        curToken.type = "triple";
        curToken.value = curToken.value.slice(0,1) + tens[word];
      }
      else {
        tokens.push({ type: "ten", value: tens[word], complete: true, });
      }
    }
    else if (word in doubles) {
      if (curToken && !curToken.complete) {
        // replace two trailing zeros (from a triple)?
        if (curToken.value.endsWith("00")) {
          curToken.value = curToken.value.slice(0,1) + doubles[word];
          // NOTE: leave complete:false since a digit can complete a double
        }
        else {
          tokens.push({ type: "double", value: doubles[word], complete: false, });
          curToken.complete = true;
        }
      }
      // promote a single digit to an incomplete triple?
      else if (curToken && !curToken.unit && curToken.type == "digit") {
        curToken.type = "triple";
        curToken.value = curToken.value.slice(0,1) + doubles[word];
        curToken.complete = false;
      }
      else {
        tokens.push({ type: "double", value: doubles[word], complete: false, });
      }
    }
    // the remaining words (hundred, unit names, "and") are only accepted
    // before the decimal point
    else if (!inDecimal) {
      if (word == "hundred") {
        // a still-open token is closed and a fresh "100" triple started
        if (curToken && !curToken.complete) {
          curToken.complete = true;
          tokens.push({ type: "triple", value: "100", complete: false, });
        }
        // promote a single digit to an incomplete triple?
        else if (curToken && !curToken.unit && curToken.type == "digit") {
          curToken.type = "triple";
          curToken.value = curToken.value.slice(0,1) + "00";
          curToken.complete = false;
        }
        else {
          tokens.push({ type: "triple", value: "100", complete: false, });
        }
      }
      // thousand, million, etc
      else if (units.includes(word)) {
        // attach the unit to the preceding token; a bare unit word at the
        // very start of the phrase counts as "one <unit>"
        if (curToken) {
          curToken.unit = word;
          curToken.complete = true;
        }
        else {
          tokens.push({ type: "digit", unit: word, value: "1", complete: true, });
        }
      }
      // harmless conjunction word?
      else if (word == "and") {
        continue;
      }
      // unrecognized/invalid word
      else {
        throw new Error("Invalid! " + word);
      }
    }
    // word not allowed while tokenizing decimal values
    else {
      throw new Error("Invalid! " + word);
    }
  }

  // (STEP 2) parse the token list into an AST
  var ast = {};
  // node currently being filled; starts at the (still empty) root
  var curNode = ast;
  for (let tokenIdx = 0; tokenIdx < tokens.length; tokenIdx++) {
    let token = tokens[tokenIdx];
    let nextToken = tokens[tokenIdx + 1];

    // token indicates an assigned unit-place?
    if (token.unit) {
      // current node has no assigned unit-place?
      if (!curNode.unit) {
        curNode.unit = token.unit;
        // only the root keeps its value unpadded; every lower group is
        // zero-padded to three digits
        curNode.value = (
          curNode == ast ?
            token.value :
            token.value.padStart(3,"0")
        );
        let unit = nextUnit(token.unit);
        if (unit) {
          // create next placeholder node
          curNode = curNode.and = { unit, };
        }
      }
      // token unit same as current node?
      else if (token.unit == curNode.unit) {
        // current node is a placeholder that has not yet
        // been assigned a value from token?
        if (!curNode.value) {
          curNode.value = (
            curNode == ast ?
              token.value :
              token.value.padStart(3,"0")
          );
          let unit = nextUnit(token.unit);
          if (unit) {
            // create next placeholder node
            curNode = curNode.and = { unit, };
          }
        }
        else {
          throw new Error("Invalid! " + token.unit);
        }
      }
      // current node is different (higher?) unit place
      // than token?
      else {
        // attempt to generate missing unit node(s)
        let [ tree, leaf,] =
            generateMissingUnitNodes(curNode.unit,token.unit);
        if (tree) {
          // splice the generated chain below the current node; `tree`
          // itself duplicates the current unit and is discarded
          curNode.and = tree.and;
          curNode = leaf;
          curNode.value = token.value;
        }
        else {
          throw new Error("Invalid! " + token.unit);
        }
      }
    }
    // decimal point?
    else if (token.type == "point") {
      // current node has no unit-place assigned yet?
      if (!curNode.unit) {
        curNode.unit = "hundred";
        curNode = curNode.and = { unit: "decimal", value: "", };
      }
      else if (curNode.unit == "hundred") {
        curNode = curNode.and = { unit: "decimal", value: "", };
      }
      else {
        // attempt to generate missing unit-place node(s)
        let [ tree, leaf,] =
            generateMissingUnitNodes(curNode.unit,"hundred");
        if (tree) {
          curNode.and = tree.and;
          curNode = leaf;
          curNode = curNode.and = { unit: "decimal", value: "", };
        }
        else {
          throw new Error("Invalid! " + token.type);
        }
      }
    }
    // separate digit?
    else if (token.type == "digit") {
      // append digit to the decimal node?
      if (curNode.unit == "decimal") {
        // look-ahead to collect all consecutive digits, if any
        let digitTokens = collectConsecutiveDigits(tokens,tokenIdx);
        // skip the tokens consumed by the look-ahead
        tokenIdx += (digitTokens.length - 1);

        // add digit token(s) to current node
        for (let digit of digitTokens) {
          curNode.value = (curNode.value || "") + digit.value;
        }
      }
      // multiple adjacent (non-decimal) digits?
      else if (
        nextToken &&
        nextToken.type == "digit"
      ) {
        // current node is "empty", so we can implicitly
        // create arbitrary unit-place segment(s) from multiple
        // digits?
        if (!curNode.unit) {
          // look-ahead to collect all consecutive digits
          let digitTokens = collectConsecutiveDigits(tokens,tokenIdx);
          tokenIdx += (digitTokens.length - 1);

          // skip any leading zeros (since we're at the
          // start of the number)
          let firstNonZeroDigitIdx = digitTokens.findIndex(digit => digit.value != "0");
          if (firstNonZeroDigitIdx > 0) {
            digitTokens = digitTokens.slice(firstNonZeroDigitIdx);
          }

          // any digits remain to be added to the AST?
          if (digitTokens.length > 0) {
            // determine how many unit-place groups are needed
            let numGroups = Math.ceil(
              Math.min(digitTokens.length,units.length * 3) / 3
            );

            // determine number of digits in first group
            // (when there are more digits than the supported unit-places
            // can hold, the surplus digits all go into the first group)
            let groupSize = (
              digitTokens.length > (units.length * 3) ?
                digitTokens.length - (units.length * 3) + 3 :
                digitTokens.length % 3 || 3
            );

            // create the necessary unit-place nodes in the AST
            let [ tree, leaf ] = generateMissingUnitNodes(
              units[
                Math.min(units.length - 1,numGroups - 1)
              ],
              "hundred"
            );
            if (tree) {
              curNode.unit = tree.unit;
              curNode.value = "";
              if (tree.and) {
                curNode.and = tree.and;
              }
              // fill in the unit-place groups to the AST
              do {
                // collect a group of digits into current node
                let digitGroup = digitTokens.slice(0,groupSize);
                digitTokens = digitTokens.slice(groupSize);
                curNode.value = digitGroup.reduce((val,digit) => val + digit.value,"");

                // more digits to add as a unit-place group?
                if (curNode.and && digitTokens.length > 0) {
                  curNode = curNode.and;
                  // from here forward, all digit groups are
                  // fixed size of 3
                  groupSize = 3;
                }
              }
              // keep going while digits remain to be grouped
              while (digitTokens.length > 0);
            }
          }
          else {
            // NOTE: should never get here
            throw new Error("Invalid! " + token.value);
          }
        }
        else {
          // look-ahead to collect up to 3 consecutive digits
          let digitTokens =
              collectConsecutiveDigits(tokens,tokenIdx,/*limit=*/3);
          tokenIdx += (digitTokens.length - 1);

          // combine digits into a single value
          let val = digitTokens.reduce((val,digit) => val + digit.value,"");

          // assign combined-digits to "hundred" unit-place node
          curNode = assignHundredUnitPlaceNode(
            curNode,
            // zero-pad the value
            val.padStart(3,"0")
          );
        }
      }
      else {
        // assign single digit to "hundred" unit-place node
        curNode = assignHundredUnitPlaceNode(
          curNode,
          // zero-pad the value
          token.value.padStart(3,"0")
        );
      }
    }
    // stand-alone ten or double token?
    else if (token.type == "ten" || token.type == "double") {
      // append numbers to the decimal node?
      if (curNode.unit == "decimal") {
        curNode.value += token.value;
      }
      // literal/year form:
      //   * "seventeen nineteen"
      //   * "seventeen thirty"
      //   * "twenty fourteen"
      //   * "twenty fifty"
      else if (
        nextToken &&
        (nextToken.type == "ten" || nextToken.type == "double")
      ) {
        if (!curNode.unit) {
          // first digit of the pair becomes the thousands group; the
          // remaining three digits become the hundreds group
          curNode.unit = "thousand";
          curNode.value = token.value.slice(0,1);
          curNode = curNode.and = {
            unit: "hundred",
            value: token.value.slice(1) + nextToken.value,
          };
          tokenIdx += 1;  // lookahead: 1 spot
        }
        else {
          throw new Error("Invalid! " + token.value);
        }
      }
      // ten/double followed by:
      //   * any 3 digits
      //   * '0' plus another digit
      else if (
        !curNode.unit &&
        nextToken &&
        nextToken.type == "digit" &&
        !nextToken.unit
      ) {
        let tokenN2 = tokens[tokenIdx + 2];
        let tokenN3 = tokens[tokenIdx + 3];

        // any 3 digits
        if (
          tokenN2 &&
          tokenN2.type == "digit" &&
          tokenN3 &&
          tokenN3.type == "digit"
        ) {
          curNode.unit = "thousand";
          curNode.value = token.value;
          curNode = curNode.and = {
            unit: "hundred",
            value: nextToken.value + tokenN2.value + tokenN3.value,
          };
          tokenIdx += 3;  // lookahead: 3 spots
        }
        // '0' plus another digit
        else if (
          nextToken.value == "0" &&
          tokenN2 &&
          tokenN2.type == "digit"
        ) {
          curNode.unit = "thousand";
          curNode.value = token.value.slice(0,1);
          curNode = curNode.and = {
            unit: "hundred",
            value: token.value.slice(1) + nextToken.value + tokenN2.value,
          };
          tokenIdx += 2;  // lookahead: 2 spots
        }
        else {
          throw new Error("Invalid! " + token.value);
        }
      }
      // assumed "thousand" unit:
      //   * "thirteen nine forty two"
      //   * "thirty nine two o six"
      else if (
        !curNode.unit &&
        nextToken &&
        nextToken.type == "triple" &&
        !nextToken.unit
      ) {
        curNode.unit = "thousand";
        curNode.value = token.value;
        curNode = curNode.and = {
          unit: "hundred",
          value: nextToken.value.padStart(3,"0"),
        };
        tokenIdx += 1;  // lookahead: 1 spot
      }
      else {
        // assign ten/double value to "hundred" unit-place node
        curNode = assignHundredUnitPlaceNode(
          curNode,
          // zero-pad the value
          token.value.padStart(3,"0")
        );
      }
    }
    else if (token.type == "triple") {
      // after the decimal point the triple's digits are appended verbatim
      if (curNode.unit == "decimal") {
        curNode.value += token.value;
      }
      else {
        // assign triple value to "hundred" unit-place node
        curNode = assignHundredUnitPlaceNode(
          curNode,
          // zero-pad the value
          token.value.padStart(3,"0")
        );
      }
    }
    else {
      // NOTE: should never get here
      throw new Error("Invalid! " + token.type);
    }
  }

  // append missing AST nodes (if any)
  // (the chain must run all the way down to "hundred" unless it already
  // ends in the decimal node; this also rejects an empty token list, whose
  // root node never received a unit)
  if (![ "hundred", "decimal" ].includes(curNode.unit)) {
    let [ tree ] = generateMissingUnitNodes(curNode.unit,"hundred");
    if (tree) {
      curNode.and = tree.and;
    }
    else {
      throw new Error("Invalid! " + curNode.value);
    }
  }

  return ast;
}
 
/**
 * Stores `val` in the "hundred" unit-place node that belongs to (or follows)
 * `curNode`, creating any intermediate placeholder nodes on the way down.
 *
 * @param {object} curNode - node to start from; mutated in place
 * @param {string} val - digit string to store
 * @returns {object} the "hundred" node that received `val`
 * @throws {Error} "Invalid! <val>" when no path down to "hundred" exists or
 *   the "hundred" node already holds a value
 */
function assignHundredUnitPlaceNode(curNode,val) {
  var target = curNode;

  if (target.unit != "hundred") {
    // a completely blank node simply becomes the "hundred" node itself
    if (!target.unit && !target.value) {
      target.unit = "hundred";
      target.value = val;
      return target;
    }

    // otherwise extend the chain from the node's unit down to "hundred"
    let [ chain, hundredNode ] =
        generateMissingUnitNodes(target.unit,"hundred");
    if (!chain) {
      throw new Error("Invalid! " + val);
    }
    target.and = chain.and;
    target = hundredNode;
  }

  // the "hundred" node may only be filled once
  if (target.value) {
    throw new Error("Invalid! " + val);
  }

  target.value = val;
  return target;
}
 
/**
 * Gathers the token at `tokenIdx` together with the unbroken run of plain
 * digit tokens that directly follows it.
 *
 * The starting token is always included, whatever it is. The run stops at
 * the first following token that is not of type "digit", that carries a
 * `unit`, or once `limit` tokens have been gathered.
 *
 * @param {object[]} tokens - the full token list
 * @param {number} tokenIdx - index of the first token of the run
 * @param {number} [limit=Number.MAX_SAFE_INTEGER] - maximum run length
 * @returns {object[]} the collected tokens (same object references)
 */
function collectConsecutiveDigits(tokens,tokenIdx,limit = Number.MAX_SAFE_INTEGER) {
  var run = [ tokens[tokenIdx] ];
  var cursor = tokenIdx + 1;

  while (cursor < tokens.length && run.length < limit) {
    let candidate = tokens[cursor];
    if (candidate.type != "digit" || candidate.unit) {
      break;
    }
    run.push(candidate);
    cursor++;
  }

  return run;
}
 
/**
 * Builds a fresh chain of value-less nodes that starts at `curUnit` and
 * steps down one unit-place at a time (via `nextUnit`) until `targetUnit`.
 *
 * @param {string} curUnit - unit-place of the first node of the chain
 * @param {string} targetUnit - unit-place of the last node of the chain
 * @returns {Array} `[ first, last ]` nodes of the chain (the same object
 *   when both units are equal), or `[]` when `targetUnit` cannot be reached
 *   by stepping down from `curUnit`
 */
function generateMissingUnitNodes(curUnit,targetUnit) {
  var head = { unit: curUnit };
  var tail = head;
  var place = curUnit;

  // descend until the target is hit or the unit-places run out
  while (place && place != targetUnit) {
    place = nextUnit(place);
    if (place) {
      tail.and = { unit: place };
      tail = tail.and;
    }
  }

  // a falsy `place` means the descent ran out before reaching the target
  if (!place) {
    return [];
  }

  return [ head, tail ];
}
 
/**
 * Looks up the unit-place directly below `unit` in the `units` list.
 *
 * @param {string} unit - a unit-place name
 * @returns {string|undefined} the next lower unit-place, or `undefined` when
 *   `unit` is the lowest entry of `units` or is not in the list at all
 */
function nextUnit(unit) {
  var position = units.indexOf(unit);
  return position > 0 ? units[position - 1] : undefined;
}

To run:

// --- whole numbers (result shown after "=>") ---
convert("one hundred five");                  // => "105"
convert("six hundred and fifty three");       // => "653"
convert("zero zero one two three");           // => "123"
convert("twelve o three");                    // => "1203"
convert("thirteen zero nine");                // => "1309"
convert("fifteen sixteen");                   // => "1516"
convert("fourteen ninety two");               // => "1492"
convert("nineteen ten");                      // => "1910"
convert("twenty twenty");                     // => "2020"  (ugh!)
convert("twenty twenty one");                 // => "2021"  (ehhh...)
convert("twenty twenty two");                 // => "2022"  (let's hope!)
convert("four five two three eight");         // => "45238"
convert("sixteen thousand three eighty four");  // => "16384"
convert("seven billion six hundred eighty-one million");  // => "7681000000"
convert("twenty three trillion and nine");    // => "23000000000009"
convert("four billion two hundred nine thousand");  // => "4000209000"
convert("nine hundred ninety nine quadrillion nine ninety nine trillion nine hundred and ninety nine billion nine ninety-nine million nine hundred ninety-nine thousand nine ninety nine");  // => "999999999999999999"
convert("one two three four five six seven eight nine eight seven six five four three two one two three four five");  // => "123456789876543212345"

// --- decimals ---
convert("forty two point zero");              // => "42.0"
convert("three point one four one five nine two six");  // => "3.1415926"
convert("point");                             // => "0.0"
convert("four point zero o o o zero");        // => "4.00000"

// --- with a group separator (second argument) ---
convert("sixty five thousand five thirty six",",");  // => "65,536"
convert("four billion two hundred nine thousand",",");  // => "4,000,209,000"
convert("forty two",",");                     // => "42"
convert("twenty one twenty three",",");       // => "2,123"
// the long first group below is not a mistake: quadrillion is the highest
// supported "place", so all surplus digits land in the first group
convert("one two three four five six seven eight nine eight seven six five four three two one two three four five",",");  // => "123456,789,876,543,212,345"
javascript/libs.txt · Last modified: 2021/02/24 12:21 by roger