User Tools

Site Tools


javascript:libs

This is an old revision of the document!


Javascript useful libs

Converting English number sentences to numeric digits

This piece of code converts English number sentences like “one hundred forty two point three” to their numeric representation: “142.3”.

This is just a copy & paste from a GitHub gist, saved here for archiving purposes. All credits go to Kyle Simpson.

"use strict";
 
// Lookup tables mapping number words to their digit strings.
//
// Each table is created without a prototype so that membership tests such as
// `word in digits` only ever match the words listed here. On an ordinary
// object literal the same test also succeeds for inherited names like
// "constructor", which would then be treated as a number word.

// single-digit words ("o" is accepted as a spoken zero)
var digits = Object.assign(Object.create(null),{
  "o": "0",
  "zero": "0",
  "one": "1",
  "two": "2",
  "three": "3",
  "four": "4",
  "five": "5",
  "six": "6",
  "seven": "7",
  "eight": "8",
  "nine": "9",
});

// ten through nineteen: two-digit words that cannot take a trailing digit word
var tens = Object.assign(Object.create(null),{
  "ten": "10",
  "eleven": "11",
  "twelve": "12",
  "thirteen": "13",
  "fourteen": "14",
  "fifteen": "15",
  "sixteen": "16",
  "seventeen": "17",
  "eighteen": "18",
  "nineteen": "19",
});

// multiples of ten: the trailing "0" may be replaced by a following digit word
var doubles = Object.assign(Object.create(null),{
  "twenty": "20",
  "thirty": "30",
  "forty": "40",
  "fifty": "50",
  "sixty": "60",
  "seventy": "70",
  "eighty": "80",
  "ninety": "90",
});

// unit-place names ordered from lowest to highest place; the index order is
// significant because nextUnit() steps down this list one entry at a time
var units = [
  "hundred",
  "thousand",
  "million",
  "billion",
  "trillion",
  "quadrillion",
];
 
/**
 * Convert an English number phrase into its digit-string form.
 *
 * The phrase is handed to `parse()`, which returns a chain of unit-place
 * nodes linked through `and` (highest place first, optionally ending in a
 * "decimal" node). Each non-decimal node contributes one group of digits; a
 * node without a value contributes "000".
 *
 * @param {string} numstr - phrase to convert, e.g. "one hundred five"
 * @param {string} [separator=""] - text inserted between unit-place groups,
 *   e.g. "," to get "65,536"
 * @returns {string} the digits, with leading zeros removed ("0" if nothing
 *   remains) and, when the phrase has a decimal point, a "." plus the
 *   fractional digits ("0" if none were given)
 * @throws {Error} whatever `parse()` throws for a phrase it rejects
 */
function convert(numstr,separator = "") {
  const groups = [];
  let fraction = "";

  for (let node = parse(numstr); node; node = node.and) {
    if (node.unit === "decimal") {
      fraction = "." + (node.value || "0");
    }
    else {
      groups.push(node.value || "000");
    }
  }

  // normalize leading zeros BEFORE joining: stripping them from the joined
  // string would leave a dangling separator whenever the leading group is
  // all zeros (e.g. "0" + "," + "005" --> ",005")
  const firstSignificantIdx = groups.findIndex(group => /[^0]/.test(group));
  const significantGroups = (
    firstSignificantIdx === -1 ? [] : groups.slice(firstSignificantIdx)
  );
  if (significantGroups.length > 0) {
    significantGroups[0] = significantGroups[0].replace(/^0+/,"");
  }

  return (significantGroups.join(separator) || "0") + fraction;
}
 
/**
 * Parse an English number phrase into a linked chain of unit-place nodes.
 *
 * The phrase is trimmed, stripped of every character other than ASCII
 * letters, digits, hyphens and whitespace, lowercased, and split on runs of
 * whitespace/hyphens. The words are then processed in two passes:
 *
 *   1. tokenize: fold the words into tokens shaped like
 *      `{ type, value, complete, unit? }`, where `type` is one of "digit",
 *      "ten", "double", "triple" or "point"; `complete: false` marks a token
 *      whose trailing zero(s) a following word may still replace
 *   2. build: fold the tokens into nodes shaped like `{ unit, value?, and? }`,
 *      where `and` links to the next-lower unit place (or to a final
 *      "decimal" node); placeholder nodes carry no `value`
 *
 * @param {string} numstr - phrase to parse, e.g. "sixteen thousand three eighty four"
 * @returns {object} head node of the chain (highest unit place first)
 * @throws {Error} "Invalid! ..." for an unrecognized word, a second
 *   "point"/"dot", any word other than a digit/ten/double word after the
 *   decimal point, or a token that cannot be placed in the chain
 */
function parse(numstr) {
  // own-property lookup: a bare `word in table` test also matches inherited
  // keys (e.g. "constructor") when the table is an ordinary object
  var hasWord = (table,word) => Object.prototype.hasOwnProperty.call(table,word);

  var words = numstr.trim().replace(/[^\-0-9a-z\s]+/ig,"").toLowerCase().split(/[\s\-]+/).filter(Boolean);

  // (STEP 1) tokenize the string
  var tokens = [];
  var inDecimal = false;
  for (let word of words) {
    let curToken = tokens[tokens.length - 1];

    if (word == "point" || word == "dot") {
      // close any pending token; one that never received a unit is pinned
      // to the "hundred" place
      if (curToken && !curToken.complete) {
        if (!curToken.unit) {
          curToken.unit = "hundred";
        }
        curToken.complete = true;
      }

      if (!inDecimal) {
        inDecimal = true;
        tokens.push({ type: "point", value: ".", complete: true, });
      }
      else {
        throw new Error("Invalid! " + word);
      }
    }
    else if (word == "o" || word == "zero") {
      // a zero never merges into a pending token; it only closes it
      if (curToken && !curToken.complete) {
        tokens.push({ type: "digit", value: "0", complete: true, });
        curToken.complete = true;
      }
      else {
        tokens.push({ type: "digit", value: "0", complete: true, });
      }
    }
    else if (hasWord(digits,word)) {
      if (curToken && !curToken.complete) {
        // replace a trailing zero (from a double or hundred)?
        if (curToken.value.endsWith("0")) {
          curToken.value = curToken.value.slice(0,-1) + digits[word];
          curToken.complete = true;
        }
        else {
          tokens.push({ type: "digit", value: digits[word], complete: true, });
          curToken.complete = true;
        }
      }
      else {
        tokens.push({ type: "digit", value: digits[word], complete: true, });
      }
    }
    else if (hasWord(tens,word)) {
      if (curToken && !curToken.complete) {
        // replace two trailing zeros (from a hundred)?
        if (curToken.value.endsWith("00")) {
          curToken.value = curToken.value.slice(0,1) + tens[word];
          curToken.complete = true;
        }
        else {
          tokens.push({ type: "ten", value: tens[word], complete: true, });
          curToken.complete = true;
        }
      }
      // promote a single digit to a complete triple?
      else if (curToken && !curToken.unit && curToken.type == "digit") {
        curToken.type = "triple";
        curToken.value = curToken.value.slice(0,1) + tens[word];
      }
      else {
        tokens.push({ type: "ten", value: tens[word], complete: true, });
      }
    }
    else if (hasWord(doubles,word)) {
      if (curToken && !curToken.complete) {
        // replace two trailing zeros (from a triple)?
        if (curToken.value.endsWith("00")) {
          curToken.value = curToken.value.slice(0,1) + doubles[word];
          // NOTE: leave complete:false since a digit can complete a double
        }
        else {
          tokens.push({ type: "double", value: doubles[word], complete: false, });
          curToken.complete = true;
        }
      }
      // promote a single digit to an incomplete triple?
      else if (curToken && !curToken.unit && curToken.type == "digit") {
        curToken.type = "triple";
        curToken.value = curToken.value.slice(0,1) + doubles[word];
        curToken.complete = false;
      }
      else {
        tokens.push({ type: "double", value: doubles[word], complete: false, });
      }
    }
    else if (!inDecimal) {
      if (word == "hundred") {
        if (curToken && !curToken.complete) {
          curToken.complete = true;
          tokens.push({ type: "triple", value: "100", complete: false, });
        }
        // promote a single digit to an incomplete triple?
        else if (curToken && !curToken.unit && curToken.type == "digit") {
          curToken.type = "triple";
          curToken.value = curToken.value.slice(0,1) + "00";
          curToken.complete = false;
        }
        else {
          tokens.push({ type: "triple", value: "100", complete: false, });
        }
      }
      // thousand, million, etc
      else if (units.includes(word)) {
        if (curToken) {
          curToken.unit = word;
          curToken.complete = true;
        }
        else {
          // a bare leading unit word counts as one of that unit
          tokens.push({ type: "digit", unit: word, value: "1", complete: true, });
        }
      }
      // harmless conjunction word?
      else if (word == "and") {
        continue;
      }
      // unrecognized/invalid word
      else {
        throw new Error("Invalid! " + word);
      }
    }
    // word not allowed while tokenizing decimal values
    else {
      throw new Error("Invalid! " + word);
    }
  }

  // (STEP 2) parse the token list into an AST
  var ast = {};
  var curNode = ast;
  for (let tokenIdx = 0; tokenIdx < tokens.length; tokenIdx++) {
    let token = tokens[tokenIdx];
    let nextToken = tokens[tokenIdx + 1];

    // token indicates an assigned unit-place?
    if (token.unit) {
      // current node has no assigned unit-place?
      if (!curNode.unit) {
        curNode.unit = token.unit;
        // only the head node keeps its value unpadded; every lower group
        // must be exactly three digits wide
        curNode.value = (
          curNode == ast ?
            token.value :
            token.value.padStart(3,"0")
        );
        let unit = nextUnit(token.unit);
        if (unit) {
          // create next placeholder node
          curNode = curNode.and = { unit, };
        }
      }
      // token unit same as current node?
      else if (token.unit == curNode.unit) {
        // current node is a placeholder that has not yet
        // been assigned a value from token?
        if (!curNode.value) {
          curNode.value = (
            curNode == ast ?
              token.value :
              token.value.padStart(3,"0")
          );
          let unit = nextUnit(token.unit);
          if (unit) {
            // create next placeholder node
            curNode = curNode.and = { unit, };
          }
        }
        else {
          throw new Error("Invalid! " + token.unit);
        }
      }
      // current node is different (higher?) unit place
      // than token?
      else {
        // attempt to generate missing unit node(s)
        let [ tree, leaf,] =
            generateMissingUnitNodes(curNode.unit,token.unit);
        if (tree) {
          curNode.and = tree.and;
          curNode = leaf;
          // the leaf is always below the head node here, so its group must
          // be zero-padded like every other non-leading group; otherwise
          // "one billion five thousand" would render as "1" "000" "5" "000"
          curNode.value = token.value.padStart(3,"0");
        }
        else {
          throw new Error("Invalid! " + token.unit);
        }
      }
    }
    // decimal point?
    else if (token.type == "point") {
      // current node has no unit-place assigned yet?
      if (!curNode.unit) {
        curNode.unit = "hundred";
        curNode = curNode.and = { unit: "decimal", value: "", };
      }
      else if (curNode.unit == "hundred") {
        curNode = curNode.and = { unit: "decimal", value: "", };
      }
      else {
        // attempt to generate missing unit-place node(s)
        let [ tree, leaf,] =
            generateMissingUnitNodes(curNode.unit,"hundred");
        if (tree) {
          curNode.and = tree.and;
          curNode = leaf;
          curNode = curNode.and = { unit: "decimal", value: "", };
        }
        else {
          throw new Error("Invalid! " + token.type);
        }
      }
    }
    // separate digit?
    else if (token.type == "digit") {
      // append digit to the decimal node?
      if (curNode.unit == "decimal") {
        // look-ahead to collect all consecutive digits, if any
        let digitTokens = collectConsecutiveDigits(tokens,tokenIdx);
        tokenIdx += (digitTokens.length - 1);

        // add digit token(s) to current node
        for (let digit of digitTokens) {
          curNode.value = (curNode.value || "") + digit.value;
        }
      }
      // multiple adjacent (non-decimal) digits?
      else if (
        nextToken &&
        nextToken.type == "digit"
      ) {
        // current node is "empty", so we can implicitly
        // create arbitrary unit-place segment(s) from multiple
        // digits?
        if (!curNode.unit) {
          // look-ahead to collect all consecutive digits
          let digitTokens = collectConsecutiveDigits(tokens,tokenIdx);
          tokenIdx += (digitTokens.length - 1);

          // skip any leading zeros (since we're at the
          // start of the number)
          let firstNonZeroDigitIdx = digitTokens.findIndex(digit => digit.value != "0");
          if (firstNonZeroDigitIdx > 0) {
            digitTokens = digitTokens.slice(firstNonZeroDigitIdx);
          }

          // any digits remain to be added to the AST?
          if (digitTokens.length > 0) {
            // determine how many unit-place groups are needed
            let numGroups = Math.ceil(
              Math.min(digitTokens.length,units.length * 3) / 3
            );

            // determine number of digits in first group
            // (digits beyond the highest supported unit place all land in
            // the first group, which may then be wider than three)
            let groupSize = (
              digitTokens.length > (units.length * 3) ?
                digitTokens.length - (units.length * 3) + 3 :
                digitTokens.length % 3 || 3
            );

            // create the necessary unit-place nodes in the AST
            let [ tree, leaf ] = generateMissingUnitNodes(
              units[
                Math.min(units.length - 1,numGroups - 1)
              ],
              "hundred"
            );
            if (tree) {
              curNode.unit = tree.unit;
              curNode.value = "";
              if (tree.and) {
                curNode.and = tree.and;
              }
              // fill in the unit-place groups to the AST
              do {
                // collect a group of digits into current node
                let digitGroup = digitTokens.slice(0,groupSize);
                digitTokens = digitTokens.slice(groupSize);
                curNode.value = digitGroup.reduce((val,digit) => val + digit.value,"");

                // more digits to add as a unit-place group?
                if (curNode.and && digitTokens.length > 0) {
                  curNode = curNode.and;
                  // from here forward, all digit groups are
                  // fixed size of 3
                  groupSize = 3;
                }
              }
              // keep going while digits remain to be grouped
              while (digitTokens.length > 0);
            }
          }
          else {
            // NOTE: should never get here
            throw new Error("Invalid! " + token.value);
          }
        }
        else {
          // look-ahead to collect up to 3 consecutive digits
          let digitTokens =
              collectConsecutiveDigits(tokens,tokenIdx,/*limit=*/3);
          tokenIdx += (digitTokens.length - 1);

          // combine digits into a single value
          let val = digitTokens.reduce((val,digit) => val + digit.value,"");

          // assign combined-digits to "hundred" unit-place node
          curNode = assignHundredUnitPlaceNode(
            curNode,
            // zero-pad the value
            val.padStart(3,"0")
          );
        }
      }
      else {
        // assign single digit to "hundred" unit-place node
        curNode = assignHundredUnitPlaceNode(
          curNode,
          // zero-pad the value
          token.value.padStart(3,"0")
        );
      }
    }
    // stand-alone ten or double token?
    else if (token.type == "ten" || token.type == "double") {
      // append numbers to the decimal node?
      if (curNode.unit == "decimal") {
        curNode.value += token.value;
      }
      // literal/year form:
      //   * "seventeen nineteen"
      //   * "seventeen thirty"
      //   * "twenty fourteen"
      //   * "twenty fifty"
      else if (
        nextToken &&
        (nextToken.type == "ten" || nextToken.type == "double")
      ) {
        if (!curNode.unit) {
          // split the first pair across the thousand/hundred boundary:
          // its first digit is the thousands, its second leads the hundreds
          curNode.unit = "thousand";
          curNode.value = token.value.slice(0,1);
          curNode = curNode.and = {
            unit: "hundred",
            value: token.value.slice(1) + nextToken.value,
          };
          tokenIdx += 1;  // lookahead: 1 spot
        }
        else {
          throw new Error("Invalid! " + token.value);
        }
      }
      // ten/double followed by:
      //   * any 3 digits
      //   * '0' plus another digit
      else if (
        !curNode.unit &&
        nextToken &&
        nextToken.type == "digit" &&
        !nextToken.unit
      ) {
        let tokenN2 = tokens[tokenIdx + 2];
        let tokenN3 = tokens[tokenIdx + 3];

        // any 3 digits
        if (
          tokenN2 &&
          tokenN2.type == "digit" &&
          tokenN3 &&
          tokenN3.type == "digit"
        ) {
          curNode.unit = "thousand";
          curNode.value = token.value;
          curNode = curNode.and = {
            unit: "hundred",
            value: nextToken.value + tokenN2.value + tokenN3.value,
          };
          tokenIdx += 3;  // lookahead: 3 spots
        }
        // '0' plus another digit
        else if (
          nextToken.value == "0" &&
          tokenN2 &&
          tokenN2.type == "digit"
        ) {
          curNode.unit = "thousand";
          curNode.value = token.value.slice(0,1);
          curNode = curNode.and = {
            unit: "hundred",
            value: token.value.slice(1) + nextToken.value + tokenN2.value,
          };
          tokenIdx += 2;  // lookahead: 2 spots
        }
        else {
          throw new Error("Invalid! " + token.value);
        }
      }
      // assumed "thousand" unit:
      //   * "thirteen nine forty two"
      //   * "thirty nine two o six"
      else if (
        !curNode.unit &&
        nextToken &&
        nextToken.type == "triple" &&
        !nextToken.unit
      ) {
        curNode.unit = "thousand";
        curNode.value = token.value;
        curNode = curNode.and = {
          unit: "hundred",
          value: nextToken.value.padStart(3,"0"),
        };
        tokenIdx += 1;  // lookahead: 1 spot
      }
      else {
        // assign ten/double value to "hundred" unit-place node
        curNode = assignHundredUnitPlaceNode(
          curNode,
          // zero-pad the value
          token.value.padStart(3,"0")
        );
      }
    }
    else if (token.type == "triple") {
      if (curNode.unit == "decimal") {
        curNode.value += token.value;
      }
      else {
        // assign triple value to "hundred" unit-place node
        curNode = assignHundredUnitPlaceNode(
          curNode,
          // zero-pad the value
          token.value.padStart(3,"0")
        );
      }
    }
    else {
      // NOTE: should never get here
      throw new Error("Invalid! " + token.type);
    }
  }

  // append missing AST nodes (if any)
  if (![ "hundred", "decimal" ].includes(curNode.unit)) {
    let [ tree ] = generateMissingUnitNodes(curNode.unit,"hundred");
    if (tree) {
      curNode.and = tree.and;
    }
    else {
      throw new Error("Invalid! " + curNode.value);
    }
  }

  return ast;
}
 
/**
 * Store `val` in the "hundred" unit-place node reachable from `curNode`.
 *
 * - a node with neither unit nor value becomes the "hundred" node itself
 * - a node at another unit place gets the missing lower nodes linked in
 *   (via generateMissingUnitNodes) and the new "hundred" leaf receives `val`
 * - a "hundred" node that already holds a value is rejected
 *
 * @param {object} curNode - node to start from; may be mutated
 * @param {string} val - digit group to store
 * @returns {object} the "hundred" node that received `val`
 * @throws {Error} "Invalid! <val>" when no path down to "hundred" exists or
 *   the "hundred" node is already filled
 */
function assignHundredUnitPlaceNode(curNode,val) {
  let target = curNode;

  if (target.unit !== "hundred") {
    const isBlank = !target.unit && !target.value;
    if (isBlank) {
      // claim the empty node; its value is filled in below
      target.unit = "hundred";
    }
    else {
      // link in whatever unit places lie between here and "hundred"
      const [ chain, tail ] = generateMissingUnitNodes(target.unit,"hundred");
      if (!chain) {
        throw new Error("Invalid! " + val);
      }
      target.and = chain.and;
      target = tail;
    }
  }

  // the "hundred" slot may only be filled once
  if (target.value) {
    throw new Error("Invalid! " + val);
  }
  target.value = val;
  return target;
}
 
/**
 * Gather the token at `tokenIdx` plus the run of unit-less "digit" tokens
 * that immediately follows it.
 *
 * The starting token is always included without being inspected; only the
 * tokens after it are checked.
 *
 * @param {object[]} tokens - full token list
 * @param {number} tokenIdx - index of the first token of the run
 * @param {number} [limit=Number.MAX_SAFE_INTEGER] - maximum run length
 *   (the starting token counts toward it)
 * @returns {object[]} the collected tokens, in order
 */
function collectConsecutiveDigits(tokens,tokenIdx,limit = Number.MAX_SAFE_INTEGER) {
  const run = [ tokens[tokenIdx] ];

  let cursor = tokenIdx + 1;
  while (cursor < tokens.length && run.length < limit) {
    const candidate = tokens[cursor];
    // a non-digit, or a digit that carries a unit, ends the run
    if (candidate.type !== "digit" || candidate.unit) {
      break;
    }
    run.push(candidate);
    cursor += 1;
  }

  return run;
}
 
/**
 * Build a fresh chain of valueless nodes from `curUnit` down to `targetUnit`,
 * one node per unit place, linked through `and`.
 *
 * @param {string} curUnit - unit place to start from
 * @param {string} targetUnit - unit place to end on
 * @returns {Array} `[ head, tail ]` of the new chain (the same object when
 *   both units are equal), or `[]` when `targetUnit` cannot be reached by
 *   stepping down with nextUnit()
 */
function generateMissingUnitNodes(curUnit,targetUnit) {
  const head = { unit: curUnit };
  let tail = head;
  let place = curUnit;

  for (;;) {
    // ran off the bottom of the unit list (or started with no unit)
    if (!place) {
      return [];
    }
    if (place === targetUnit) {
      return [ head, tail ];
    }

    place = nextUnit(place);
    if (place) {
      tail.and = { unit: place };
      tail = tail.and;
    }
  }
}
 
/**
 * Look up the unit place directly below `unit` in the `units` list.
 *
 * @param {string} unit - unit-place name
 * @returns {string|undefined} the entry preceding `unit` in `units`, or
 *   undefined when `unit` is the first entry or is not listed
 */
function nextUnit(unit) {
  const position = units.indexOf(unit);
  return position >= 1 ? units[position - 1] : undefined;
}

To run:

// Usage examples. The trailing comment on each call is the result the
// original author reports for it. The second argument, when given, is the
// separator placed between unit-place groups.
convert("one hundred five");  // "105"
convert("six hundred and fifty three");  // "653"
convert("zero zero one two three");  // "123"
convert("twelve o three");  // "1203"
convert("thirteen zero nine");  // "1309"
convert("fifteen sixteen");  // "1516"
convert("fourteen ninety two");  // "1492"
convert("nineteen ten");  // "1910"
convert("twenty twenty");  // "2020" <---- ugh!
convert("twenty twenty one");  // "2021"  <---- ehhh...
convert("twenty twenty two");  // "2022"  <---- let's hope!
convert("four five two three eight");  // "45238"
convert("sixteen thousand three eighty four");  // "16384"
convert("seven billion six hundred eighty-one million"); // "7681000000"
convert("twenty three trillion and nine");  // "23000000000009"
convert("four billion two hundred nine thousand");  // "4000209000"
convert("nine hundred ninety nine quadrillion nine ninety nine trillion nine hundred and ninety nine billion nine ninety-nine million nine hundred ninety-nine thousand nine ninety nine");  // "999999999999999999"
convert("one two three four five six seven eight nine eight seven six five four three two one two three four five"); // "123456789876543212345"
 
convert("forty two point zero");  // "42.0"
convert("three point one four one five nine two six");  // "3.1415926"
convert("point");  // "0.0"
convert("four point zero o o o zero");  // "4.00000"
 
convert("sixty five thousand five thirty six",",");  // "65,536"
convert("four billion two hundred nine thousand",",");  // "4,000,209,000"
convert("forty two",",");  // "42"
convert("twenty one twenty three",",");  // "2,123"
convert("one two three four five six seven eight nine eight seven six five four three two one two three four five",","); // "123456,789,876,543,212,345" <---- not a mistake, quadrillion is the highest supported "place"
javascript/libs.1614180093.txt.gz · Last modified: 2023/01/01 18:43 (external edit)