From 858e7513fc269c21e26b14dc963054c78ddcce1a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?St=C3=A9phane=20Goetz?= <onigoetz@onigoetz.ch>
Date: Sat, 19 Oct 2024 16:43:36 +0200
Subject: [PATCH] Optimize parseText (#327)

---
 packages/benchmark-messageformat/README.md | 94 +++++++++++-----------
 packages/messageformat/src/parser.ts       | 92 ++++++++++++---------
 2 files changed, 101 insertions(+), 85 deletions(-)

diff --git a/packages/benchmark-messageformat/README.md b/packages/benchmark-messageformat/README.md
index f76c67c..20f6661 100644
--- a/packages/benchmark-messageformat/README.md
+++ b/packages/benchmark-messageformat/README.md
@@ -16,11 +16,11 @@ Sources can be found in `src`, measure taken on 07/12/2023 with latest available
 | Npm Package                                           | Version    | Size | Comment     |
 | ----------------------------------------------------- | ---------- | ---- | ----------- |
 | @ffz/icu-msgparser (+ custom renderer)                | 2.0.0      | 9.4K |             |
-| @onigoetz/messageformat (+ @onigoetz/intl-formatters) | 1.0.0-rc.2 | 8.1K |             |
+| @onigoetz/messageformat (+ @onigoetz/intl-formatters) | 1.0.0-rc.2 | 8K   |             |
 | @onigoetz/messageformat (+ @onigoetz/make-plural)     | 1.0.0-rc.2 | 11K  |             |
 | format-message-parse                                  | 6.2.4      | 22K  | Uses peg.js |
 | @onigoetz/messageformat (+ make-plural)               | 1.0.0-rc.2 | 23K  |             |
-| @onigoetz/messageformat (+ @phensley/plurals)         | 1.0.0-rc.2 | 41K  |             |
+| @onigoetz/messageformat (+ @phensley/plurals)         | 1.0.0-rc.2 | 40K  |             |
 | intl-messageformat                                    | 10.5.14    | 55K  | Uses peg.js |
 | @phensley/messageformat                               | 1.9.0      | 54K  |             |
 | @messageformat/core                                   | 3.4.0      | 74K  | Uses peg.js |
@@ -50,7 +50,7 @@ The benchmark is applied to 4 different strings, which for the simple cases shou
 >
 > - Node.js v20.9.0
 > - Apple M2 CPU
-> - October 18, 2024
+> - October 19, 2024
 
 ## Simple String
 
@@ -60,18 +60,18 @@ const input = [`Hello, world!`, {}];
 // Renders: `Hello, world!`
 ```
 
-| Name                                                  |   ops/sec | MoE     | Runs sampled |
-| ----------------------------------------------------- | --------: | ------- | ------------ |
-| **format-message-parse**                              | 8,895,545 | ± 0.23% | 94           |
-| @onigoetz/messageformat (+ @phensley/plurals)         | 8,596,907 | ± 0.16% | 99           |
-| @onigoetz/messageformat (+ @onigoetz/make-plural)     | 8,583,838 | ± 0.14% | 101          |
-| @onigoetz/messageformat (+ @onigoetz/intl-formatters) | 8,396,261 | ± 2.03% | 94           |
-| @onigoetz/messageformat (+ make-plural)               | 8,222,727 | ± 1.48% | 97           |
-| @phensley/messageformat                               | 8,079,695 | ± 0.22% | 99           |
-| @ffz/icu-msgparser (+ custom renderer)                | 5,662,192 | ± 0.16% | 97           |
-| @messageformat/core                                   | 1,715,496 | ± 0.13% | 97           |
-| intl-messageformat                                    |   240,408 | ± 0.63% | 93           |
-| globalize                                             |    37,391 | ± 0.31% | 96           |
+| Name                                                  |    ops/sec | MoE     | Runs sampled |
+| ----------------------------------------------------- | ---------: | ------- | ------------ |
+| **@onigoetz/messageformat (+ @onigoetz/make-plural)** | 18,624,709 | ± 0.64% | 93           |
+| @onigoetz/messageformat (+ @phensley/plurals)         | 18,440,109 | ± 1.91% | 97           |
+| @onigoetz/messageformat (+ @onigoetz/intl-formatters) | 18,181,841 | ± 1.13% | 92           |
+| @onigoetz/messageformat (+ make-plural)               | 17,425,302 | ± 3.11% | 91           |
+| format-message-parse                                  |  8,697,924 | ± 0.26% | 100          |
+| @phensley/messageformat                               |  7,948,798 | ± 0.35% | 100          |
+| @ffz/icu-msgparser (+ custom renderer)                |  5,693,123 | ± 0.28% | 97           |
+| @messageformat/core                                   |  1,635,413 | ± 2.11% | 96           |
+| intl-messageformat                                    |    189,554 | ± 9.22% | 79           |
+| globalize                                             |     34,190 | ± 5.54% | 93           |
 
 ## With one variable
 
@@ -88,16 +88,16 @@ const input = [
 
 | Name                                                      |   ops/sec | MoE     | Runs sampled |
 | --------------------------------------------------------- | --------: | ------- | ------------ |
-| **@onigoetz/messageformat (+ @onigoetz/intl-formatters)** | 6,414,862 | ± 0.11% | 98           |
-| @onigoetz/messageformat (+ make-plural)                   | 6,380,084 | ± 0.31% | 98           |
-| @onigoetz/messageformat (+ @phensley/plurals)             | 6,233,602 | ± 0.22% | 98           |
-| @onigoetz/messageformat (+ @onigoetz/make-plural)         | 6,228,841 | ± 0.20% | 100          |
-| format-message-parse                                      | 3,978,459 | ± 0.19% | 99           |
-| @phensley/messageformat                                   | 3,399,981 | ± 0.13% | 99           |
-| @ffz/icu-msgparser (+ custom renderer)                    | 3,361,547 | ± 0.27% | 100          |
-| @messageformat/core                                       |   880,358 | ± 0.26% | 100          |
-| intl-messageformat                                        |   216,110 | ± 0.54% | 95           |
-| globalize                                                 |    36,506 | ± 0.32% | 97           |
+| **@onigoetz/messageformat (+ @onigoetz/intl-formatters)** | 7,561,041 | ± 0.19% | 93           |
+| @onigoetz/messageformat (+ make-plural)                   | 7,431,574 | ± 0.14% | 100          |
+| @onigoetz/messageformat (+ @onigoetz/make-plural)         | 7,349,978 | ± 0.37% | 98           |
+| @onigoetz/messageformat (+ @phensley/plurals)             | 6,904,747 | ± 0.51% | 97           |
+| format-message-parse                                      | 4,019,700 | ± 0.21% | 100          |
+| @ffz/icu-msgparser (+ custom renderer)                    | 3,378,807 | ± 0.27% | 97           |
+| @phensley/messageformat                                   | 3,359,162 | ± 0.16% | 99           |
+| @messageformat/core                                       |   853,873 | ± 1.96% | 89           |
+| intl-messageformat                                        |   217,803 | ± 0.61% | 96           |
+| globalize                                                 |    36,601 | ± 0.17% | 98           |
 
 ## With plurals
 
@@ -116,16 +116,16 @@ const input = [
 
 | Name                                                  | ops/sec | MoE     | Runs sampled |
 | ----------------------------------------------------- | ------: | ------- | ------------ |
-| **@onigoetz/messageformat (+ @phensley/plurals)**     | 940,533 | ± 0.19% | 98           |
-| @onigoetz/messageformat (+ make-plural)               | 919,852 | ± 0.14% | 98           |
-| @onigoetz/messageformat (+ @onigoetz/intl-formatters) | 772,695 | ± 0.25% | 98           |
-| @phensley/messageformat                               | 551,128 | ± 0.13% | 96           |
-| @messageformat/core                                   | 185,781 | ± 0.11% | 98           |
-| @onigoetz/messageformat (+ @onigoetz/make-plural)     | 152,541 | ± 0.10% | 100          |
-| @ffz/icu-msgparser (+ custom renderer)                | 130,129 | ± 0.10% | 97           |
-| format-message-parse                                  |  81,703 | ± 0.28% | 97           |
-| intl-messageformat                                    |  48,687 | ± 2.22% | 92           |
-| globalize                                             |  27,151 | ± 0.17% | 97           |
+| **@onigoetz/messageformat (+ @phensley/plurals)**     | 982,506 | ± 0.19% | 99           |
+| @onigoetz/messageformat (+ make-plural)               | 960,514 | ± 0.14% | 95           |
+| @onigoetz/messageformat (+ @onigoetz/intl-formatters) | 806,120 | ± 0.08% | 101          |
+| @phensley/messageformat                               | 547,569 | ± 0.35% | 100          |
+| @messageformat/core                                   | 183,629 | ± 0.18% | 94           |
+| @onigoetz/messageformat (+ @onigoetz/make-plural)     | 154,218 | ± 0.07% | 100          |
+| @ffz/icu-msgparser (+ custom renderer)                | 129,434 | ± 0.08% | 97           |
+| format-message-parse                                  |  82,619 | ± 0.16% | 99           |
+| intl-messageformat                                    |  48,941 | ± 2.50% | 91           |
+| globalize                                             |  27,024 | ± 0.30% | 95           |
 
 ## With select and plurals
 
@@ -171,15 +171,15 @@ const input = [`
   `
 ```
 
-| Name                                                  | ops/sec | MoE      | Runs sampled |
-| ----------------------------------------------------- | ------: | -------- | ------------ |
-| **@onigoetz/messageformat (+ @phensley/plurals)**     | 140,415 | ± 0.07%  | 99           |
-| @onigoetz/messageformat (+ make-plural)               | 136,452 | ± 0.11%  | 98           |
-| @onigoetz/messageformat (+ @onigoetz/intl-formatters) | 135,955 | ± 0.11%  | 99           |
-| @onigoetz/messageformat (+ @onigoetz/make-plural)     |  79,580 | ± 0.07%  | 98           |
-| @phensley/messageformat                               |  53,396 | ± 0.16%  | 101          |
-| @ffz/icu-msgparser (+ custom renderer)                |  29,484 | ± 0.72%  | 95           |
-| @messageformat/core                                   |  29,510 | ± 0.93%  | 95           |
-| intl-messageformat                                    |  15,706 | ± 1.99%  | 89           |
-| format-message-parse                                  |  15,356 | ± 12.31% | 85           |
-| globalize                                             |   8,250 | ± 1.98%  | 91           |
+| Name                                                  | ops/sec | MoE     | Runs sampled |
+| ----------------------------------------------------- | ------: | ------- | ------------ |
+| **@onigoetz/messageformat (+ @phensley/plurals)**     | 180,818 | ± 0.33% | 100          |
+| @onigoetz/messageformat (+ make-plural)               | 178,976 | ± 0.22% | 99           |
+| @onigoetz/messageformat (+ @onigoetz/intl-formatters) | 172,978 | ± 0.25% | 97           |
+| @onigoetz/messageformat (+ @onigoetz/make-plural)     |  89,534 | ± 0.15% | 99           |
+| @phensley/messageformat                               |  53,231 | ± 0.07% | 101          |
+| @messageformat/core                                   |  30,664 | ± 0.20% | 96           |
+| @ffz/icu-msgparser (+ custom renderer)                |  29,430 | ± 1.11% | 95           |
+| intl-messageformat                                    |  17,116 | ± 1.02% | 95           |
+| format-message-parse                                  |  17,090 | ± 2.95% | 95           |
+| globalize                                             |   8,766 | ± 0.08% | 100          |
diff --git a/packages/messageformat/src/parser.ts b/packages/messageformat/src/parser.ts
index 0bac3c4..8eb7da1 100644
--- a/packages/messageformat/src/parser.ts
+++ b/packages/messageformat/src/parser.ts
@@ -64,10 +64,6 @@ function expected(char: string, context: Context): SyntaxError {
   );
 }
 
-function peek(context: Context): number {
-  return context.msg.charCodeAt(context.i + 1);
-}
-
 function get(context: Context): number {
   return context.msg.charCodeAt(context.i);
 }
@@ -160,17 +156,23 @@ function add(context: Context, token: Token): number {
 }
 
 /**
- * Parse text, stop or not at separators, stop or not at spaces, stop or not at #
- * Could use some cleanup :/
+ * Parse text
+ *
+ * Stops when it finds an opening `{`, a closing `}` or a sub-variable `#` character, unless that character is escaped.
+ *
+ * Returns string without escape characters
  *
  * @param context
  * @param specialHash
  */
 function parseText(context: Context, specialHash = false): string {
-  let out = "";
+  let start = context.i;
+
+  // Offsets (relative to `start`) of the escape characters to strip once we reach the end of the text
+  const toRemove = [];
 
   while (context.i < context.l) {
-    const char = get(context);
+    let char = get(context);
     if (
       char === CHAR_OPEN ||
       char === CHAR_CLOSE ||
@@ -179,46 +181,60 @@ function parseText(context: Context, specialHash = false): string {
       break;
     }
 
+    if (char !== CHAR_ESCAPE) {
+      context.i++;
+      continue;
+    }
+
+    // Since it's an escape, jump to the next character
+    ++context.i;
+    char = get(context);
+
     if (char === CHAR_ESCAPE) {
+      // Escaped Escape Character
+      // Remove one of the two escape characters
+      toRemove.unshift(context.i - start);
       ++context.i;
-      let next = get(context);
-      if (next === CHAR_ESCAPE) {
-        // Escaped Escape Character
-        out += String.fromCharCode(next);
-        ++context.i;
-      } else if (
-        next === CHAR_OPEN ||
-        next === CHAR_CLOSE ||
-        (specialHash && next === CHAR_SUB_VAR)
-      ) {
-        // Special Character
-        out += String.fromCharCode(next);
-        while (++context.i < context.l) {
-          next = get(context);
-          if (next === CHAR_ESCAPE) {
-            // Check for an escaped escape character, and don't
-            // stop if we encounter one.
-            next = peek(context);
-            if (next === CHAR_ESCAPE) {
-              out += String.fromCharCode(next);
-              ++context.i;
-            } else {
-              ++context.i;
-              break;
-            }
-          } else {
-            out += String.fromCharCode(next);
+
+      console.log();
+    } else if (
+      char === CHAR_OPEN ||
+      char === CHAR_CLOSE ||
+      (specialHash && char === CHAR_SUB_VAR)
+    ) {
+      toRemove.unshift(context.i - start - 1);
+
+      // Special Character
+      // Escaping a special character opens an escaped section: keep moving forward until the
+      // next escape character (an escaped escape character does not end the section)
+      while (++context.i < context.l) {
+        char = get(context);
+
+        if (char === CHAR_ESCAPE) {
+          // Always ignore the escape character itself
+          toRemove.unshift(context.i - start);
+
+          // If we find a second escape character, we continue, otherwise we stop
+          ++context.i;
+          char = get(context);
+          if (char !== CHAR_ESCAPE) {
+            break;
           }
         }
-      } else {
-        out += String.fromCharCode(char);
       }
     } else {
+      // This is not escaping a special character, we keep it
       ++context.i;
-      out += String.fromCharCode(char);
     }
   }
 
+  let out = context.msg.substring(start, context.i);
+
+  // Remove all escapes from the final string
+  for (const idx of toRemove) {
+    out = out.substring(0, idx) + out.substring(idx + 1);
+  }
+
   return out;
 }