From 245f43f52c22515d9d967acac949d770d008f7c3 Mon Sep 17 00:00:00 2001
From: Joshua Horton <joshua_horton@sil.org>
Date: Wed, 13 May 2026 15:13:42 -0500
Subject: [PATCH] change(web): adjust TokenizationCorrector spec

Build-bot: skip build:web
Test-bot: skip
---
 .../main/correction/tokenization-corrector.ts | 75 ++++++++++++++-----
 .../correction/tokenization-result-mapping.ts | 58 ++++++++------
 .../worker-thread/src/main/predict-helpers.ts | 15 +++-
 .../tokenization-corrector.tests.ts           | 52 ++++---------
 .../predict-from-correction-sequence.tests.ts | 11 +--
 5 files changed, 124 insertions(+), 87 deletions(-)
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts
index 835f9eef388..9ac64ef4e49 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts
@@ -14,7 +14,7 @@ import { ContextToken } from "./context-token.js";
 import { CorrectionSearchable, PathResult } from "./correction-searchable.js";
 import { ContextTokenization } from "./context-tokenization.js";
 import { QuotientNodeFinalizer } from "./quotient-node-finalizer.js";
-import { TokenizationResultMapping } from "./tokenization-result-mapping.js";
+import { TokenizationResult, TokenizationResultMapping } from "./tokenization-result-mapping.js";
 import { EDIT_DISTANCE_COST_SCALE } from "./distance-modeler.js";
 import { MAX_EDIT_THRESHOLD_FACTOR } from "./search-quotient-spur.js";
 
@@ -46,7 +46,7 @@ export type TokenResult = {
  * all correctable tokens, generating corrections for the full represented
  * range.
  */
-export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray<TokenResult>, TokenizationResultMapping> {
+export class TokenizationCorrector implements CorrectionSearchable<TokenizationResult, TokenizationResultMapping> {
   public readonly tokenization: ContextTokenization;
   private readonly tailCorrectionLength: number;
 
@@ -56,6 +56,8 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
   private _predictable?: QuotientNodeFinalizer;
   private _generatedTokenResults: Map<number, TokenResult>;
   private _previousResults: TokenizationResultMapping[] = [];
+  private _correctableCodepoints: number = 0;
+  private _correctablesMatched = 0;
 
   // fully private
   public readonly modelsCorrectables: boolean;
@@ -65,6 +67,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
   private lastTotalCost: number;
   private handleHasBeenCalled: boolean = false;
   private predictableMatchFound: boolean = false;
+  private matchableTokenCount = 0;
 
   get currentCost(): number {
     const correctable = this.selectionQueue.peek();
@@ -106,6 +109,10 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
     return this._correctables.map((c) => this.tokenLookupMap.get(c.spaceId));
   }
 
+  get correctableCodepoints(): number {
+    return this._correctableCodepoints;
+  }
+
   /**
    * Returns the token, if it exists, that is considered "predictable".
    *
@@ -142,6 +149,10 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
     return this._previousResults;
   };
 
+  get matchedTokenCount() {
+    return this._correctablesMatched + (this.predictableMatchFound ? 1 : 0);
+  }
+
   /**
    * Constructs an instance of TokenizationCorrector for finding corrections for
    * correctable tokens within the specified section of an existing
@@ -156,7 +167,7 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
   constructor(
     tokenization: ContextTokenization,
     tailCorrectionLength: number,
-    filterClosure: (token: ContextToken) => boolean
+    filterClosure: (token: ContextToken, index?: number) => boolean
   ) {
     this.tokenization = tokenization;
     this.tailCorrectionLength = tailCorrectionLength;
@@ -175,16 +186,23 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
     this.tokenLookupMap = new Map();
     let modelsCorrectables = false;
 
+    // Index 0 is the first token in the range being modeled, as split off from the main tokenization.
     orderedTokens.forEach((token, index) => {
       // New issue:  this mangles the space IDs!  We almost certainly need some
       // sort of proper map to the source token.
       const searchModule = new QuotientNodeFinalizer(token.searchModule, index == orderedTokens.length - 1);
       this.tokenLookupMap.set(searchModule.spaceId, token);
-      const passesFilter = filterClosure(token);
+      // Index within the token subset being examined.
+      const passesFilter = filterClosure(token, index);
       modelsCorrectables ||= passesFilter;
       if(!passesFilter) {
         this._uncorrectables.push(searchModule);
-      } else if(index == tailCorrectionLength - 1) {
+        return;
+      }
+
+      this.matchableTokenCount++;
+      this._correctableCodepoints += searchModule.codepointLength;
+      if(index == tailCorrectionLength - 1) {
         // The sole assignment case for this field.  It may only be assigned for
         // the final token, and only if its text is of a form considered
         // correctable by the filter.
@@ -270,13 +288,19 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
         // It is possible that the editable tokenization range exists entirely of
         // tokens considered to be uncorrectable.
         this.handleHasBeenCalled = true;
-        const results = this.collateResults();
-        this._previousResults.push(results);
-        return {
-          'type': 'complete',
-          cost: this.lastTotalCost,
-          mapping: results
-        };
+
+        // If no tokens were matched, there is nothing to predict from; report 'none' rather than a result.
+        if(this.matchedTokenCount > 0) {
+          const results = this.collateResults();
+          this._previousResults.push(results);
+          return {
+            'type': 'complete',
+            cost: this.lastTotalCost,
+            mapping: results
+          };
+        } else {
+          return { type: 'none' };
+        }
       }
     }
 
@@ -314,6 +338,8 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
         });
       }
 
+      this._correctableCodepoints -= correctableToUpdate.codepointLength;
+
       // We can make no further predictions if we've exhausted all search options.
       // If we've reached this case, we're likely at the end of the search
       // (unless correction for a correctable is still possible).
@@ -331,6 +357,8 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
 
       if(correctionIsThePredictable) {
         this.predictableMatchFound = true;
+      } else {
+        this._correctablesMatched++;
       }
 
       // Either way, update the token -> correction-string map with the obtained result.
@@ -363,8 +391,8 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
       this.selectionQueue.enqueue(this._predictable);
     }
 
-    const correctionResults = this.collateResults();
-    if(correctionResults.matchedResult.findIndex((c) => c == undefined) != -1) {
+    // If any token lacks a generated result, stay 'intermediate'; `some` (not `find`) also catches a falsy key like 0.
+    if([...this.tokenLookupMap.keys()].some((k) => !this._generatedTokenResults.has(k))) {
       return {
         type: 'intermediate',
         cost: tokenizationCost
@@ -376,12 +404,19 @@ export class TokenizationCorrector implements CorrectionSearchable<ReadonlyArray
     // If there was no result obtained from the predictable and a result was previously found,
     // that indicates no further predictions may be found.
     if(tokenResult.type != 'none' || !correctionIsThePredictable || !this.predictableMatchFound) {
-      this._previousResults.push(correctionResults);
-      return {
-        type: 'complete',
-        cost: tokenizationCost,
-        mapping: correctionResults
-      };
+      if(this.matchedTokenCount > 0) {
+        const correctionResults = this.collateResults();
+        this._previousResults.push(correctionResults);
+        return {
+          type: 'complete',
+          cost: tokenizationCost,
+          mapping: correctionResults
+        };
+      } else {
+        return {
+          type: 'none'
+        }
+      }
     } else {
       return {
         type: 'none'
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts
index c5588424afe..da8f4c42a02 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts
@@ -1,37 +1,51 @@
 import { CorrectionResultMapping } from "./correction-result-mapping.js";
 import { TokenizationCorrector, TokenResult } from './tokenization-corrector.js';
 
-export class TokenizationResultMapping implements CorrectionResultMapping<ReadonlyArray<TokenResult>> {
+export interface TokenizationResult {
+  tokenCorrections: ReadonlyArray<TokenResult>,
+  totalEditCount: number,
+  totalEditableCodepoints: number
+}
+
+export class TokenizationResultMapping implements CorrectionResultMapping<TokenizationResult> {
   readonly matchingSpace: TokenizationCorrector;
-  readonly matchedResult: ReadonlyArray<TokenResult>;
+  readonly matchedResult: TokenizationResult;
 
   constructor(tokenization: TokenResult[], corrector?: TokenizationCorrector) {
     this.matchingSpace = corrector;
-    this.matchedResult = tokenization;
+
+    this.matchedResult = {
+      tokenCorrections: tokenization,
+      totalEditCount: tokenization.reduce((accum, curr) => accum + curr.knownCost, 0),
+      // If based on a legacy/custom model not using traversals, we don't
+      // support edit operations (for correction) beyond the direct results of
+      // the most recent input distribution.
+      totalEditableCodepoints: corrector?.correctableCodepoints ?? 0
+    }
   }
 
   get spaceId(): number {
     return this.matchingSpace?.tokenization.spaceId;
   }
 
-  // /**
-  //  * Gets the number of Damerau-Levenshtein edits needed to reach the node's
-  //  * matchString from the output induced by the input sequence used to reach it.
-  //  *
-  //  * (This is scaled by `SearchSpace.EDIT_DISTANCE_COST_SCALE` when included in
-  //  * `totalCost`.)
-  //  */
-  // get knownCost(): number {
-  //   return this.node.editCount;
-  // }
-
-  // /**
-  //  * Gets the "input sampling cost" of the edge, which should be considered as the
-  //  * negative log-likelihood of the input path taken to reach the node.
-  //  */
-  // get inputSamplingCost(): number {
-  //   return this.node.inputSamplingCost;
-  // }
+  /**
+   * Gets the number of Damerau-Levenshtein edits needed to reach the node's
+   * matchString from the output induced by the input sequence used to reach it.
+   *
+   * (This is scaled by `SearchSpace.EDIT_DISTANCE_COST_SCALE` when included in
+   * `totalCost`.)
+   */
+  get knownCost(): number {
+    return this.matchedResult.totalEditCount;
+  }
+
+  /**
+   * Gets the "input sampling cost" of the edge, which should be considered as the
+   * negative log-likelihood of the input path taken to reach the node.
+   */
+  get inputSamplingCost(): number {
+    return this.matchedResult.tokenCorrections.reduce((accum, curr) => accum + curr.inputSamplingCost, 0);
+  }
 
   /**
    * Gets the "total cost" of the edge, which should be considered as the
@@ -40,6 +54,6 @@ export class TokenizationResultMapping implements CorrectionResultMapping<Readon
    * to the resulting output.
    */
   get totalCost(): number {
-    return this.matchedResult.reduce((total, curr) => total + curr.totalCost, 0);
+    return this.matchedResult.tokenCorrections.reduce((total, curr) => total + curr.totalCost, 0);
   }
 }
\ No newline at end of file
diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
index 6ef54116737..aa0cd46db3b 100644
--- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
+++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts
@@ -544,8 +544,8 @@ export function buildCorrectionSequence(
   const orderedTokens = tokenizationCorrection.matchingSpace?.orderedTokens;
   const tokens: PredictionParameters['tokens'] = [];
 
-  for(let i = 0; i < tokenizationCorrection.matchedResult.length; i++) {
-    const correction = tokenizationCorrection.matchedResult[i];
+  for(let i = 0; i < tokenizationCorrection.matchedResult.tokenCorrections.length; i++) {
+    const correction = tokenizationCorrection.matchedResult.tokenCorrections[i];
     /* If we're dealing with the FIRST keystroke of a new sequence, we'll **dramatically** boost
      * the exponent to ensure only VERY nearby corrections have a chance of winning, and only if
      * there are significantly more likely words.  We only need this to allow very minor fat-finger
@@ -817,9 +817,18 @@ export function predictFromCorrectionSequence(
 
   const predictionComponents = correctionTokens.map((correctionToken, i) => {
     const correctionTransform = correctionToken.correction.sample;
-    const predictions = lexicalModel.predict(correctionTransform, currentContext);
+    let predictions = lexicalModel.predict(correctionTransform, currentContext);
     const transitionId = correctionTransform.id;
 
+    // Every token except the tail must keep its codepoint length:  discard any
+    // prediction whose inserted text differs in length from the correction's.
+    if(i != correctionTokens.length - 1) {
+      const codepointLength = KMWString.length(correctionTransform.insert);
+      predictions = predictions.filter((p) => {
+        return KMWString.length(p.sample.transform.insert) == codepointLength;
+      });
+    }
+
     // Failsafe:  if there are no matching predictions, create a fake prediction
     // matching the original text.
     if(predictions.length != 0) {
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts
index 4cf66137e74..8949eb7a166 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts
@@ -29,7 +29,8 @@ import {
   SubstitutionQuotientSpur,
   TokenizationCorrector,
   TokenResult,
-  TokenizationResultMapping
+  TokenizationResultMapping,
+  TokenizationResult
 } from '@keymanapp/lm-worker/test-index';
 
 import Distribution = LexicalModelTypes.Distribution;
@@ -302,7 +303,7 @@ describe('TokenizationCorrector', () => {
       assert.equal(searchResult.type, 'complete');
       if(searchResult.type == 'complete') {
         const mapping = searchResult.mapping;
-        const tokenResults = mapping.matchedResult;
+        const tokenResults = mapping.matchedResult.tokenCorrections;
         assert.isNotNaN(searchResult.cost);
         assert.equal(searchResult.cost, searchResult.mapping.totalCost);
         assert.equal(tokenResults.length, 1);
@@ -327,7 +328,7 @@ describe('TokenizationCorrector', () => {
       assert.equal(searchResult.type, 'none');
     });
 
-    it('finds a default correction for a single correctable token without a model match', () => {
+    it('returns no result when a single correctable token lacks a model match', () => {
       const fixture = buildFixture_therefore();
 
       const theref = fixture.theref.tail;
@@ -371,23 +372,6 @@ describe('TokenizationCorrector', () => {
         searchResult = instance.handleNextNode();
       } while(searchResult.type == 'intermediate');
 
-      assert.equal(searchResult.type, 'complete');
-      if(searchResult.type == 'complete') {
-        const mapping = searchResult.mapping;
-        const tokenResults = mapping.matchedResult;
-        assert.isNotNaN(searchResult.cost);
-        assert.equal(searchResult.cost, searchResult.mapping.totalCost);
-        assert.equal(tokenResults.length, 1);
-        assert.sameOrderedMembers(tokenResults.map((r) => r.matchString), ['therefxyz']);
-
-        // Now that an entry has been found, verify the corrector's state.
-        assert.isNotOk(instance.predictableToken); // should become an uncorrectable.
-        assert.isTrue(instance.generatedTokenResults.has(therefxyz));
-        assert.equal(instance.generatedTokenResults.get(therefxyz), tokenResults[0]);
-      }
-
-      // There should be no further possible suggestions.
-      searchResult = instance.handleNextNode();
       assert.equal(searchResult.type, 'none');
     });
 
@@ -411,7 +395,7 @@ describe('TokenizationCorrector', () => {
       let firstResults: ReadonlyArray<TokenResult>;
       if(searchResult.type == 'complete') {
         const mapping = searchResult.mapping;
-        const tokenResults = mapping.matchedResult;
+        const tokenResults = mapping.matchedResult.tokenCorrections;
         firstResults = tokenResults;
         assert.isNotNaN(searchResult.cost);
         assert.equal(searchResult.cost, searchResult.mapping.totalCost);
@@ -434,7 +418,7 @@ describe('TokenizationCorrector', () => {
         searchResult = instance.handleNextNode();
         if(searchResult.type == 'complete') {
           const mapping = searchResult.mapping;
-          const tokenResults = mapping.matchedResult;
+          const tokenResults = mapping.matchedResult.tokenCorrections;
 
           // Verify that the first (bound) token is not altered further.
           // It should receive no further correction attempts.
@@ -445,7 +429,7 @@ describe('TokenizationCorrector', () => {
       } while(searchResult.type != 'none');
     });
 
-    it('immediately returns a single result when the only represented token is uncorrectable', () => {
+    it('immediately returns with no result when the only represented token is uncorrectable', () => {
       const fixture = buildFixture_terminalWhitespace();
 
       const tokenization = fixture.spaceOnly;
@@ -457,13 +441,7 @@ describe('TokenizationCorrector', () => {
       );
 
       const searchResult = instance.handleNextNode();
-      assert.equal(searchResult.type, 'complete');
-      if(searchResult.type == 'complete') {
-        assert.equal(searchResult.mapping.matchedResult[0].matchString, ' ');
-      }
-
-      const nilResult = instance.handleNextNode();
-      assert.equal(nilResult.type, 'none');
+      assert.equal(searchResult.type, 'none');
     });
 
     it('returns a single result when the final token is uncorrectable', () => {
@@ -484,8 +462,8 @@ describe('TokenizationCorrector', () => {
 
       assert.equal(searchResult.type, 'complete');
       if(searchResult.type == 'complete') {
-        assert.equal(searchResult.mapping.matchedResult[0].matchString, 'space');
-        assert.equal(searchResult.mapping.matchedResult[1].matchString, ' ');
+        assert.equal(searchResult.mapping.matchedResult.tokenCorrections[0].matchString, 'space');
+        assert.equal(searchResult.mapping.matchedResult.tokenCorrections[1].matchString, ' ');
       }
 
       const nilResult = instance.handleNextNode();
@@ -502,20 +480,20 @@ describe('TokenizationCorrector', () => {
         let haveSeenSingleTokenCorrection = false;
         let haveSeenThreeTokenCorrection = false;
         for await(let phraseMatch of getBestMatches<
-          ReadonlyArray<TokenResult>,
+          TokenizationResult,
           TokenizationResultMapping,
           TokenizationCorrector
           >(correctors, buildTestTimer())) {
 
-          if(phraseMatch.matchedResult.length == 1) {
+          if(phraseMatch.matchedResult.tokenCorrections.length == 1) {
             if(!haveSeenSingleTokenCorrection) {
-              assert.sameOrderedMembers(phraseMatch.matchedResult.map((t) => t.matchString), ['theref' /* -ore */]);
+              assert.sameOrderedMembers(phraseMatch.matchedResult.tokenCorrections.map((t) => t.matchString), ['theref' /* -ore */]);
             }
 
             haveSeenSingleTokenCorrection = true;
-          } else if(phraseMatch.matchedResult.length == 3) {
+          } else if(phraseMatch.matchedResult.tokenCorrections.length == 3) {
             if(!haveSeenThreeTokenCorrection) {
-              assert.sameOrderedMembers(phraseMatch.matchedResult.map((t) => t.matchString), ['the', ' ', 'ef' /* -fort */]);
+              assert.sameOrderedMembers(phraseMatch.matchedResult.tokenCorrections.map((t) => t.matchString), ['the', ' ', 'ef' /* -fort */]);
             }
             haveSeenThreeTokenCorrection = true;
           }
diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts
index a31371ac01f..5f751f50d07 100644
--- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts
+++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts
@@ -542,8 +542,9 @@ describe('predictFromCorrectionSequence', () => {
       ];
 
       const expected_prediction_p = dummied_suggestion_sequences
-        .map((dist) => {
-          return dist[0]
+        .map((dist, i) => {
+          // There is no valid 'g' entry corresponding to token index 0.
+          return i == 0 ? null : dist[0]
         }).reduce((accum, curr) => {
           return accum * (curr ? curr.p : Math.exp(-EDIT_DISTANCE_COST_SCALE))
         }, 1);
@@ -551,11 +552,11 @@ describe('predictFromCorrectionSequence', () => {
       const expected_predictions: Suggestion[] = [
         {
           transform: {
-            insert: 'golden',
+            insert: 'g',
             deleteLeft: 0,
             id: transitionID
           },
-          displayAs: 'golden',
+          displayAs: 'g',
           transformId: transitionID
         }, {
           transform: {
@@ -589,7 +590,7 @@ describe('predictFromCorrectionSequence', () => {
       predictions.forEach((entry) => assert.equal(entry.metadata.probabilities.correction, parameters.tokens.reduce((accum, curr) => accum * curr.correction.p, 1)));
       predictions.sort(tupleDisplayOrderSort);
 
-      assert.deepEqual(predictions[0].components.map((c) => c.prediction.transform.insert), ['golden', ' ', 'apple']);
+      assert.deepEqual(predictions[0].components.map((c) => c.prediction.transform.insert), ['g', ' ', 'apple']);
       assert.sameDeepOrderedMembers(predictions[0].components.map((entry) => entry.prediction), expected_predictions);
 
       assert.approximately(predictions[0].metadata.probabilities.prediction, expected_prediction_p, 0.00001);