From 245f43f52c22515d9d967acac949d770d008f7c3 Mon Sep 17 00:00:00 2001 From: Joshua Horton Date: Wed, 13 May 2026 15:13:42 -0500 Subject: [PATCH] change(web): adjust TokenizationCorrector spec Build-bot: skip build:web Test-bot: skip --- .../main/correction/tokenization-corrector.ts | 75 ++++++++++++++----- .../correction/tokenization-result-mapping.ts | 58 ++++++++------ .../worker-thread/src/main/predict-helpers.ts | 15 +++- .../tokenization-corrector.tests.ts | 52 ++++--------- .../predict-from-correction-sequence.tests.ts | 11 +-- 5 files changed, 124 insertions(+), 87 deletions(-) diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts index 835f9eef388..9ac64ef4e49 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-corrector.ts @@ -14,7 +14,7 @@ import { ContextToken } from "./context-token.js"; import { CorrectionSearchable, PathResult } from "./correction-searchable.js"; import { ContextTokenization } from "./context-tokenization.js"; import { QuotientNodeFinalizer } from "./quotient-node-finalizer.js"; -import { TokenizationResultMapping } from "./tokenization-result-mapping.js"; +import { TokenizationResult, TokenizationResultMapping } from "./tokenization-result-mapping.js"; import { EDIT_DISTANCE_COST_SCALE } from "./distance-modeler.js"; import { MAX_EDIT_THRESHOLD_FACTOR } from "./search-quotient-spur.js"; @@ -46,7 +46,7 @@ export type TokenResult = { * all correctable tokens, generating corrections for the full represented * range. */ -export class TokenizationCorrector implements CorrectionSearchable, TokenizationResultMapping> { +export class TokenizationCorrector implements CorrectionSearchable { public readonly tokenization: ContextTokenization; private readonly tailCorrectionLength: number; @@ -56,6 +56,8 @@ export class TokenizationCorrector implements CorrectionSearchable; private _previousResults: TokenizationResultMapping[] = []; + private _correctableCodepoints: number = 0; + private _correctablesMatched = 0; // fully private public readonly modelsCorrectables: boolean; @@ -65,6 +67,7 @@ export class TokenizationCorrector implements CorrectionSearchable this.tokenLookupMap.get(c.spaceId)); } + get correctableCodepoints(): number { + return this._correctableCodepoints; + } + /** * Returns the token, if it exists, that is considered "predictable". * @@ -142,6 +149,10 @@ export class TokenizationCorrector implements CorrectionSearchable boolean + filterClosure: (token: ContextToken, index?: number) => boolean ) { this.tokenization = tokenization; this.tailCorrectionLength = tailCorrectionLength; @@ -175,16 +186,23 @@ export class TokenizationCorrector implements CorrectionSearchable { // New issue: this mangles the space IDs! We almost certainly need some // sort of proper map to the source token. const searchModule = new QuotientNodeFinalizer(token.searchModule, index == orderedTokens.length - 1); this.tokenLookupMap.set(searchModule.spaceId, token); - const passesFilter = filterClosure(token); + // Index within the token subset being examined. + const passesFilter = filterClosure(token, index); modelsCorrectables ||= passesFilter; if(!passesFilter) { this._uncorrectables.push(searchModule); - } else if(index == tailCorrectionLength - 1) { + return; + } + + this.matchableTokenCount++; + this._correctableCodepoints += searchModule.codepointLength; + if(index == tailCorrectionLength - 1) { // The sole assignment case for this field. It may only be assigned for // the final token, and only if its text is of a form considered // correctable by the filter. @@ -270,13 +288,19 @@ export class TokenizationCorrector implements CorrectionSearchable 0) { + const results = this.collateResults(); + this._previousResults.push(results); + return { + 'type': 'complete', + cost: this.lastTotalCost, + mapping: results + }; + } else { + return { type: 'none' }; + } } } @@ -314,6 +338,8 @@ export class TokenizationCorrector implements CorrectionSearchable correction-string map with the obtained result. @@ -363,8 +391,8 @@ export class TokenizationCorrector implements CorrectionSearchable c == undefined) != -1) { + // If any token lacks a matching lookup value, abort. + if([...this.tokenLookupMap.keys()].find((k) => !this._generatedTokenResults.has(k))) { return { type: 'intermediate', cost: tokenizationCost @@ -376,12 +404,19 @@ export class TokenizationCorrector implements CorrectionSearchable 0) { + const correctionResults = this.collateResults(); + this._previousResults.push(correctionResults); + return { + type: 'complete', + cost: tokenizationCost, + mapping: correctionResults + }; + } else { + return { + type: 'none' + } + } } else { return { type: 'none' diff --git a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts index c5588424afe..da8f4c42a02 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/correction/tokenization-result-mapping.ts @@ -1,37 +1,51 @@ import { CorrectionResultMapping } from "./correction-result-mapping.js"; import { TokenizationCorrector, TokenResult } from './tokenization-corrector.js'; -export class TokenizationResultMapping implements CorrectionResultMapping> { +export interface TokenizationResult { + tokenCorrections: ReadonlyArray, + totalEditCount: number, + totalEditableCodepoints: number +} + +export class TokenizationResultMapping implements CorrectionResultMapping { readonly matchingSpace: TokenizationCorrector; - readonly matchedResult: ReadonlyArray; + readonly matchedResult: TokenizationResult; constructor(tokenization: TokenResult[], corrector?: TokenizationCorrector) { this.matchingSpace = corrector; - this.matchedResult = tokenization; + + this.matchedResult = { + tokenCorrections: tokenization, + totalEditCount: tokenization.reduce((accum, curr) => accum + curr.knownCost, 0), + // If based on a legacy/custom model not using traversals, we don't + // support edit operations (for correction) beyond the direct results of + // the most recent input distribution. + totalEditableCodepoints: corrector?.correctableCodepoints ?? 0 + } } get spaceId(): number { return this.matchingSpace?.tokenization.spaceId; } - // /** - // * Gets the number of Damerau-Levenshtein edits needed to reach the node's - // * matchString from the output induced by the input sequence used to reach it. - // * - // * (This is scaled by `SearchSpace.EDIT_DISTANCE_COST_SCALE` when included in - // * `totalCost`.) - // */ - // get knownCost(): number { - // return this.node.editCount; - // } - - // /** - // * Gets the "input sampling cost" of the edge, which should be considered as the - // * negative log-likelihood of the input path taken to reach the node. - // */ - // get inputSamplingCost(): number { - // return this.node.inputSamplingCost; - // } + /** + * Gets the number of Damerau-Levenshtein edits needed to reach the node's + * matchString from the output induced by the input sequence used to reach it. + * + * (This is scaled by `SearchSpace.EDIT_DISTANCE_COST_SCALE` when included in + * `totalCost`.) + */ + get knownCost(): number { + return this.matchedResult.totalEditCount; + } + + /** + * Gets the "input sampling cost" of the edge, which should be considered as the + * negative log-likelihood of the input path taken to reach the node. + */ + get inputSamplingCost(): number { + return this.matchedResult.tokenCorrections.reduce((accum, curr) => accum + curr.inputSamplingCost, 0); + } /** * Gets the "total cost" of the edge, which should be considered as the @@ -40,6 +54,6 @@ export class TokenizationResultMapping implements CorrectionResultMapping total + curr.totalCost, 0); + return this.matchedResult.tokenCorrections.reduce((total, curr) => total + curr.totalCost, 0); } } \ No newline at end of file diff --git a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts index 6ef54116737..aa0cd46db3b 100644 --- a/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts +++ b/web/src/engine/predictive-text/worker-thread/src/main/predict-helpers.ts @@ -544,8 +544,8 @@ export function buildCorrectionSequence( const orderedTokens = tokenizationCorrection.matchingSpace?.orderedTokens; const tokens: PredictionParameters['tokens'] = []; - for(let i = 0; i < tokenizationCorrection.matchedResult.length; i++) { - const correction = tokenizationCorrection.matchedResult[i]; + for(let i = 0; i < tokenizationCorrection.matchedResult.tokenCorrections.length; i++) { + const correction = tokenizationCorrection.matchedResult.tokenCorrections[i]; /* If we're dealing with the FIRST keystroke of a new sequence, we'll **dramatically** boost * the exponent to ensure only VERY nearby corrections have a chance of winning, and only if * there are significantly more likely words. We only need this to allow very minor fat-finger @@ -817,9 +817,18 @@ export function predictFromCorrectionSequence( const predictionComponents = correctionTokens.map((correctionToken, i) => { const correctionTransform = correctionToken.correction.sample; - const predictions = lexicalModel.predict(correctionTransform, currentContext); + let predictions = lexicalModel.predict(correctionTransform, currentContext); const transitionId = correctionTransform.id; + // Ensure codepointLength == prediction codepoint length if i does not match the tail! + // Filter out cases that do not conform to this condition. + if(i != correctionTokens.length - 1) { + predictions = predictions.filter((p) => { + const codepointLength = KMWString.length(correctionToken.correction.sample.insert); + return KMWString.length(p.sample.transform.insert) == codepointLength; + }); + } + // Failsafe: if there are no matching predictions, create a fake prediction // matching the original text. if(predictions.length != 0) { diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts index 4cf66137e74..8949eb7a166 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/correction-search/tokenization-corrector.tests.ts @@ -29,7 +29,8 @@ import { SubstitutionQuotientSpur, TokenizationCorrector, TokenResult, - TokenizationResultMapping + TokenizationResultMapping, + TokenizationResult } from '@keymanapp/lm-worker/test-index'; import Distribution = LexicalModelTypes.Distribution; @@ -302,7 +303,7 @@ describe('TokenizationCorrector', () => { assert.equal(searchResult.type, 'complete'); if(searchResult.type == 'complete') { const mapping = searchResult.mapping; - const tokenResults = mapping.matchedResult; + const tokenResults = mapping.matchedResult.tokenCorrections; assert.isNotNaN(searchResult.cost); assert.equal(searchResult.cost, searchResult.mapping.totalCost); assert.equal(tokenResults.length, 1); @@ -327,7 +328,7 @@ describe('TokenizationCorrector', () => { assert.equal(searchResult.type, 'none'); }); - it('finds a default correction for a single correctable token without a model match', () => { + it('returns no result when a single correctable token lacks a model match', () => { const fixture = buildFixture_therefore(); const theref = fixture.theref.tail; @@ -371,23 +372,6 @@ describe('TokenizationCorrector', () => { searchResult = instance.handleNextNode(); } while(searchResult.type == 'intermediate'); - assert.equal(searchResult.type, 'complete'); - if(searchResult.type == 'complete') { - const mapping = searchResult.mapping; - const tokenResults = mapping.matchedResult; - assert.isNotNaN(searchResult.cost); - assert.equal(searchResult.cost, searchResult.mapping.totalCost); - assert.equal(tokenResults.length, 1); - assert.sameOrderedMembers(tokenResults.map((r) => r.matchString), ['therefxyz']); - - // Now that an entry has been found, verify the corrector's state. - assert.isNotOk(instance.predictableToken); // should become an uncorrectable. - assert.isTrue(instance.generatedTokenResults.has(therefxyz)); - assert.equal(instance.generatedTokenResults.get(therefxyz), tokenResults[0]); - } - - // There should be no further possible suggestions. - searchResult = instance.handleNextNode(); assert.equal(searchResult.type, 'none'); }); @@ -411,7 +395,7 @@ describe('TokenizationCorrector', () => { let firstResults: ReadonlyArray; if(searchResult.type == 'complete') { const mapping = searchResult.mapping; - const tokenResults = mapping.matchedResult; + const tokenResults = mapping.matchedResult.tokenCorrections; firstResults = tokenResults; assert.isNotNaN(searchResult.cost); assert.equal(searchResult.cost, searchResult.mapping.totalCost); @@ -434,7 +418,7 @@ describe('TokenizationCorrector', () => { searchResult = instance.handleNextNode(); if(searchResult.type == 'complete') { const mapping = searchResult.mapping; - const tokenResults = mapping.matchedResult; + const tokenResults = mapping.matchedResult.tokenCorrections; // Verify that the first (bound) token is not altered further. // It should receive no further correction attempts. @@ -445,7 +429,7 @@ describe('TokenizationCorrector', () => { } while(searchResult.type != 'none'); }); - it('immediately returns a single result when the only represented token is uncorrectable', () => { + it('immediately returns with no result when the only represented token is uncorrectable', () => { const fixture = buildFixture_terminalWhitespace(); const tokenization = fixture.spaceOnly; @@ -457,13 +441,7 @@ describe('TokenizationCorrector', () => { ); const searchResult = instance.handleNextNode(); - assert.equal(searchResult.type, 'complete'); - if(searchResult.type == 'complete') { - assert.equal(searchResult.mapping.matchedResult[0].matchString, ' '); - } - - const nilResult = instance.handleNextNode(); - assert.equal(nilResult.type, 'none'); + assert.equal(searchResult.type, 'none'); }); it('returns a single result when the final token is uncorrectable', () => { @@ -484,8 +462,8 @@ describe('TokenizationCorrector', () => { assert.equal(searchResult.type, 'complete'); if(searchResult.type == 'complete') { - assert.equal(searchResult.mapping.matchedResult[0].matchString, 'space'); - assert.equal(searchResult.mapping.matchedResult[1].matchString, ' '); + assert.equal(searchResult.mapping.matchedResult.tokenCorrections[0].matchString, 'space'); + assert.equal(searchResult.mapping.matchedResult.tokenCorrections[1].matchString, ' '); } const nilResult = instance.handleNextNode(); @@ -502,20 +480,20 @@ describe('TokenizationCorrector', () => { let haveSeenSingleTokenCorrection = false; let haveSeenThreeTokenCorrection = false; for await(let phraseMatch of getBestMatches< - ReadonlyArray, + TokenizationResult, TokenizationResultMapping, TokenizationCorrector >(correctors, buildTestTimer())) { - if(phraseMatch.matchedResult.length == 1) { + if(phraseMatch.matchedResult.tokenCorrections.length == 1) { if(!haveSeenSingleTokenCorrection) { - assert.sameOrderedMembers(phraseMatch.matchedResult.map((t) => t.matchString), ['theref' /* -ore */]); + assert.sameOrderedMembers(phraseMatch.matchedResult.tokenCorrections.map((t) => t.matchString), ['theref' /* -ore */]); } haveSeenSingleTokenCorrection = true; - } else if(phraseMatch.matchedResult.length == 3) { + } else if(phraseMatch.matchedResult.tokenCorrections.length == 3) { if(!haveSeenThreeTokenCorrection) { - assert.sameOrderedMembers(phraseMatch.matchedResult.map((t) => t.matchString), ['the', ' ', 'ef' /* -fort */]); + assert.sameOrderedMembers(phraseMatch.matchedResult.tokenCorrections.map((t) => t.matchString), ['the', ' ', 'ef' /* -fort */]); } haveSeenThreeTokenCorrection = true; } diff --git a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts index a31371ac01f..5f751f50d07 100644 --- a/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts +++ b/web/src/test/auto/headless/engine/predictive-text/worker-thread/prediction-helpers/predict-from-correction-sequence.tests.ts @@ -542,8 +542,9 @@ describe('predictFromCorrectionSequence', () => { ]; const expected_prediction_p = dummied_suggestion_sequences - .map((dist) => { - return dist[0] + .map((dist, i) => { + // There is no valid 'g' entry corresponding to token index 0. + return i == 0 ? null : dist[0] }).reduce((accum, curr) => { return accum * (curr ? curr.p : Math.exp(-EDIT_DISTANCE_COST_SCALE)) }, 1); @@ -551,11 +552,11 @@ describe('predictFromCorrectionSequence', () => { const expected_predictions: Suggestion[] = [ { transform: { - insert: 'golden', + insert: 'g', deleteLeft: 0, id: transitionID }, - displayAs: 'golden', + displayAs: 'g', transformId: transitionID }, { transform: { @@ -589,7 +590,7 @@ describe('predictFromCorrectionSequence', () => { predictions.forEach((entry) => assert.equal(entry.metadata.probabilities.correction, parameters.tokens.reduce((accum, curr) => accum * curr.correction.p, 1))); predictions.sort(tupleDisplayOrderSort); - assert.deepEqual(predictions[0].components.map((c) => c.prediction.transform.insert), ['golden', ' ', 'apple']); + assert.deepEqual(predictions[0].components.map((c) => c.prediction.transform.insert), ['g', ' ', 'apple']); assert.sameDeepOrderedMembers(predictions[0].components.map((entry) => entry.prediction), expected_predictions); assert.approximately(predictions[0].metadata.probabilities.prediction, expected_prediction_p, 0.00001);