src/lib/shared/structured-signals.ts

Total Symbols
14
Lines of Code
244
Avg Complexity
4.0
Avg Coverage
98.5%

File Relationships

graph LR
  normalizeAndDedupe["normalizeAndDedupe"]
  normalizeToken["normalizeToken"]
  normalizeDomFeatures["normalizeDomFeatures"]
  buildMetadataTokens["buildMetadataTokens"]
  normalizeStructuredSignals["normalizeStructuredSignals"]
  inferLanguageFromSignals["inferLanguageFromSignals"]
  buildFeatureText["buildFeatureText"]
  repeatTokens["repeatTokens"]
  structuredSignalsFromField["structuredSignalsFromField"]
  normalizeAndDedupe -->|calls| normalizeToken
  normalizeDomFeatures -->|calls| normalizeToken
  buildMetadataTokens -->|calls| normalizeDomFeatures
  normalizeStructuredSignals -->|calls| normalizeAndDedupe
  inferLanguageFromSignals -->|calls| normalizeToken
  buildFeatureText -->|calls| normalizeStructuredSignals
  buildFeatureText -->|calls| repeatTokens
  buildFeatureText -->|calls| buildMetadataTokens
  buildFeatureText -->|calls| normalizeToken
  structuredSignalsFromField -->|calls| normalizeAndDedupe
  structuredSignalsFromField -->|calls| normalizeToken
  click normalizeAndDedupe "../symbols/1c7d0c2fccdeffca.html"
  click normalizeToken "../symbols/acab84a19ef8c65c.html"
  click normalizeDomFeatures "../symbols/d69f9e503064213a.html"
  click buildMetadataTokens "../symbols/b53293dbab8d60b7.html"
  click normalizeStructuredSignals "../symbols/a249025c87ece06b.html"
  click inferLanguageFromSignals "../symbols/e08bf6c443fa8af7.html"
  click buildFeatureText "../symbols/3eec77682b581a7e.html"
  click repeatTokens "../symbols/cf610726ab412844.html"
  click structuredSignalsFromField "../symbols/ab699edf98df2b23.html"

Architecture violations

View all

  • [warning] max-cyclomatic-complexity: 'structuredSignalsFromField' has cyclomatic complexity 15 (max 10)

Symbols by Kind

function 11
interface 3

All Symbols

Name Kind Visibility Status Lines Signature
StructuredSignals interface exported - 14-21 interface StructuredSignals
StructuredSignalContext interface exported - 24-28 interface StructuredSignalContext
BuildFeatureTextOptions interface exported - 31-38 interface BuildFeatureTextOptions
normalizeToken function - 49-57 normalizeToken(value: string): string
normalizeAndDedupe function - 59-71 normalizeAndDedupe(values: string[]): string[]
repeatTokens function - 73-78 repeatTokens(values: string[], weight: number): string[]
normalizeDomFeatures function - 80-94 normalizeDomFeatures(domFeatures?: DomFeatureHints): DomFeatureHints
buildMetadataTokens function - 96-118 buildMetadataTokens(context?: StructuredSignalContext): string[]
normalizeStructuredSignals function exported - 121-129 normalizeStructuredSignals(signals: StructuredSignals): StructuredSignals
fromFlatSignals function exported - 132-138 fromFlatSignals(signals: string): StructuredSignals
inferLanguageFromSignals function exported - 141-150 inferLanguageFromSignals(signals: string): TrainingLanguage
inferCategoryFromType function exported - 153-161 inferCategoryFromType(type: FieldType): FieldCategory
buildFeatureText function exported - 167-190 buildFeatureText(signals: StructuredSignals, context?: StructuredSignalContext, options: BuildFeatureTextOptions = DEFAULT_BUILD_OPTIONS): string
structuredSignalsFromField function exported - 196-243 structuredSignalsFromField(field: Partial<FormField>): { signals: StructuredSignals; context: StructuredSignalContext; }

Full Source

import type {
  DomFeatureHints,
  FieldCategory,
  FieldType,
  FormField,
  TrainingLanguage,
} from "@/types";
import { FIELD_TYPES_BY_CATEGORY } from "@/types";

/**
 * Structured signal layers extracted from a form field.
 * Used as input for feature text building and classifier training.
 *
 * Each layer is a list of tokens. `buildFeatureText` repeats a layer's tokens
 * according to that layer's weight, so a higher-weighted layer contributes
 * more to the resulting text.
 */
export interface StructuredSignals {
  /**
   * High-relevance tokens. `structuredSignalsFromField` fills this layer from
   * the field's label, name, id, placeholder, and context signals.
   */
  primary: string[];
  /** Medium-relevance tokens: autocomplete attribute */
  secondary: string[];
  /** Low-relevance tokens: input type, required, pattern, maxlength */
  structural: string[];
}

/**
 * Optional context about the field's category, language, and DOM features.
 * `buildFeatureText` turns these values into metadata tokens when its
 * `includeMetadata` option is enabled.
 */
export interface StructuredSignalContext {
  /** Field category; the value `"unknown"` produces no metadata token. */
  category?: FieldCategory;
  /** Language associated with the field; omitted when not known. */
  language?: TrainingLanguage;
  /** Input type, max length, and pattern hints for the field. */
  domFeatures?: DomFeatureHints;
}

/**
 * Options for controlling how feature text is built from structured signals.
 * Every option is optional; omitted options fall back to
 * `DEFAULT_BUILD_OPTIONS`. Primary tokens have no include flag and are always
 * considered.
 */
export interface BuildFeatureTextOptions {
  /** Whether tokens from the secondary layer are added to the text. */
  includeSecondary?: boolean;
  /** Whether tokens from the structural layer are added to the text. */
  includeStructural?: boolean;
  /** Whether metadata tokens derived from the context are added to the text. */
  includeMetadata?: boolean;
  /** How many times the primary layer is repeated; a weight <= 0 emits none. */
  primaryWeight?: number;
  /** How many times the secondary layer is repeated; a weight <= 0 emits none. */
  secondaryWeight?: number;
  /** How many times the structural layer is repeated; a weight <= 0 emits none. */
  structuralWeight?: number;
}

/**
 * Defaults used by `buildFeatureText`: every layer and the metadata tokens are
 * included, and layers are weighted primary (3) > secondary (2) > structural (1).
 */
const DEFAULT_BUILD_OPTIONS: Required<BuildFeatureTextOptions> = {
  includeSecondary: true,
  includeStructural: true,
  includeMetadata: true,
  primaryWeight: 3,
  secondaryWeight: 2,
  structuralWeight: 1,
};

/**
 * Canonicalizes free text into a lowercase, accent-free, space-separated form.
 *
 * @param value Raw text to canonicalize.
 * @returns The text lowercased, with combining diacritics removed, every run
 *   of characters other than `a-z`, `0-9`, or whitespace replaced by a space,
 *   whitespace runs collapsed to one space, and both ends trimmed. May be `""`.
 */
function normalizeToken(value: string): string {
  const lowered = value.toLowerCase();

  // NFD splits accented letters into base letter + combining mark, so the
  // marks (U+0300-U+036F) can be removed on their own.
  const withoutAccents = lowered
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "");

  // Anything that is not a plain letter, digit, or whitespace becomes a separator.
  const separated = withoutAccents.replace(/[^a-z0-9\s]+/g, " ");

  const collapsed = separated.replace(/\s+/g, " ");
  return collapsed.trim();
}

/**
 * Normalizes every value and keeps the distinct, non-empty results.
 *
 * @param values Raw strings to normalize.
 * @returns Normalized tokens in first-seen order, without duplicates or
 *   empty strings.
 */
function normalizeAndDedupe(values: string[]): string[] {
  const unique = new Set<string>();

  for (const raw of values) {
    const normalized = normalizeToken(raw);
    // Values that normalize to "" carry no signal and are dropped.
    if (normalized) unique.add(normalized);
  }

  // A Set iterates in insertion order, so first occurrences keep their position.
  return Array.from(unique);
}

/**
 * Repeats a token list back-to-back so it carries more weight in the text.
 *
 * @param values Tokens to repeat.
 * @param weight Number of copies to emit. A fractional weight is effectively
 *   rounded up, and a weight <= 0 emits nothing.
 * @returns A new array containing `values` repeated; `values` is not mutated.
 */
function repeatTokens(values: string[], weight: number): string[] {
  const repeated: string[] = [];

  if (values.length > 0) {
    // The loop condition also covers non-positive weights: it never runs.
    for (let pass = 0; pass < weight; pass += 1) {
      repeated.push(...values);
    }
  }

  return repeated;
}

/**
 * Produces a cleaned copy of the DOM feature hints.
 *
 * @param domFeatures Hints to clean; may be omitted.
 * @returns `{}` when no hints are given. Otherwise an object whose `inputType`
 *   and `pattern` are normalized text (or `undefined` when empty or missing)
 *   and whose `maxLength` is kept only when it is a number greater than zero.
 */
function normalizeDomFeatures(domFeatures?: DomFeatureHints): DomFeatureHints {
  if (!domFeatures) return {};

  const { inputType, maxLength, pattern } = domFeatures;
  // Zero, negative, NaN, and non-numeric lengths are treated as "no limit".
  const hasPositiveMaxLength = typeof maxLength === "number" && maxLength > 0;

  return {
    inputType: inputType ? normalizeToken(inputType) : undefined,
    maxLength: hasPositiveMaxLength ? maxLength : undefined,
    pattern: pattern ? normalizeToken(pattern) : undefined,
  };
}

/**
 * Builds the metadata tokens that describe a field's context.
 *
 * Emitted tokens, in order:
 * - `__cat_<category>` when a category other than `"unknown"` is set
 * - `__lang_<language>` when a language is set
 * - `__input_<inputType>` when the normalized input type is non-empty
 * - `__has_pattern` when the field declares a non-blank pattern
 * - `__maxlen_tiny` (<= 4), `__maxlen_short` (<= 14), or `__maxlen_long`
 *   when a positive max length is set
 *
 * @param context Optional field context.
 * @returns The metadata tokens, or an empty array when there is no context.
 */
function buildMetadataTokens(context?: StructuredSignalContext): string[] {
  if (!context) return [];
  const tokens: string[] = [];

  if (context.category && context.category !== "unknown") {
    tokens.push(`__cat_${context.category}`);
  }

  if (context.language) {
    tokens.push(`__lang_${context.language}`);
  }

  const dom = normalizeDomFeatures(context.domFeatures);
  if (dom.inputType) tokens.push(`__input_${dom.inputType}`);

  // `__has_pattern` is a presence flag, so test the raw attribute. The
  // normalized pattern is "" for patterns made only of punctuation (such as
  // ".+" or "[^<>]+"), which would wrongly hide that a pattern exists.
  if (context.domFeatures?.pattern?.trim()) tokens.push("__has_pattern");

  if (dom.maxLength !== undefined) {
    const bucket =
      dom.maxLength <= 4 ? "tiny" : dom.maxLength <= 14 ? "short" : "long";
    tokens.push(`__maxlen_${bucket}`);
  }

  return tokens;
}

/**
 * Returns a copy of the signals with every layer normalized (lowercased,
 * accents stripped) and de-duplicated. Duplicates are removed within a layer
 * only; the same token may still appear in more than one layer.
 */
export function normalizeStructuredSignals(
  signals: StructuredSignals,
): StructuredSignals {
  const { primary, secondary, structural } = signals;

  const cleanedPrimary = normalizeAndDedupe(primary);
  const cleanedSecondary = normalizeAndDedupe(secondary);
  const cleanedStructural = normalizeAndDedupe(structural);

  return {
    primary: cleanedPrimary,
    secondary: cleanedSecondary,
    structural: cleanedStructural,
  };
}

/**
 * Wraps a flat signal string into a {@link StructuredSignals} whose primary
 * layer holds that string as its single entry. An empty string yields three
 * empty layers. The string is stored as-is, without normalization.
 */
export function fromFlatSignals(signals: string): StructuredSignals {
  const primary: string[] = [];
  if (signals) primary.push(signals);

  return { primary, secondary: [], structural: [] };
}

/**
 * Heuristically infers the language (`pt`, `en`, `es`) from signal text.
 *
 * The text is normalized first, then checked for whole-word Spanish markers,
 * then whole-word English markers. Spanish wins when both are present, and
 * `pt` is the fallback when neither list matches.
 */
export function inferLanguageFromSignals(signals: string): TrainingLanguage {
  const text = normalizeToken(signals);

  const looksSpanish =
    /\b(el|la|correo|telefono|direccion|apellido)\b/.test(text);
  if (looksSpanish) return "es";

  const looksEnglish =
    /\b(the|your|email|phone|address|name|zip|state)\b/.test(text);
  return looksEnglish ? "en" : "pt";
}

/**
 * Maps a {@link FieldType} to its {@link FieldCategory} by looking it up in
 * `FIELD_TYPES_BY_CATEGORY`. The first category whose type list contains the
 * type wins; `"unknown"` is returned when no category lists it.
 */
export function inferCategoryFromType(type: FieldType): FieldCategory {
  const entries = Object.entries(FIELD_TYPES_BY_CATEGORY) as Array<
    [FieldCategory, FieldType[]]
  >;

  const match = entries.find(([, types]) => types.includes(type));
  return match ? match[0] : "unknown";
}

/**
 * Builds a weighted, normalized feature text from structured signals.
 * Tokens are repeated proportionally to their layer's weight.
 *
 * Layers are emitted in the order primary, secondary, structural, metadata.
 * Primary tokens are always considered; the other layers follow their
 * `include*` flags. A weight <= 0 makes its layer emit nothing.
 *
 * Note: the joined text goes through a final normalization pass that replaces
 * every character other than `a-z`, `0-9`, or whitespace with a space, so a
 * metadata token such as `__lang_pt` appears as `lang pt` in the result.
 *
 * @param signals Signal layers; they are normalized and de-duplicated first.
 * @param context Optional context used to derive metadata tokens.
 * @param options Overrides for `DEFAULT_BUILD_OPTIONS`. Options that are
 *   omitted or explicitly `undefined` fall back to their defaults.
 * @returns A single space-separated string, or `""` when no tokens remain.
 */
export function buildFeatureText(
  signals: StructuredSignals,
  context?: StructuredSignalContext,
  options: BuildFeatureTextOptions = DEFAULT_BUILD_OPTIONS,
): string {
  // Resolve each option with `??` instead of an object spread: a spread copies
  // explicit `undefined` values over the defaults, which would silently drop a
  // layer (e.g. `{ primaryWeight: undefined }` removed all primary tokens).
  const cfg: Required<BuildFeatureTextOptions> = {
    includeSecondary:
      options.includeSecondary ?? DEFAULT_BUILD_OPTIONS.includeSecondary,
    includeStructural:
      options.includeStructural ?? DEFAULT_BUILD_OPTIONS.includeStructural,
    includeMetadata:
      options.includeMetadata ?? DEFAULT_BUILD_OPTIONS.includeMetadata,
    primaryWeight: options.primaryWeight ?? DEFAULT_BUILD_OPTIONS.primaryWeight,
    secondaryWeight:
      options.secondaryWeight ?? DEFAULT_BUILD_OPTIONS.secondaryWeight,
    structuralWeight:
      options.structuralWeight ?? DEFAULT_BUILD_OPTIONS.structuralWeight,
  };
  const normalized = normalizeStructuredSignals(signals);

  const tokens: string[] = [];
  tokens.push(...repeatTokens(normalized.primary, cfg.primaryWeight));

  if (cfg.includeSecondary) {
    tokens.push(...repeatTokens(normalized.secondary, cfg.secondaryWeight));
  }
  if (cfg.includeStructural) {
    tokens.push(...repeatTokens(normalized.structural, cfg.structuralWeight));
  }

  if (cfg.includeMetadata) {
    tokens.push(...buildMetadataTokens(context));
  }

  return normalizeToken(tokens.join(" "));
}

/** Collects the raw (not yet normalized) signal layers from a field's attributes. */
function collectRawFieldSignals(field: Partial<FormField>): StructuredSignals {
  // Same rule as `normalizeDomFeatures`: only a positive number is a real
  // limit. Without the range check, -1 produced "maxlength_-1", which
  // normalizes to "maxlength 1" and collides with a genuine limit of 1.
  const hasPositiveMaxLength =
    typeof field.maxLength === "number" && field.maxLength > 0;

  return {
    primary: [
      field.label ?? "",
      field.name ?? "",
      field.id ?? "",
      field.placeholder ?? "",
      field.contextSignals ?? "",
    ],
    secondary: [field.autocomplete ?? ""],
    structural: [
      field.inputType ?? "",
      field.required ? "required" : "",
      field.pattern ?? "",
      hasPositiveMaxLength ? `maxlength_${field.maxLength}` : "",
    ],
  };
}

/** Builds the signal context (category, language, DOM hints) for a field. */
function buildFieldContext(
  field: Partial<FormField>,
): StructuredSignalContext {
  return {
    category: field.category,
    // An "unknown" detection result is treated the same as no language.
    language:
      field.languageDetected && field.languageDetected !== "unknown"
        ? field.languageDetected
        : undefined,
    domFeatures: {
      inputType: field.inputType,
      maxLength: field.maxLength,
      pattern: field.pattern,
    },
  };
}

/**
 * Extracts {@link StructuredSignals} and {@link StructuredSignalContext}
 * from a partial `FormField` object.
 *
 * Missing attributes contribute nothing. Every returned layer is normalized
 * and de-duplicated, and never contains an empty token. The previous fallback
 * that re-added `contextSignals` could only ever append `""`, because
 * `contextSignals` is already part of the primary layer, so it was removed.
 *
 * @param field Field to read; any subset of its properties may be present.
 * @returns `signals` with normalized layers, and `context` carrying the
 *   field's category, its detected language (unless `"unknown"`), and its raw
 *   DOM hints.
 */
export function structuredSignalsFromField(field: Partial<FormField>): {
  signals: StructuredSignals;
  context: StructuredSignalContext;
} {
  return {
    signals: normalizeStructuredSignals(collectRawFieldSignals(field)),
    context: buildFieldContext(field),
  };
}