src/lib/form/detectors/strategies/keyword-classifier.ts

Total Symbols
5
Lines of Code
153
Avg Complexity
2.8
Avg Coverage
100.0%

File Relationships

graph LR matchesRule["matchesRule"] escapeRegex["escapeRegex"] detect["detect"] normalize["normalize"] matchesRule -->|calls| escapeRegex detect -->|calls| normalize detect -->|calls| matchesRule click matchesRule "../symbols/1a959a10048b5e94.html" click escapeRegex "../symbols/f946f576c045998d.html" click detect "../symbols/3ad5e2f09986771f.html" click normalize "../symbols/87c87498f93bf3ff.html"

Symbols by Kind

function 3
interface 1
method 1

All Symbols

Name Kind Visibility Status Lines Signature
normalize function - 26-34 normalize(text: string): string
escapeRegex function - 36-38 escapeRegex(s: string): string
KeywordRule interface - 42-52 interface KeywordRule
matchesRule function - 54-64 matchesRule(normalized: string, rule: KeywordRule): boolean
detect method - 138-151 detect(field: FormField): ClassifierResult | null

Full Source

/**
 * Keyword-based field classifier for Brazilian/Portuguese forms.
 *
 * Runs BEFORE the TensorFlow classifier to handle common Portuguese patterns
 * that the ML model may under-score due to language/training-data bias.
 *
 * Rules are evaluated in order — more specific rules must come first to
 * avoid short patterns (e.g. "cpf") shadowing compound ones ("cpfcnpj").
 *
 * Accuracy note: substring matching is used for long patterns (≥4 chars) and
 * whole-word matching for short codes (currently "obs"; see KEYWORD_RULES).
 */

import type { FormField, FieldType } from "@/types";
import type { FieldClassifier, ClassifierResult } from "../pipeline";

// ── Normalisation ─────────────────────────────────────────────────────────────

/**
 * Prepares a raw signals string for keyword matching.
 *
 * Steps, in order: lowercase, decompose to NFD and drop combining marks
 * (ã → a, é → e, …), turn common separators (*, -, _, ., /, \, |) into
 * spaces, then squeeze repeated whitespace and trim the ends.
 */
function normalize(text: string): string {
  const lowered = text.toLowerCase().normalize("NFD");
  const noAccents = lowered.replace(/[\u0300-\u036f]/g, "");
  const spaced = noAccents.replace(/[*\-_./\\|]/g, " ");
  return spaced.replace(/\s+/g, " ").trim();
}

/** Escapes every regex metacharacter in `s` so it matches literally. */
function escapeRegex(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, (ch) => `\\${ch}`);
}

// ── Rule types ────────────────────────────────────────────────────────────────

interface KeywordRule {
  /**
   * Patterns to search for. Each must already be in normalized form
   * (lowercase, no diacritics, space-separated) to match normalize() output.
   */
  patterns: string[];
  /** FieldType to assign when any pattern matches */
  type: FieldType;
  /**
   * When true, each pattern must appear as a complete word (not a substring
   * of another word). Use for short codes like "rg", "obs", "cep".
   */
  wholeWord?: boolean;
}

/**
 * Tests whether `normalized` (output of normalize()) satisfies `rule`.
 *
 * Plain rules match on substring containment; wholeWord rules require the
 * pattern to stand alone, i.e. not be flanked by an ASCII letter or digit
 * on either side.
 */
function matchesRule(normalized: string, rule: KeywordRule): boolean {
  const { patterns, wholeWord } = rule;
  if (!wholeWord) {
    return patterns.some((pattern) => normalized.includes(pattern));
  }
  return patterns.some((pattern) => {
    const boundary = new RegExp(
      `(?<![a-z0-9])${escapeRegex(pattern)}(?![a-z0-9])`,
    );
    return boundary.test(normalized);
  });
}

// Ordered rule list: evaluated top-to-bottom by detect(); first match wins.
const KEYWORD_RULES: KeywordRule[] = [
  // ── Termos / consent labels (MUST precede any email/cidade-style rules) ────
  // Consent labels often embed words like "e-mail" or "cidade" even though
  // the underlying control is a checkbox. Matching them first and returning
  // type "checkbox" keeps later substring rules from mislabeling them and
  // lets the filler handle them as checkboxes.
  {
    patterns: [
      "aceito os termos",
      "aceito o termos",
      "li e aceito",
      "concordo com os termos",
      "termos de uso",
      "termos e condicoes",
      "politica de privacidade",
      "aceito a politica",
      "terms of service",
      "terms and conditions",
      "privacy policy",
      "i agree",
      "i accept",
    ],
    type: "checkbox",
  },

  // ── Text / description area patterns ───────────────────────────────────────
  // Substring match for long patterns (≥ 4 chars). These describe free-text
  // fields in Portuguese/English forms and should resolve to "text".
  {
    patterns: [
      "observacao",
      "observacoes",
      "descricao",
      "mensagem",
      "message",
      "comentario",
      "comentarios",
      "anotacao",
      "anotacoes",
      "notas",
      "sugestao",
      "sugestoes",
      "feedback",
      "detalhe",
      "detalhes",
      "historico",
    ],
    type: "text",
  },

  // ── "obs" short-code — whole-word only ─────────────────────────────────────
  // Must be a complete word so that "obs" inside a longer word (e.g.
  // "observar") is NOT matched. normalize() converts "obs-campo" /
  // "obs_campo" to "obs campo", so the word-boundary check works correctly.
  {
    patterns: ["obs"],
    type: "text",
    wholeWord: true,
  },
];

// ── Classifier ────────────────────────────────────────────────────────────────

/**
 * Keyword-based field classifier.
 * Returns a result with confidence=1.0 when a known Portuguese/Brazilian pattern
 * is matched, otherwise returns null to let the next classifier try.
 */
export const keywordClassifier: FieldClassifier = {
  name: "keyword",

  detect(field: FormField): ClassifierResult | null {
    const raw = field.contextSignals ?? "";
    if (!raw.trim()) return null;

    const normalized = normalize(raw);

    for (const rule of KEYWORD_RULES) {
      if (matchesRule(normalized, rule)) {
        return { type: rule.type, confidence: 1.0 };
      }
    }

    return null;
  },
};