src/lib/dataset/training-data.ts

Total Symbols
11
Lines of Code
132
Avg Complexity
1.5
Avg Coverage
100.0%

Symbols by Kind

function 10
interface 1

All Symbols

Name Kind Visibility Status Lines Signature
FlattenSignalsOptions interface exported - 18-25 interface FlattenSignalsOptions
normalizeTrainingSample function - 47-53 normalizeTrainingSample(sample: TrainingSample): TrainingSample
normalizeStructuredSignals function exported - 60-64 normalizeStructuredSignals(signals: StructuredSignals): StructuredSignals
flattenStructuredSignals function exported - 67-72 flattenStructuredSignals(signals: StructuredSignals, options: FlattenSignalsOptions = DEFAULT_FLATTEN_OPTIONS): string
toTrainingSignalText function exported - 81-87 toTrainingSignalText(sample: TrainingSample): string
getTrainingSamplesByDifficulty function exported - 90-94 getTrainingSamplesByDifficulty(difficulty: TrainingDifficulty): TrainingSample[]
getTrainingSamplesByType function exported - 97-99 getTrainingSamplesByType(type: FieldType): TrainingSample[]
getTrainingDistribution function exported - 102-108 getTrainingDistribution(): Record<string, number>
getTrainingV2ByDifficulty function exported - 111-117 getTrainingV2ByDifficulty(difficulty: TrainingDifficulty): TrainingSample[]
getTrainingV2ByType function exported - 120-122 getTrainingV2ByType(type: FieldType): TrainingSample[]
getTrainingV2Distribution function exported - 125-131 getTrainingV2Distribution(): Record<string, number>

Full Source

import type {
  FieldCategory,
  FieldType,
  TrainingDifficulty,
  TrainingSample,
} from "@/types";
import { FIELD_TYPES_BY_CATEGORY } from "@/types";
import {
  buildFeatureText,
  normalizeStructuredSignals as normalizeSignalsShared,
  type StructuredSignals,
} from "@/lib/shared/structured-signals";
import { ALL_TRAINING_SAMPLES } from "./data";

export type { StructuredSignals } from "@/lib/shared/structured-signals";

/**
 * Options for controlling how structured signals are flattened to text.
 *
 * Every field is optional. This module does not interpret the fields itself:
 * `flattenStructuredSignals` hands the object to `buildFeatureText` unchanged,
 * so the exact effect of each field is defined by that shared helper.
 */
export interface FlattenSignalsOptions {
  // NOTE(review): the three `include*` flags presumably switch whole signal
  // groups on or off in the output — confirm against buildFeatureText.
  includeSecondary?: boolean;
  includeStructural?: boolean;
  includeMetadata?: boolean;
  // NOTE(review): the three `*Weight` values presumably scale how strongly
  // each signal group is represented — confirm against buildFeatureText.
  primaryWeight?: number;
  secondaryWeight?: number;
  structuralWeight?: number;
}

/**
 * Option set used by `flattenStructuredSignals` when the caller omits the
 * `options` argument.
 *
 * Typed as `Required<FlattenSignalsOptions>` so the compiler rejects this
 * literal if any option is left without a value.
 */
const DEFAULT_FLATTEN_OPTIONS: Required<FlattenSignalsOptions> = {
  includeSecondary: true,
  includeStructural: true,
  includeMetadata: false,
  primaryWeight: 1,
  secondaryWeight: 1,
  structuralWeight: 1,
};

/**
 * Reverse lookup from field type to the category that lists it, derived from
 * `FIELD_TYPES_BY_CATEGORY` once at module load.
 *
 * Declared `Partial` because a field type that appears under no category has
 * no entry. If a type is listed under several categories, the category
 * iterated last wins.
 */
const CATEGORY_BY_TYPE: Partial<Record<FieldType, FieldCategory>> = {};
for (const [categoryName, fieldTypes] of Object.entries(
  FIELD_TYPES_BY_CATEGORY,
)) {
  for (const fieldType of fieldTypes) {
    // Object.entries widens keys to `string`; narrow back to the category type.
    CATEGORY_BY_TYPE[fieldType] = categoryName as FieldCategory;
  }
}

/**
 * Builds a shallow copy of `sample` with normalized signals and a resolved
 * category.
 *
 * A non-nullish `sample.category` is kept as is. Otherwise the category is
 * looked up from `sample.type` in `CATEGORY_BY_TYPE`, falling back to
 * `"unknown"` when the type has no entry.
 *
 * @param sample - Raw training sample.
 * @returns A new object; all other properties are carried over from `sample`.
 */
function normalizeTrainingSample(sample: TrainingSample): TrainingSample {
  const signals = normalizeStructuredSignals(sample.signals);
  const category =
    sample.category ?? CATEGORY_BY_TYPE[sample.type] ?? "unknown";

  return { ...sample, signals, category };
}

/**
 * Every entry of `ALL_TRAINING_SAMPLES` after it has been passed through
 * `normalizeTrainingSample`. Computed once, when this module is first loaded.
 */
const BUILTIN_TRAINING_SAMPLES: TrainingSample[] = ALL_TRAINING_SAMPLES.map(
  (rawSample) => normalizeTrainingSample(rawSample),
);

/**
 * Normalizes structured signals by delegating to the shared implementation
 * imported from `@/lib/shared/structured-signals`.
 *
 * Kept as a hoisted `function` declaration on purpose: it is invoked while the
 * built-in sample list is being initialized earlier in this module.
 *
 * @param signals - Signals to normalize.
 * @returns The result produced by the shared normalizer.
 */
export function normalizeStructuredSignals(
  signals: StructuredSignals,
): StructuredSignals {
  const normalized = normalizeSignalsShared(signals);
  return normalized;
}

/**
 * Flattens structured signals into a single feature-text string.
 *
 * Calls `buildFeatureText` with the signals, no second argument, and the
 * given options.
 *
 * NOTE(review): a partially filled `options` object is forwarded as is and is
 * not merged with `DEFAULT_FLATTEN_OPTIONS` here — confirm that
 * `buildFeatureText` supplies values for any omitted fields.
 *
 * @param signals - Signals to flatten.
 * @param options - Flattening options; `DEFAULT_FLATTEN_OPTIONS` is used only
 *   when the argument is omitted or `undefined`.
 * @returns The text produced by `buildFeatureText`.
 */
export function flattenStructuredSignals(
  signals: StructuredSignals,
  options: FlattenSignalsOptions = DEFAULT_FLATTEN_OPTIONS,
): string {
  // The middle argument of buildFeatureText is intentionally left undefined.
  const noSampleContext = undefined;
  const featureText = buildFeatureText(signals, noSampleContext, options);
  return featureText;
}

/**
 * All built-in training samples with structured signals (V2 format).
 *
 * This is the module's normalized built-in sample array itself, not a copy.
 */
export const TRAINING_SAMPLES_V2: TrainingSample[] = BUILTIN_TRAINING_SAMPLES;

/**
 * Alias for {@link TRAINING_SAMPLES_V2} — used by classifiers and dataset tools.
 *
 * Both exports reference one and the same array, so an in-place mutation made
 * through either name is visible through the other.
 */
export const TRAINING_SAMPLES: TrainingSample[] = TRAINING_SAMPLES_V2;

/**
 * Converts a training sample into a flat feature-text string for the classifier.
 *
 * The sample's signals are passed to `buildFeatureText` together with its
 * `category`, `language` and `domFeatures`; no flattening options are supplied.
 *
 * @param sample - Training sample to convert.
 * @returns The text produced by `buildFeatureText`.
 */
export function toTrainingSignalText(sample: TrainingSample): string {
  const { signals, category, language, domFeatures } = sample;
  const sampleContext = { category, language, domFeatures };
  return buildFeatureText(signals, sampleContext);
}

/**
 * Selects the training samples whose `difficulty` strictly equals the
 * requested level.
 *
 * @param difficulty - Difficulty level to match.
 * @returns A new array; its elements are the same sample objects held in
 *   `TRAINING_SAMPLES`, in their original order.
 */
export function getTrainingSamplesByDifficulty(
  difficulty: TrainingDifficulty,
): TrainingSample[] {
  const hasRequestedDifficulty = (candidate: TrainingSample): boolean =>
    candidate.difficulty === difficulty;

  return TRAINING_SAMPLES.filter(hasRequestedDifficulty);
}

/**
 * Selects the training samples whose `type` strictly equals the requested
 * field type.
 *
 * @param type - Field type to match.
 * @returns A new array; its elements are the same sample objects held in
 *   `TRAINING_SAMPLES`, in their original order.
 */
export function getTrainingSamplesByType(type: FieldType): TrainingSample[] {
  const hasRequestedType = (candidate: TrainingSample): boolean =>
    candidate.type === type;

  return TRAINING_SAMPLES.filter(hasRequestedType);
}

/**
 * Counts the training samples in `TRAINING_SAMPLES` per field type.
 *
 * @returns A fresh object mapping each field type that occurs at least once
 *   to its number of samples; types with no samples have no key.
 */
export function getTrainingDistribution(): Record<string, number> {
  const countsByType: Record<string, number> = {};

  for (const { type } of TRAINING_SAMPLES) {
    // A type seen for the first time has no entry yet, so start from zero.
    const seenSoFar = countsByType[type] || 0;
    countsByType[type] = seenSoFar + 1;
  }

  return countsByType;
}

/**
 * Selects the V2 training samples whose `difficulty` strictly equals the
 * requested level.
 *
 * @param difficulty - Difficulty level to match.
 * @returns A new array; its elements are the same sample objects held in
 *   `TRAINING_SAMPLES_V2`, in their original order.
 */
export function getTrainingV2ByDifficulty(
  difficulty: TrainingDifficulty,
): TrainingSample[] {
  const matching = TRAINING_SAMPLES_V2.filter((entry) => {
    return entry.difficulty === difficulty;
  });
  return matching;
}

/**
 * Selects the V2 training samples whose `type` strictly equals the requested
 * field type.
 *
 * @param type - Field type to match.
 * @returns A new array; its elements are the same sample objects held in
 *   `TRAINING_SAMPLES_V2`, in their original order.
 */
export function getTrainingV2ByType(type: FieldType): TrainingSample[] {
  const matching = TRAINING_SAMPLES_V2.filter((entry) => {
    return entry.type === type;
  });
  return matching;
}

/**
 * Counts the V2 training samples in `TRAINING_SAMPLES_V2` per field type.
 *
 * @returns A fresh object mapping each field type that occurs at least once
 *   to its number of samples; types with no samples have no key.
 */
export function getTrainingV2Distribution(): Record<string, number> {
  const tally: Record<string, number> = {};

  for (const entry of TRAINING_SAMPLES_V2) {
    const fieldType = entry.type;
    // Missing entries read as undefined, which `||` turns into a zero start.
    const previous = tally[fieldType] || 0;
    tally[fieldType] = previous + 1;
  }

  return tally;
}