checkDatasetHealth function exported ✓ 100.0%

Last updated: 2026-02-24T21:07:57.579Z

Metrics

LOC: 33 Complexity: 5 Params: 1 Coverage: 100.0% (14/14 lines, 2x executed)

Signature

checkDatasetHealth(minSamplesPerType = 3): DatasetHealthReport

Summary

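Run health checks on the combined dataset: report types whose sample count is below minSamplesPerType, known types with no samples at all, and validation or test signals that exactly match a training signal. Call this during development to find gaps or data leakage.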

Source Code

/**
 * Builds a health report for the combined dataset.
 *
 * The report contains:
 * - `totalSamples`: the summed lengths of the training, validation and test
 *   sample lists.
 * - `typeCounts`: the distribution returned by `getTrainingDistribution()`.
 * - `underrepresentedTypes`: types whose count is strictly below
 *   `minSamplesPerType`.
 * - `missingTypes`: entries of `KNOWN_TYPES` that do not appear as a key in
 *   the distribution.
 * - `leakedSignals` / `hasLeakage`: validation and test `signals` values that
 *   exactly match the signal text of some training sample. Validation matches
 *   are listed first, then test matches. A signal is listed once per matching
 *   sample, so the list may contain duplicates.
 *
 * Overlap between the validation and test splits themselves is not checked.
 *
 * @param minSamplesPerType - Minimum count a type needs in order not to be
 *   reported as underrepresented. Defaults to 3.
 * @returns The assembled dataset health report.
 */
export function checkDatasetHealth(minSamplesPerType = 3): DatasetHealthReport {
  const typeCounts = getTrainingDistribution();

  // Types that are present but fall short of the requested minimum.
  const underrepresentedTypes: string[] = [];
  for (const [typeName, sampleCount] of Object.entries(typeCounts)) {
    if (sampleCount < minSamplesPerType) {
      underrepresentedTypes.push(typeName);
    }
  }

  // Known types that have no entry in the distribution at all.
  const seenTypes = new Set(Object.keys(typeCounts));
  const missingTypes = KNOWN_TYPES.filter(
    (knownType) => !seenTypes.has(knownType),
  );

  // Leakage check: exact match between a training signal text and the signal
  // of a validation or test sample.
  // NOTE(review): training samples are keyed via toTrainingSignalText(), while
  // validation/test samples are looked up by their raw `signals` value —
  // confirm both sides end up in the same textual form.
  const trainingTexts = TRAINING_SAMPLES.map((sample) =>
    toTrainingSignalText(sample),
  );
  const trainingSignalSet = new Set(trainingTexts);

  const leakedSignals: string[] = [...VALIDATION_SAMPLES, ...TEST_SAMPLES]
    .map((heldOut) => heldOut.signals)
    .filter((signal) => trainingSignalSet.has(signal));

  const totalSamples =
    TRAINING_SAMPLES.length + VALIDATION_SAMPLES.length + TEST_SAMPLES.length;
  const hasLeakage = leakedSignals.length > 0;

  return {
    totalSamples,
    typeCounts,
    underrepresentedTypes,
    missingTypes,
    hasLeakage,
    leakedSignals,
  };
}

Dependencies (Outgoing)

graph LR checkDatasetHealth["checkDatasetHealth"] getTrainingDistribution["getTrainingDistribution"] toTrainingSignalText["toTrainingSignalText"] checkDatasetHealth -->|calls| getTrainingDistribution checkDatasetHealth -->|calls| toTrainingSignalText style checkDatasetHealth fill:#dbeafe,stroke:#2563eb,stroke-width:2px click checkDatasetHealth "3be8e84862588bff.html" click getTrainingDistribution "bcb2653a7652dea2.html" click toTrainingSignalText "c362e39aba47223c.html"

Impact (Incoming)

graph LR checkDatasetHealth["checkDatasetHealth"] buildKeywordsFromDictionary["buildKeywordsFromDictionary"] validateClassifier["validateClassifier"] testClassifier["testClassifier"] EvalMisclassified["EvalMisclassified"] buildKeywordsFromDictionary -->|uses| checkDatasetHealth validateClassifier -->|calls| checkDatasetHealth testClassifier -->|calls| checkDatasetHealth EvalMisclassified -->|uses| checkDatasetHealth style checkDatasetHealth fill:#dbeafe,stroke:#2563eb,stroke-width:2px click checkDatasetHealth "3be8e84862588bff.html" click buildKeywordsFromDictionary "dfb7d9bfef2aba38.html" click validateClassifier "86cd7e4340374af1.html" click testClassifier "172c1c69c62be40a.html" click EvalMisclassified "4e0d00ef54656ad2.html"