#!/usr/bin/env python3
"""
Validation script for new MIT-licensed dataset transformers.

Checks that all new transformer methods exist and are callable.

Run as a script: the process exits with status 0 when every check passes
and 1 otherwise.
"""

import sys
from pathlib import Path
from typing import Iterable, Mapping

# The script's own directory must be on sys.path *before* the project import
# below; inserting it after the import cannot influence that import.
sys.path.insert(0, str(Path(__file__).parent))

from warbler_cda.utils.hf_warbler_ingest import HFWarblerIngestor  # noqa: E402

# Public transformer entry points expected on the ingestor.
REQUIRED_TRANSFORMERS = [
    "transform_arxiv",
    "transform_prompt_report",
    "transform_novels",
    "transform_manuals",
    "transform_enterprise",
    "transform_portuguese_education",
]

# Private helper methods expected on the ingestor.
REQUIRED_HELPERS = [
    "_create_arxiv_content",
    "_create_prompt_report_content",
    "_create_novel_content",
    "_create_manual_content",
    "_create_enterprise_content",
    "_create_portuguese_content",
    "_chunk_text",
]

# Keys the sample document and its metadata are checked for.
REQUIRED_DOC_KEYS = ["content_id", "content", "metadata"]
REQUIRED_METADATA_KEYS = [
    "pack",
    "source_dataset",
    "license",
    "realm_type",
    "realm_label",
]


def _check_callables(target: object, names: Iterable[str]) -> bool:
    """Report, per name, whether *target* has a callable attribute of that name.

    Prints one status line per name and keeps going after a failure so the
    report is always complete.

    Returns:
        True if every name resolved to a callable attribute, else False.
    """
    ok = True
    for name in names:
        if not hasattr(target, name):
            print(f" ✗ {name} (missing)")
            ok = False
        elif not callable(getattr(target, name)):
            print(f" ✗ {name} (exists but not callable)")
            ok = False
        else:
            print(f" ✓ {name}")
    return ok


def _check_keys(mapping: Mapping, keys: Iterable[str], label: str) -> bool:
    """Report, per key, whether it is present in *mapping*.

    Args:
        mapping: The dictionary to inspect.
        keys: Keys that must be present.
        label: Noun used in the printed status lines (e.g. "Document").

    Returns:
        True if every key is present, else False.
    """
    ok = True
    for key in keys:
        if key in mapping:
            print(f" ✓ {label} has '{key}'")
        else:
            print(f" ✗ {label} missing '{key}'")
            ok = False
    return ok


def _check_chunk_text(ingestor: object) -> bool:
    """Call ``ingestor._chunk_text`` on a sample text and verify it yields strings.

    Returns:
        True if the call succeeded and every returned chunk is a ``str``;
        False if any chunk is not a string or the call raised.
    """
    # Broad catch is deliberate: this is a validation boundary, and any
    # failure inside the helper should be reported rather than crash the run.
    try:
        sample_text = "word " * 3000
        chunks = ingestor._chunk_text(sample_text, chunk_size=100)
        print(f" ✓ Successfully chunked text into {len(chunks)} chunks")
        if all(isinstance(chunk, str) for chunk in chunks):
            print(" ✓ All chunks are strings")
            return True
        print(" ✗ Some chunks are not strings")
        return False
    except Exception as e:
        print(f" ✗ _chunk_text failed: {e}")
        return False


def main() -> bool:
    """Run every validation step and print a human-readable report.

    Returns:
        True when all checks pass. False when any check fails, or
        immediately when ``HFWarblerIngestor`` cannot be instantiated.
    """
    print("\n" + "=" * 60)
    print("Validating New MIT-Licensed Dataset Transformers")
    print("=" * 60)

    try:
        ingestor = HFWarblerIngestor()
        print("✓ HFWarblerIngestor initialized successfully")
    except Exception as e:
        print(f"✗ Failed to initialize HFWarblerIngestor: {e}")
        return False

    # Collect each step's outcome instead of short-circuiting, so every
    # section is always executed and reported.
    results = []

    print("\n📋 Checking Transformer Methods:")
    results.append(_check_callables(ingestor, REQUIRED_TRANSFORMERS))

    print("\n🔧 Checking Helper Methods:")
    results.append(_check_callables(ingestor, REQUIRED_HELPERS))

    print("\n📊 Testing Basic Transformer Structure:")
    # NOTE(review): this sample is hard-coded rather than produced by a
    # transformer, so the key checks below only validate the literal itself.
    test_doc = {
        "content_id": "test/001",
        "content": "Test content for validation",
        "metadata": {
            "pack": "test-pack",
            "source_dataset": "test/dataset",
            "license": "MIT",
            "realm_type": "test",
            "realm_label": "test",
            "lifecycle_stage": "emergence",
            "activity_level": 0.5,
            "dialogue_type": "test",
        },
    }
    results.append(_check_keys(test_doc, REQUIRED_DOC_KEYS, "Document"))
    results.append(
        _check_keys(test_doc["metadata"], REQUIRED_METADATA_KEYS, "Metadata")
    )

    print("\n🔍 Testing _chunk_text Helper:")
    results.append(_check_chunk_text(ingestor))

    all_good = all(results)

    print("\n" + "=" * 60)
    if all_good:
        print("✅ ALL VALIDATIONS PASSED")
    else:
        print("❌ SOME VALIDATIONS FAILED")
    print("=" * 60)
    return all_good


if __name__ == "__main__":
    success = main()
    sys.exit(0 if success else 1)