# CROISSANT VALIDATION REPORT ================================================================================ ## VALIDATION RESULTS -------------------------------------------------------------------------------- Starting validation for file: croissant ### JSON Format Validation ✓ The URL returned valid JSON. ### Croissant Schema Validation ✓ The dataset passes Croissant validation. ### Records Generation Test ✗ Record set 'default' failed: An error occured during the sequential generation of the dataset, more specifically during the operation Read(parquet-files-for-config-default) Traceback (most recent call last): File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 49, in validate_records _ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records))) ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/func_timeout/dafunc.py", line 108, in func_timeout raise_exception(exception) File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/func_timeout/py3_raise.py", line 7, in raise_exception raise exception[0] from None File "/Users/jvanscho/Documents/croissant-checker/validation.py", line 49, in <lambda> _ = func_timeout.func_timeout(WAIT_TIME, lambda: next(iter(records))) ^^^^^^^^^^^^^^^^^^^ File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/datasets.py", line 171, in __iter__ yield from execute_operations_sequentially( File "/Users/jvanscho/miniconda3/lib/python3.12/site-packages/mlcroissant/_src/operation_graph/execute.py", line 72, in execute_operations_sequentially raise GenerationError( mlcroissant._src.core.issues.GenerationError: An error occured during the sequential generation of the dataset, more specifically during the operation Read(parquet-files-for-config-default) ## JSON-LD REFERENCE ================================================================================ ```json { "@context": { "@language": "en", "@vocab": 
"https://schema.org/", "citeAs": "cr:citeAs", "column": "cr:column", "conformsTo": "dct:conformsTo", "cr": "http://mlcommons.org/croissant/", "data": { "@id": "cr:data", "@type": "@json" }, "dataBiases": "cr:dataBiases", "dataCollection": "cr:dataCollection", "dataType": { "@id": "cr:dataType", "@type": "@vocab" }, "dct": "http://purl.org/dc/terms/", "extract": "cr:extract", "field": "cr:field", "fileProperty": "cr:fileProperty", "fileObject": "cr:fileObject", "fileSet": "cr:fileSet", "format": "cr:format", "includes": "cr:includes", "isLiveDataset": "cr:isLiveDataset", "jsonPath": "cr:jsonPath", "key": "cr:key", "md5": "cr:md5", "parentField": "cr:parentField", "path": "cr:path", "personalSensitiveInformation": "cr:personalSensitiveInformation", "recordSet": "cr:recordSet", "references": "cr:references", "regex": "cr:regex", "repeated": "cr:repeated", "replace": "cr:replace", "sc": "https://schema.org/", "separator": "cr:separator", "source": "cr:source", "subField": "cr:subField", "transform": "cr:transform", "@base": "cr_base_iri/" }, "@type": "sc:Dataset", "distribution": [ { "@type": "cr:FileObject", "@id": "repo", "name": "repo", "description": "The Hugging Face git repository.", "contentUrl": "https://huggingface.co/datasets/facebook/natural_reasoning/tree/refs%2Fconvert%2Fparquet", "encodingFormat": "git+https", "sha256": "https://github.com/mlcommons/croissant/issues/80" }, { "@type": "cr:FileSet", "@id": "parquet-files-for-config-default", "containedIn": { "@id": "repo" }, "encodingFormat": "application/x-parquet", "includes": "default/*/*.parquet" } ], "recordSet": [ { "@type": "cr:RecordSet", "dataType": "cr:Split", "key": { "@id": "default_splits/split_name" }, "@id": "default_splits", "name": "default_splits", "description": "Splits for the default config.", "field": [ { "@type": "cr:Field", "@id": "default_splits/split_name", "dataType": "sc:Text" } ], "data": [ { "default_splits/split_name": "train" } ] }, { "@type": "cr:RecordSet", "@id": 
"default", "description": "facebook/natural_reasoning - 'default' subset", "field": [ { "@type": "cr:Field", "@id": "default/split", "dataType": "sc:Text", "source": { "fileSet": { "@id": "parquet-files-for-config-default" }, "extract": { "fileProperty": "fullpath" }, "transform": { "regex": "default/(?:partial-)?(train)/.+parquet$" } }, "references": { "field": { "@id": "default_splits/split_name" } } }, { "@type": "cr:Field", "@id": "default/question", "dataType": "sc:Text", "source": { "fileSet": { "@id": "parquet-files-for-config-default" }, "extract": { "column": "question" } } }, { "@type": "cr:Field", "@id": "default/reference_answer", "dataType": "sc:Text", "source": { "fileSet": { "@id": "parquet-files-for-config-default" }, "extract": { "column": "reference_answer" } } }, { "@type": "cr:Field", "@id": "default/responses", "subField": [ { "@type": "cr:Field", "@id": "default/responses/response_model", "dataType": "sc:Text", "source": { "fileSet": { "@id": "parquet-files-for-config-default" }, "extract": { "column": "responses" }, "transform": { "jsonPath": "response_model" } } }, { "@type": "cr:Field", "@id": "default/responses/response", "dataType": "sc:Text", "source": { "fileSet": { "@id": "parquet-files-for-config-default" }, "extract": { "column": "responses" }, "transform": { "jsonPath": "response" } } } ], "repeated": true } ] } ], "conformsTo": "http://mlcommons.org/croissant/1.0", "name": "natural_reasoning", "description": "NaturalReasoning is a large-scale dataset for general reasoning tasks. It consists of high-quality challenging reasoning questions backtranslated from pretraining corpora DCLM and FineMath. The questions have been deduplicated and decontaminated from popular reasoning benchmarks including MATH, GPQA, MMLU-Pro, MMLU-STEM. For each question, we extract the reference final answer from the original document from the pretraining corpora if possible. 
We also provide a model-generated response from\u2026 See the full description on the dataset page: https://huggingface.co/datasets/facebook/natural_reasoning.", "alternateName": [ "facebook/natural_reasoning", "Natural Reasoning" ], "creator": { "@type": "Organization", "name": "AI at Meta", "url": "https://huggingface.co/facebook" }, "keywords": [ "text-generation", "English", "cc-by-nc-4.0", "1M - 10M", "json", "Text", "Datasets", "pandas", "Croissant", "Polars", "arxiv:2502.13124", "\ud83c\uddfa\ud83c\uddf8 Region: US" ], "license": "https://choosealicense.com/licenses/cc-by-nc-4.0/", "url": "https://huggingface.co/datasets/facebook/natural_reasoning" } ```