| """Data loading, cleaning and preprocessing for ArXiv dataset.""" | |
| import os | |
| import json | |
| import gzip | |
| import pandas as pd | |
| from langchain_core.documents import Document | |
| from .config import DATA_PATH | |
| from .text_processing import clean_text | |


def load_hf_dataset(num_records=50000, dataset_name="CShorten/ML-ArXiv-Papers"):
    """Load ArXiv papers from a Hugging Face dataset.

    Args:
        num_records: Number of records to load.
        dataset_name: Hugging Face dataset identifier.

    Returns:
        pandas DataFrame with the papers.
    """
    try:
        from datasets import load_dataset

        print(f"Loading {num_records} records from {dataset_name}...")
        # Load dataset from Hugging Face
        dataset = load_dataset(dataset_name, split="train", streaming=False)
        # Convert to pandas DataFrame, truncating to num_records if needed
        if num_records and num_records < len(dataset):
            df = dataset.select(range(num_records)).to_pandas()
        else:
            df = dataset.to_pandas()
        print(f"Loaded {len(df)} records from Hugging Face dataset")
        return df
    except ImportError:
        raise ImportError("Please install the datasets library: pip install datasets")
    except Exception as e:
        raise ValueError(f"Failed to load Hugging Face dataset: {e}") from e


def _open_file(file_path):
    """Open file with appropriate mode and encoding."""
    if file_path.endswith('.gz'):
        return gzip.open(file_path, 'rt', encoding='utf-8-sig')
    return open(file_path, 'r', encoding='utf-8-sig')


def _parse_json_line(line):
    """Parse a single JSON line, return None if empty or invalid."""
    s = line.strip()
    if not s:
        return None
    try:
        return json.loads(s)
    except json.JSONDecodeError:
        return None


def _try_full_json_array(file_path, num_records):
    """Try to load the file as a full JSON array."""
    try:
        with _open_file(file_path) as f:
            data = json.load(f)
        if not isinstance(data, list):
            raise ValueError("Top-level JSON is not a list.")
        return pd.DataFrame(data[:num_records])
    except Exception as e:
        raise ValueError(
            "Failed to parse dataset. Expected JSON Lines or a JSON array."
        ) from e
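
# The two on-disk formats this module accepts, for reference (illustrative
# records, not real dataset contents):
#
#     # JSON Lines: one object per line
#     {"id": "1234.5678", "title": "...", "abstract": "..."}
#     {"id": "1234.5679", "title": "...", "abstract": "..."}
#
#     # Full JSON array
#     [{"id": "1234.5678", "title": "...", "abstract": "..."}, ...]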


def _parse_lines(file_path, num_records):
    """Parse lines from file as JSONL, falling back to a JSON array if needed."""
    records = []
    with _open_file(file_path) as f:
        for line in f:
            if len(records) >= num_records:
                break
            if not line.strip():
                # Skip blank lines; they must not trigger the array fallback
                continue
            record = _parse_json_line(line)
            if record is not None:
                records.append(record)
            elif not records:
                # First non-empty line failed to parse: try full-file JSON array
                return _try_full_json_array(file_path, num_records)
    return records


def load_data_subset(file_path, num_records=50000):
    """Load up to num_records from a JSON Lines file.

    - Skips empty/BOM-prefixed lines.
    - Uses UTF-8 with BOM tolerance.
    - Raises a clear error if the file is missing, empty, or unreadable.
    """
    if not os.path.exists(file_path) or os.path.getsize(file_path) == 0:
        raise FileNotFoundError(f"Dataset not found or empty: {file_path}")
    try:
        records = _parse_lines(file_path, num_records)
    except UnicodeDecodeError:
        # Retry with the platform default encoding if UTF-8 decoding fails
        records = []
        with open(file_path, 'r') as f:
            for line in f:
                if len(records) >= num_records:
                    break
                record = _parse_json_line(line)
                if record is not None:
                    records.append(record)
    if isinstance(records, pd.DataFrame):
        # _parse_lines returns a DataFrame when it fell back to a JSON array
        return records
    if not records:
        raise ValueError(
            "No valid records were parsed from the dataset. "
            "Ensure the file is JSONL or a JSON array."
        )
    return pd.DataFrame(records)
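
# Example usage (a minimal sketch; "papers.jsonl" is a hypothetical path). Any
# JSON Lines file, gzipped or plain, or a full JSON array should work:
#
#     df = load_data_subset("papers.jsonl", num_records=1000)
#     print(df.head())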


def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the dataframe from Hugging Face or a local file."""
    df = df.copy()  # Avoid mutating the caller's dataframe
    # Handle different date column names
    date_col = None
    if 'update_date' in df.columns:
        date_col = 'update_date'
    elif 'updated' in df.columns:
        date_col = 'updated'
    elif 'published' in df.columns:
        date_col = 'published'
    if date_col:
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        df['year'] = df[date_col].dt.year
    elif 'year' not in df.columns:
        # If no date column exists, set year to None
        df['year'] = None
    # Drop rows with missing or empty abstracts
    if 'abstract' in df.columns:
        df = df.dropna(subset=['abstract'])
        df = df[df['abstract'].str.strip() != '']
    return df
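
# Example (a minimal sketch with made-up records) showing the derived "year"
# column and the removal of rows with empty abstracts:
#
#     raw = pd.DataFrame({
#         "abstract": ["A study of transformers.", ""],
#         "update_date": ["2021-05-01", "2020-01-15"],
#     })
#     clean = preprocess_dataframe(raw)
#     # -> one row left; clean["year"].iloc[0] == 2021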


def df_to_documents(
    df: pd.DataFrame,
    lowercase: bool = False,
    remove_stopwords: bool = False,
):
    """Convert dataframe rows to LangChain documents."""
    documents = []
    for _, row in df.iterrows():
        # Get title and abstract
        title = str(row.get('title', ''))
        abstract = str(row.get('abstract', ''))
        title_clean = clean_text(title, lowercase=lowercase, remove_stopwords=remove_stopwords)
        abstract_clean = clean_text(abstract, lowercase=lowercase, remove_stopwords=remove_stopwords)
        page_content = f"Title: {title_clean}\n\nAbstract: {abstract_clean}"
        # Handle categories - can be a string, a list, or missing/NaN
        categories_raw = row.get('categories')
        if categories_raw is None or (isinstance(categories_raw, float) and pd.isna(categories_raw)):
            categories_raw = 'N/A'
        if isinstance(categories_raw, list):
            categories_str = ' '.join(categories_raw) if categories_raw else 'N/A'
            primary_category = categories_raw[0] if categories_raw else 'N/A'
        else:
            # Normalize to a non-empty string before taking the first token
            categories_str = str(categories_raw).strip() or 'N/A'
            primary_category = categories_str.split()[0] if categories_str != 'N/A' else 'N/A'
        # Build metadata
        metadata = {
            "id": row.get('id', 'N/A'),
            "title": title,  # Keep the original (uncleaned) title in metadata
            "authors": row.get('authors', 'N/A'),
            "year": int(row.get('year')) if not pd.isna(row.get('year')) else None,
            "categories": categories_str,
            "primary_category": primary_category,
        }
        documents.append(Document(page_content=page_content, metadata=metadata))
    return documents
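

if __name__ == "__main__":
    # Minimal end-to-end sketch, assuming network access to the Hugging Face
    # Hub; swap in load_data_subset(...) to use a local JSONL file instead.
    sample_df = load_hf_dataset(num_records=100)
    sample_df = preprocess_dataframe(sample_df)
    docs = df_to_documents(sample_df, lowercase=False, remove_stopwords=False)
    print(f"Built {len(docs)} documents")
    if docs:
        print(docs[0].page_content[:200])
        print(docs[0].metadata)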