import pandas as pd import numpy as np from pymatgen.core import Composition, Element as PymatgenElement import ast import re import logging from .constants import KNOWN_ELEMENT_SYMBOLS, MAGPIE_FEATURIZER, MAGPIE_LABELS, matminer_available # --- Formula Cleaning and Standardization --- def clean_formula_string_advanced(formula_str_original): if not isinstance(formula_str_original, str): return formula_str_original cleaned = formula_str_original.strip() paren_match = re.search(r'\(([^()]+)\)[^()]*$', cleaned) if paren_match: potential_formula_in_parens = paren_match.group(1).strip() part_before_parens = cleaned[:paren_match.start()].strip() if len(potential_formula_in_parens) > 1 and re.search(r"[A-Z]", potential_formula_in_parens) and re.fullmatch(r"[A-Za-z0-9\.\(\)\[\]]+", potential_formula_in_parens): if not part_before_parens or " " in part_before_parens or len(part_before_parens) > len(potential_formula_in_parens) + 5 or (part_before_parens.isalpha() and len(part_before_parens)>4) or re.fullmatch(r"\d+(\.\d+)?", part_before_parens) or re.fullmatch(r"\d*N", part_before_parens, re.IGNORECASE): cleaned = potential_formula_in_parens elif not re.search(r"[A-Za-z]", part_before_parens) and re.search(r"\d", part_before_parens): cleaned = potential_formula_in_parens cleaned = re.sub(r"^[αΑβΒγΓδΔεΕζΖηΗθΘιΙκΚλΛμΜνΝξΞοΟπΠρΡσΣτΤυΥφΦχΧψΨωΩ]-", "", cleaned) cleaned = re.sub(r"^[a-zA-Z]-", "", cleaned) cleaned = re.sub(r"[·*]\s*\d*(\.\d+)?[nNxX]?\s*H2O", "", cleaned) cleaned = re.sub(r"\s*\(\s*H2O\s*\)\s*\d*(\.\d+)?", "", cleaned) cleaned = re.sub(r"·\s*H2O", "", cleaned) cleaned = re.sub(r"\s*\(\s*(?:\d*N|\d+(?:\.\d+)?%?|solution|gas|powder|aq|amorphous|amorph|polytype|phase|\d{1,4})\s*\)\s*$", "", cleaned, flags=re.IGNORECASE) cleaned = re.sub(r"^\s*\(\s*\d+(\.\d+)?\s*\)\s*(?=[A-Z])", "", cleaned) def replace_frac(match): try: num = float(match.group(1)); den = float(match.group(2)); return str(round(num / den, 4)) if den != 0 else match.group(0) except: return match.group(0) cleaned = re.sub(r"(?<=[A-Za-z\d\)])(\d+)\s*/\s*(\d+)", replace_frac, cleaned) cleaned = re.sub(r"^(\d+)\s*/\s*(\d+)", replace_frac, cleaned) cleaned = re.sub(r"^\s*\(?[a-zA-Z\s]+\)?-", "", cleaned); cleaned = re.sub(r"^[a-zA-Z]+-", "", cleaned) cleaned = cleaned.strip(" .,;·*()") return cleaned def is_plausible_formula_for_pymatgen(cleaned_formula_str, entry_identifier): if not isinstance(cleaned_formula_str, str) or not cleaned_formula_str.strip(): return False if '+' in cleaned_formula_str or '==' in cleaned_formula_str or '->' in cleaned_formula_str or ';' in cleaned_formula_str: return False variable_indicators = [r"[A-Za-z]\d*\s*[-+*]\s*[xyzδδn]", r"[xyzδδn]\s*[-+*]", r"[A-Za-z]\d*\(\s*\d*\s*[-+]\s*[xyzδδn]\s*\)?", r"(? 0 else np.nan features['sum_atomic_weight'] = comp.weight for key, val_list in current_element_values_for_stats.items(): clean_val_list = [v for v in val_list if pd.notna(v)] if clean_val_list: features[f'range_{key}'] = np.max(clean_val_list) - np.min(clean_val_list); features[f'min_{key}'] = np.min(clean_val_list); features[f'max_{key}'] = np.max(clean_val_list); features[f'var_{key}'] = np.var(clean_val_list) else: for stat in ['range_', 'min_', 'max_', 'var_']: features[f'{stat}{key}'] = np.nan if matminer_available and MAGPIE_FEATURIZER: try: magpie_vals = MAGPIE_FEATURIZER.featurize(comp) for i, label in enumerate(MAGPIE_LABELS): features[label] = magpie_vals[i] except: pass except: features['is_stoichiometric_formula'] = False elif isinstance(formula_input, dict) and formula_input.get('type') == 'elements_only': features['is_stoichiometric_formula'] = False elements_present = formula_input.get('elements', set()) valid_elements = [el for el in elements_present if el in df_elements_processed.index] features['num_elements_in_formula'] = len(valid_elements) if valid_elements: element_props_subset = df_elements_processed.loc[valid_elements] unweighted_props_to_calc = ['Atomic_Weight', 'Electronegativity', 'Atomic_Radius', 'Melting_Point', 'Density', 'avg_valence', 'valence_electrons_estimated'] for prop_col in unweighted_props_to_calc: if prop_col in element_props_subset.columns: clean_vals = element_props_subset[prop_col].dropna() if not clean_vals.empty: features[f'avg_{prop_col.lower()}_unweighted'] = clean_vals.mean() features[f'min_{prop_col.lower()}_unweighted'] = clean_vals.min() features[f'max_{prop_col.lower()}_unweighted'] = clean_vals.max() features[f'var_{prop_col.lower()}_unweighted'] = clean_vals.var() final_features = default_feature_dict.copy(); final_features.update(features) return final_features