| import pandas as pd | |
| import numpy as np | |
| import re | |
| from .constants import ATMOSPHERE_CONFIG, MIXING_METHOD_CONFIG | |
def _extract_numerical_value_from_op_condition(condition_entry, target_keys=('value', 'max_value', 'values')):
    """Pull a single float out of a loosely structured condition entry.

    Accepted shapes:
      * a number (``int``, ``float`` or ``numpy`` scalar),
      * a dict, searched under each of ``target_keys`` in order,
      * a non-empty list, of which only the first element is inspected and
        must itself be a number or a dict.

    Inside a dict, a value that is a non-empty list contributes its first
    element; any other value is converted directly.  A key whose value is
    ``None`` or cannot be converted is skipped and the next key is tried.

    Args:
        condition_entry: The raw condition value (see shapes above).
        target_keys: Dict keys to try, in priority order.

    Returns:
        The extracted value as ``float``, or ``np.nan`` when nothing usable
        is found.
    """

    def _to_float(raw):
        # Only conversion failures are swallowed; anything else (for example
        # KeyboardInterrupt) must propagate.
        try:
            return float(raw)
        except (TypeError, ValueError, OverflowError):
            return None

    def _from_mapping(mapping):
        for key in target_keys:
            val = mapping.get(key)
            if val is None:
                continue
            # A non-empty list is represented by its first element only.
            candidate = val[0] if isinstance(val, list) and val else val
            number = _to_float(candidate)
            if number is not None:
                return number
        return None

    entry = condition_entry
    if isinstance(entry, list) and entry:
        # Unwrap exactly one level; a nested list is not descended into.
        entry = entry[0]

    number = None
    if isinstance(entry, dict):
        number = _from_mapping(entry)
    elif isinstance(entry, (int, float, np.number)):
        number = _to_float(entry)

    return np.nan if number is None else number
def _extract_atmosphere_from_op(op_conditions_dict, op_string, atm_config_local, entry_identifier):
    """Resolve the atmosphere of one operation to ``(specific, category)``.

    Lookup order:
      1. ``op_conditions_dict['atmosphere']``, or ``['text']`` when the
         former is empty/missing and ``text`` is a string.  The value may be
         a string, a list (first element is used) or a dict (``'gas'``,
         falling back to ``'value'``).
      2. If that text matches no configured pattern but contains ``/``, it
         is returned verbatim with the category ``"Mixed"``.
      3. Otherwise the free-text ``op_string`` is scanned with the same
         patterns.
      4. Otherwise the configured defaults are returned.

    Args:
        op_conditions_dict: Conditions mapping of the operation (ignored
            unless it is a dict).
        op_string: Free-text description of the operation.
        atm_config_local: Mapping with ``"default_specific"``,
            ``"default_category"`` and ``"patterns"`` (an iterable of
            ``(regex, specific, category)`` triples).
        entry_identifier: Not used by this function.

    Returns:
        Tuple ``(atmosphere_specific, atmosphere_category)``.
    """
    fallback_specific = atm_config_local["default_specific"]
    fallback_category = atm_config_local["default_category"]

    if isinstance(op_conditions_dict, dict):
        raw_atmosphere = op_conditions_dict.get('atmosphere')
        if not raw_atmosphere and isinstance(op_conditions_dict.get('text'), str):
            raw_atmosphere = op_conditions_dict['text']

        candidate_text = None
        if raw_atmosphere:
            if isinstance(raw_atmosphere, list):
                candidate_text = str(raw_atmosphere[0])
            elif isinstance(raw_atmosphere, str):
                candidate_text = raw_atmosphere
            elif isinstance(raw_atmosphere, dict):
                candidate_text = str(raw_atmosphere.get('gas', raw_atmosphere.get('value', '')))

        if candidate_text:
            candidate_lower = candidate_text.lower()
            for pattern_regex, specific, category in atm_config_local["patterns"]:
                exact_name = specific.lower() == candidate_lower
                if exact_name or re.search(pattern_regex, candidate_text, re.IGNORECASE):
                    return specific, category
            # No known gas matched: a slash marks a gas mixture, kept as written.
            if '/' in candidate_lower:
                return candidate_text, "Mixed"

    if isinstance(op_string, str) and op_string:
        hit = next(
            (
                (specific, category)
                for pattern_regex, specific, category in atm_config_local["patterns"]
                if re.search(pattern_regex, op_string, re.IGNORECASE)
            ),
            None,
        )
        if hit is not None:
            return hit

    return fallback_specific, fallback_category
def _extract_mixing_method_from_op(op_dict, op_string, mix_config_local, entry_identifier):
    """Determine the mixing method named by one operation.

    The free-text ``op_string`` is matched against the configured patterns
    first.  If nothing matches and the lower-cased operation type contains
    ``mix``, ``grind`` or ``mill``, the type itself is matched against the
    same patterns; failing that, the type is returned as the method unless
    it is one of a few generic type names.

    Args:
        op_dict: Operation mapping; only its ``'type'`` entry is read.
        op_string: Free-text description of the operation.
        mix_config_local: Mapping with ``"default_method"`` and
            ``"patterns"`` (an iterable of ``(regex, method_name)`` pairs).
        entry_identifier: Not used by this function.

    Returns:
        The matched method name, the lower-cased operation type, or the
        configured default method.
    """
    fallback_method = mix_config_local["default_method"]
    type_lower = str(op_dict.get('type', '')).lower()

    no_match = object()  # sentinel, so a falsy configured method name still counts

    def first_pattern_hit(text):
        for pattern_regex, method_name in mix_config_local["patterns"]:
            if re.search(pattern_regex, text, re.IGNORECASE):
                return method_name
        return no_match

    if isinstance(op_string, str) and op_string:
        from_text = first_pattern_hit(op_string)
        if from_text is not no_match:
            return from_text

    if any(token in type_lower for token in ('mix', 'grind', 'mill')):
        from_type = first_pattern_hit(type_lower)
        if from_type is not no_match:
            return from_type
        # Generic type names say nothing about the method; anything more
        # specific is used as the method name itself.
        generic_types = ("mixing", "liquidgrinding", "solutionmixing", "grinding")
        if type_lower.strip() and type_lower not in generic_types:
            return type_lower

    return fallback_method
def _extract_thermal_conditions(conditions_dict, op_string, entry_identifier):
    """Collect the heating temperature and heating time of one operation.

    Reads ``'heating_temperature'`` and ``'heating_time'`` from
    ``conditions_dict`` and converts each through
    ``_extract_numerical_value_from_op_condition``.  Falsy entries and
    entries that convert to NaN are dropped.

    Args:
        conditions_dict: Conditions mapping of the operation (ignored unless
            it is a dict).
        op_string: Not used by this function.
        entry_identifier: Not used by this function.

    Returns:
        Tuple ``(temps, durs)`` of lists, each holding at most one float.
    """
    temps, durs = [], []
    if not isinstance(conditions_dict, dict):
        return temps, durs

    # Temperature first, then duration; both follow the same extraction rule.
    for source_key, collected in (('heating_temperature', temps), ('heating_time', durs)):
        raw_entry = conditions_dict.get(source_key)
        if not raw_entry:
            continue
        number = _extract_numerical_value_from_op_condition(raw_entry)
        if pd.notna(number):
            collected.append(number)

    return temps, durs
def parse_single_operation_detailed_for_input(op_dict_raw, entry_identifier="predict_op"):
    """Turn one raw operation mapping into a flat dict of per-step features.

    Reads ``'type'``, ``'string'`` and ``'conditions'`` from ``op_dict_raw``.
    Type and string are lower-cased before keyword tests.  Temperatures,
    durations, atmosphere and mixing method come from the sibling extraction
    helpers, using the module-level ``ATMOSPHERE_CONFIG`` and
    ``MIXING_METHOD_CONFIG``.

    Args:
        op_dict_raw: Raw operation mapping.
        entry_identifier: Label forwarded to the extraction helpers.

    Returns:
        A dict with the ``op_*`` feature keys, or an empty dict when
        ``op_dict_raw`` is not a dict.
    """
    if not isinstance(op_dict_raw, dict):
        return {}

    type_lc = str(op_dict_raw.get('type', 'UnknownType')).lower()
    text_lc = str(op_dict_raw.get('string', '')).lower()
    conditions = op_dict_raw.get('conditions', {})

    temps, durs = _extract_thermal_conditions(conditions, text_lc, entry_identifier)
    atm_specific, atm_category = _extract_atmosphere_from_op(
        conditions, text_lc, ATMOSPHERE_CONFIG, entry_identifier
    )
    mixing_method = _extract_mixing_method_from_op(
        op_dict_raw, text_lc, MIXING_METHOD_CONFIG, entry_identifier
    )

    heating_keywords = ('heat', 'anneal', 'sinter', 'calcination')
    grinding_methods = (
        'grinding', 'ball_milling', 'planetary_milling',
        'attritor_milling', 'shaker_milling', 'mortar_pestle',
    )

    # A step counts as mixing when its type says so, or when any
    # non-default mixing method was recognised for it.
    is_mixing = 'mix' in type_lc or mixing_method != MIXING_METHOD_CONFIG["default_method"]
    is_grinding = (
        any(word in type_lc for word in ('grind', 'mill'))
        or 'pulverize' in text_lc
        or mixing_method in grinding_methods
    )
    is_calcination = (
        any(word in type_lc for word in ('calcine', 'calcination'))
        or 'calcination' in text_lc
    )

    return {
        'op_temp_C_list': temps,
        'op_duration_h_list': durs,
        'op_atmosphere_specific': atm_specific,
        'op_atmosphere_category': atm_category,
        'op_mixing_method': mixing_method,
        'op_is_heating': any(word in type_lc for word in heating_keywords),
        'op_is_mixing': is_mixing,
        'op_is_grinding': is_grinding,
        'op_is_shaping': 'shap' in type_lc,
        'op_is_drying': 'dry' in type_lc or 'drying' in type_lc,
        'op_is_quenching': 'quench' in type_lc,
        'op_is_annealing': 'anneal' in type_lc or 'anneal' in text_lc,
        'op_is_sintering': 'sinter' in type_lc or 'sinter' in text_lc,
        'op_is_calcination': is_calcination,
    }
def generate_process_features_for_input(operations_simplified_list, all_possible_atm_categories, all_possible_mix_methods):
    """Aggregate per-operation features into one process-level feature dict.

    Each entry of ``operations_simplified_list`` is parsed with
    ``parse_single_operation_detailed_for_input`` and folded into counts,
    boolean flags, temperature statistics and two one-hot groups
    (``ops_atm_cat_<category>`` and ``ops_mix_meth_<method>``).

    Selection rules:
      * Atmosphere: the first operation whose category differs from
        ``ATMOSPHERE_CONFIG["default_category"]``; otherwise the default.
      * Mixing method: the first non-default method found on a mixing step;
        otherwise ``MIXING_METHOD_CONFIG["default_method"]``.
      * Primary heating temperature: taken from the heating step with the
        highest temperature, ties broken by longer duration.
      * A one-hot column is only set when it was pre-created from
        ``all_possible_atm_categories`` / ``all_possible_mix_methods``.

    Entries that are not dicts yield no features.  They still count towards
    ``proc_num_total_steps`` but are otherwise skipped.

    Args:
        operations_simplified_list: List of raw operation mappings; any
            non-list value is treated as an empty list.
        all_possible_atm_categories: Iterable of category names for which
            one-hot columns are created.
        all_possible_mix_methods: Iterable of method names for which one-hot
            columns are created.

    Returns:
        Dict of process-level features.  Temperature statistics are NaN when
        no temperature was found; the total heating duration is NaN when
        there is no heating step or the summed duration is zero.
    """
    default_atm_category = ATMOSPHERE_CONFIG["default_category"]
    default_mix_method = MIXING_METHOD_CONFIG["default_method"]

    aggregated_ops_features = {
        'proc_total_heating_duration_h': 0.0, 'proc_max_temperature_C': np.nan,
        'proc_min_temperature_C': np.nan, 'proc_avg_temperature_C': np.nan,
        'proc_primary_heating_temp_C': np.nan,
        'proc_num_total_steps': 0, 'proc_num_heating_steps': 0,
        'proc_num_mixing_steps': 0, 'proc_num_grinding_steps': 0,
        'proc_has_annealing': False, 'proc_has_sintering': False,
        'proc_has_calcination': False, 'proc_has_quenching': False,
        'proc_has_shaping': False, 'proc_has_drying': False,
    }
    for cat in all_possible_atm_categories:
        aggregated_ops_features[f"ops_atm_cat_{cat}"] = 0
    for meth in all_possible_mix_methods:
        aggregated_ops_features[f"ops_mix_meth_{meth}"] = 0

    if not isinstance(operations_simplified_list, list):
        operations_simplified_list = []
    aggregated_ops_features['proc_num_total_steps'] = len(operations_simplified_list)

    all_temps_in_reaction = []
    heating_steps_details_for_reaction = []
    mixing_methods_found_in_reaction = []
    atm_set_for_reaction_flag = False
    parsed_atm_category_for_input = default_atm_category
    parsed_mix_method_for_input = default_mix_method

    # (per-operation flag, process-level flag) pairs that are simply OR-ed.
    presence_flags = (
        ('op_is_shaping', 'proc_has_shaping'),
        ('op_is_sintering', 'proc_has_sintering'),
        ('op_is_drying', 'proc_has_drying'),
        ('op_is_quenching', 'proc_has_quenching'),
        ('op_is_annealing', 'proc_has_annealing'),
        ('op_is_calcination', 'proc_has_calcination'),
    )

    for op_idx, op_dict_raw in enumerate(operations_simplified_list):
        op_features = parse_single_operation_detailed_for_input(op_dict_raw, f"predict_op_{op_idx}")
        if not op_features:
            # Non-dict entries parse to {}.  Without this guard the
            # atmosphere lookup below would see ``None != default`` and then
            # raise KeyError on the missing key.
            continue

        op_temps = op_features.get('op_temp_C_list')
        op_durations = op_features.get('op_duration_h_list')

        if op_temps:
            all_temps_in_reaction.extend(op_temps)

        if op_features.get('op_is_heating'):
            aggregated_ops_features['proc_num_heating_steps'] += 1
            if op_durations:
                aggregated_ops_features['proc_total_heating_duration_h'] += np.nansum(op_durations)
            heating_steps_details_for_reaction.append({
                'temp': np.nanmax(op_temps) if op_temps else np.nan,
                'duration': np.nansum(op_features.get('op_duration_h_list', [0.0])),
                'atm_category': op_features.get('op_atmosphere_category'),
                'is_anneal': op_features.get('op_is_annealing'),
                'is_sinter': op_features.get('op_is_sintering'),
                'is_calcine': op_features.get('op_is_calcination'),
            })

        if op_features.get('op_is_mixing'):
            aggregated_ops_features['proc_num_mixing_steps'] += 1
            current_mix_method = op_features.get('op_mixing_method', default_mix_method)
            if current_mix_method != default_mix_method:
                mixing_methods_found_in_reaction.append(current_mix_method)

        if op_features.get('op_is_grinding'):
            aggregated_ops_features['proc_num_grinding_steps'] += 1

        for op_flag, proc_flag in presence_flags:
            if op_features.get(op_flag):
                aggregated_ops_features[proc_flag] = True

        # The first operation with a non-default atmosphere decides the
        # process atmosphere.
        op_atm_category = op_features.get('op_atmosphere_category')
        if (not atm_set_for_reaction_flag
                and op_atm_category is not None
                and op_atm_category != default_atm_category):
            parsed_atm_category_for_input = op_atm_category
            atm_set_for_reaction_flag = True

    if heating_steps_details_for_reaction:
        primary_heat_step = max(
            heating_steps_details_for_reaction,
            key=lambda step: (step['temp'] if pd.notna(step['temp']) else -float('inf'), step['duration']),
        )
        if pd.notna(primary_heat_step['temp']):
            aggregated_ops_features['proc_primary_heating_temp_C'] = primary_heat_step['temp']
        # Fallback kept from the original logic; normally the per-step scan
        # above has already picked up any non-default atmosphere.
        primary_atm_category = primary_heat_step.get('atm_category')
        if (not atm_set_for_reaction_flag
                and primary_atm_category is not None
                and primary_atm_category != default_atm_category):
            parsed_atm_category_for_input = primary_atm_category

    if mixing_methods_found_in_reaction:
        parsed_mix_method_for_input = mixing_methods_found_in_reaction[0]

    # One-hot columns exist only for the categories/methods passed in, so
    # unknown values leave every column at 0.
    atm_ohe_col = f"ops_atm_cat_{parsed_atm_category_for_input}"
    if atm_ohe_col in aggregated_ops_features:
        aggregated_ops_features[atm_ohe_col] = 1
    mix_ohe_col = f"ops_mix_meth_{parsed_mix_method_for_input}"
    if mix_ohe_col in aggregated_ops_features:
        aggregated_ops_features[mix_ohe_col] = 1

    if all_temps_in_reaction:
        aggregated_ops_features['proc_max_temperature_C'] = np.nanmax(all_temps_in_reaction)
        aggregated_ops_features['proc_min_temperature_C'] = np.nanmin(all_temps_in_reaction)
        aggregated_ops_features['proc_avg_temperature_C'] = np.nanmean(all_temps_in_reaction)

    # A zero or missing total is reported as NaN rather than as 0 hours.
    total_duration = aggregated_ops_features['proc_total_heating_duration_h']
    if (aggregated_ops_features['proc_num_heating_steps'] == 0
            or pd.isna(total_duration)
            or total_duration == 0):
        aggregated_ops_features['proc_total_heating_duration_h'] = np.nan

    return aggregated_ops_features
def _stoich_amount_to_float(item):
    """Return ``item['amount']`` as a float, or NaN when it is missing or unusable."""
    if not isinstance(item, dict):
        return np.nan
    amount = item.get('amount')
    try:
        # pd.notna returns an array for list-like amounts, whose truth test
        # raises ValueError; symbolic amounts such as "x" fail in float().
        # Both cases are reported as "no coefficient" instead of crashing.
        return float(amount) if pd.notna(amount) else np.nan
    except (TypeError, ValueError, OverflowError):
        return np.nan


def generate_stoichiometry_features_for_input(reactants_simplified, products_simplified, standardize_fn_local):
    """Build fixed-width stoichiometric-coefficient features for one reaction.

    Emits ``reactant1_coeff`` .. ``reactant3_coeff`` and ``product1_coeff`` ..
    ``product2_coeff`` from the ``'amount'`` entry of the leading items of
    each list, plus the full length of each list.  Slots without a usable
    numeric amount (missing item, non-dict item, missing or non-numeric
    amount) are NaN.

    Args:
        reactants_simplified: Sequence of reactant mappings; may be empty or
            ``None``.
        products_simplified: Sequence of product mappings; may be empty or
            ``None``.
        standardize_fn_local: Not used by this function.

    Returns:
        Dict with the coefficient slots, ``num_reactants_in_reaction`` and
        ``num_products_in_reaction``.
    """
    max_r, max_p = 3, 2
    stoich_features = {}
    for i in range(max_r):
        stoich_features[f'reactant{i+1}_coeff'] = np.nan
    for i in range(max_p):
        stoich_features[f'product{i+1}_coeff'] = np.nan

    stoich_features['num_reactants_in_reaction'] = len(reactants_simplified) if reactants_simplified else 0
    if reactants_simplified:
        for i, r_item in enumerate(reactants_simplified[:max_r]):
            stoich_features[f'reactant{i+1}_coeff'] = _stoich_amount_to_float(r_item)

    stoich_features['num_products_in_reaction'] = len(products_simplified) if products_simplified else 0
    if products_simplified:
        for i, p_item in enumerate(products_simplified[:max_p]):
            stoich_features[f'product{i+1}_coeff'] = _stoich_amount_to_float(p_item)

    return stoich_features