import os
import sys

# Make the current working directory importable so that `modules.shared`
# resolves when the script is launched from the project root.
current_dir = os.getcwd()
sys.path.insert(0, current_dir)

# NOTE: `json`, `requests`, `ET`, `List`, `Dict`, `Optional` and
# `ISTAT_DATA_PATH` are used below but not imported anywhere else in this
# file, so they have to be exported by `modules.shared`.
from modules.shared import *

# Sort key given to age labels that `age_str_to_int` cannot interpret; it is
# larger than the keys used for "100+" (101) and "TOTAL" (102).
_UNKNOWN_AGE_SORT_KEY = 103


def assemble_locations() -> List[Dict]:
    """
    Read the location JSONL files and combine their records into one list.

    The files `_geographic_areas.jsonl`, `_regions.jsonl` and
    `_provinces.jsonl` are read, in that order, from the `ISTAT_DATA_PATH`
    directory. Every non-blank line is parsed as one JSON document.

    Returns:
        List[Dict]: The parsed records, in file order then line order.

    Raises:
        FileNotFoundError: If one of the expected files is missing.
        OSError: If a file cannot be read.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    file_names = ['_geographic_areas', '_regions', '_provinces']
    locations = []
    for name in file_names:
        file_path = f"{ISTAT_DATA_PATH}/{name}.jsonl"
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                # Blank lines (e.g. a trailing newline at end of file) are
                # not JSON documents; skip them instead of failing.
                if not line.strip():
                    continue
                locations.append(json.loads(line))
    return locations


def query_api(url: str, timeout: float = 120.0) -> Optional[str]:
    """
    Send a GET request to `url` and return the response body as text.

    The response is decoded as UTF-8 regardless of the charset announced by
    the server. Any `requests.RequestException` (network failure, invalid
    URL, timeout, non-2xx status) is caught: an error message is printed and
    `None` is returned instead of propagating the exception.

    Args:
        url (str): The HTTP or HTTPS URL to fetch.
        timeout (float): Seconds to wait for the server before giving up.
            Without a timeout `requests` may block indefinitely.

    Returns:
        Optional[str]: The response text, or `None` if the request failed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn non-2xx statuses into HTTPError
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None


def _expand_age_code(age_code: str) -> str:
    """Expand one age code (no '+' inside) into a '+'-joined list of codes."""
    if age_code.startswith("Y_GE"):
        # Y_GEX -> YX + Y(X+1) + ... + Y99 + Y_GE100
        start_age = int(age_code[4:])
        age_list = [f"Y{age}" for age in range(start_age, 100)]
        age_list.append("Y_GE100")
        return "+".join(age_list)
    if age_code.startswith("Y_UN"):
        # Y_UNX -> Y0 + Y1 + ... + Y(X-1)
        end_age = int(age_code[4:])
        return "+".join(f"Y{age}" for age in range(end_age))
    if not age_code.startswith("Y_") and "-" in age_code:
        # YX-Z -> YX + Y(X+1) + ... + YZ
        start_age, end_age = map(int, age_code[1:].split('-'))
        return "+".join(f"Y{age}" for age in range(start_age, end_age + 1))
    # Exact ages (YX) and anything unrecognised are passed through untouched.
    return age_code


def combine_ages(age_code: str) -> str:
    """
    Expand an age code into the '+'-joined sequence of single-age codes.

    Args:
        age_code (str): One age code, or several joined by '+'. Each part
            may be:
            - `YX`: an exact age (e.g. "Y65").
            - `Y_GEX`: age X and over (e.g. "Y_GE18").
            - `Y_UNX`: all ages below X (e.g. "Y_UN18").
            - `YX-Z`: ages X to Z inclusive (e.g. "Y23-42").
            Any other part is returned unchanged.

    Returns:
        str: The expanded codes joined by "+" (no surrounding spaces).

    Raises:
        ValueError: If the numeric portion of a `Y_GE`, `Y_UN` or range code
            is not an integer.

    Examples:
        combine_ages("Y65")       -> "Y65"
        combine_ages("Y_GE98")    -> "Y98+Y99+Y_GE100"
        combine_ages("Y_UN3")     -> "Y0+Y1+Y2"
        combine_ages("Y23-25")    -> "Y23+Y24+Y25"
        combine_ages("Y_UN2+Y65") -> "Y0+Y1+Y65"
    """
    # Expand every '+'-separated part on its own so combined requests such
    # as "Y_GE98+Y5" work; empty parts are dropped.
    parts = (part.strip() for part in age_code.split("+"))
    expanded = (_expand_age_code(part) for part in parts if part)
    return "+".join(piece for piece in expanded if piece)


def transform_age_code(age_code: str) -> Optional[str]:
    """
    Convert an age code into a short human-readable label.

    Args:
        age_code (str): The age code to convert.

    Returns:
        Optional[str]: "total" for "TOTAL", "100+" for "Y_GE100", the number
        as a string for "Y<digits>" (e.g. "Y25" -> "25"), otherwise `None`.

    Examples:
        transform_age_code("TOTAL")   -> "total"
        transform_age_code("Y_GE100") -> "100+"
        transform_age_code("Y25")     -> "25"
        transform_age_code("INVALID") -> None
    """
    if age_code == 'TOTAL':
        return 'total'
    if age_code == 'Y_GE100':
        return '100+'
    # Regular single ages: 'Y0', 'Y1', ..., 'Y99'
    if age_code.startswith('Y') and age_code[1:].isdigit():
        return str(int(age_code[1:]))
    return None


def age_str_to_int(age_str: str) -> int:
    """
    Map an age label to an integer usable as a sort key.

    Args:
        age_str (str): A numeric string (e.g. "0", "99"), "100+", or
            "total" in any letter case.

    Returns:
        int: 101 for "100+", 102 for "total"/"TOTAL", otherwise
        `int(age_str)`.

    Raises:
        ValueError: If `age_str` is none of the supported forms.

    Examples:
        age_str_to_int("5")     -> 5
        age_str_to_int("100+")  -> 101
        age_str_to_int("TOTAL") -> 102
    """
    if age_str == '100+':
        return 101  # Sorts after every plain numeric age
    if age_str.upper() == 'TOTAL':
        return 102  # Sorts after "100+"
    return int(age_str)


def _age_sort_key(age_label: str) -> int:
    """Return the sort key for an age label, placing unknown labels last."""
    try:
        return age_str_to_int(age_label)
    except ValueError:
        return _UNKNOWN_AGE_SORT_KEY


def extract_and_format_data_from_xml_for_web_app(xml_content: str) -> List[Dict[str, str]]:
    """
    Extract observations from an SDMX-ML generic data message.

    For every `generic:Series` the REF_AREA, AGE and SEX values are read and
    translated to labels; every `generic:Obs` of the series then yields one
    record. Records are sorted by time period, location and age.

    Args:
        xml_content (str): The XML document as a string.

    Returns:
        List[Dict[str, str]]: One dictionary per observation with the keys:
        - `location`: Location name, or "Unknown Location" if the code is
          not found among the records from `assemble_locations`.
        - `sex`: "Male", "Female", "Total", or "Unknown Sex".
        - `age (years)`: Label from `transform_age_code`; if the code is
          not recognised the raw age code is used instead.
        - `time period`: The TIME_PERIOD value of the observation.
        - `population`: The observation value.

    Raises:
        ET.ParseError: If the XML content cannot be parsed.
        AttributeError: If a series or observation lacks one of the
            expected elements (the lookup returns `None`).
        ValueError: If a time period is not an integer string (it is
            converted with `int` for sorting).
    """
    root = ET.fromstring(xml_content)

    # Namespaces used by SDMX-ML 2.1 generic data messages.
    ns = {
        'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic',
        'message': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message',
        'common': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common'
    }

    # Build a code -> name lookup. The first key of each location record is
    # treated as the name and its value as the area code.
    location_dict = {}
    for item in assemble_locations():
        if not item:
            continue  # An empty record carries no name/code pair
        location_name = next(iter(item))
        location_dict[item[location_name]] = location_name

    # Sex codes to labels; constant, so built once outside the loop.
    sex_map = {'1': 'Male', '2': 'Female', '9': 'Total'}

    extracted_data = []
    for series in root.findall('.//generic:Series', ns):
        ref_area_code = series.find(".//generic:Value[@id='REF_AREA']", ns).get('value')
        ref_area_name = location_dict.get(ref_area_code, "Unknown Location")

        age_code = series.find(".//generic:Value[@id='AGE']", ns).get('value')
        age_description = transform_age_code(age_code)
        if age_description is None:
            # Keep the raw code rather than None so the record stays
            # meaningful and the sort below does not fail.
            age_description = age_code

        sex_code = series.find(".//generic:Value[@id='SEX']", ns).get('value')
        sex_description = sex_map.get(sex_code, "Unknown Sex")

        for obs in series.findall('.//generic:Obs', ns):
            time_period = obs.find(".//generic:ObsDimension[@id='TIME_PERIOD']", ns).get('value')
            obs_value = obs.find('.//generic:ObsValue', ns).get('value')
            extracted_data.append({
                'location': ref_area_name,
                'sex': sex_description,
                'age (years)': age_description,
                'time period': time_period,
                'population': obs_value
            })

    # Sort by 'time period', then 'location', then 'age (years)'.
    return sorted(
        extracted_data,
        key=lambda x: (int(x['time period']), x['location'], _age_sort_key(x['age (years)']))
    )


def fetch_population_for_locations_years_sex_age_via_sdmx(
    location_ids: str = 'IT',
    sex: str = '9',
    age: str = 'TOTAL',
    start_period: str = '2024-01-01',
    end_period: str = '2024-12-31'
) -> Optional[List[Dict[str, str]]]:
    """
    Fetch population data from the Istat SDMX web service.

    Builds the query URL from the arguments, prints it, downloads the XML
    response with `query_api` and converts it with
    `extract_and_format_data_from_xml_for_web_app`.

    Args:
        location_ids (str): Location identifiers, joined by '+' if several
            (e.g. 'ITC+ITE2+ITF14'). Default is 'IT'.
        sex (str): '1' (male), '2' (female) or '9' (total); several values
            can be joined by '+'. Default is '9'.
        age (str): 'TOTAL' (any letter case) or an age code accepted by
            `combine_ages` (e.g. 'Y65', 'Y_GE18', 'Y_UN18', 'Y23-42').
            Default is 'TOTAL'.
        start_period (str): Start date as 'YYYY-MM-DD'. Default is
            '2024-01-01'.
        end_period (str): End date as 'YYYY-MM-DD'. Default is
            '2024-12-31'.

    Returns:
        Optional[List[Dict[str, str]]]: One dictionary per observation with
        the keys `location`, `sex`, `age (years)`, `time period` and
        `population`, or `None` if the HTTP query failed.

    Example:
        fetch_population_for_locations_years_sex_age_via_sdmx(
            'ITC+ITE2+ITF14', '9', 'TOTAL', '2024-01-01', '2024-12-31'
        ) -> [
            {'location': 'Chieti', 'sex': 'Total', 'age (years)': 'total',
             'time period': '2024', 'population': '372640'},
            ...
        ]
    """
    if age.upper() == "TOTAL":
        combined_age = age.upper()
    else:
        combined_age = combine_ages(age)

    url = f"https://esploradati.istat.it/SDMXWS/rest/data/IT1,22_289_DF_DCIS_POPRES1_1,1.0/A.{location_ids}.JAN.{sex}.{combined_age}.99/ALL/?detail=full&startPeriod={start_period}&endPeriod={end_period}&dimensionAtObservation=TIME_PERIOD"
    print(url)

    res = query_api(url)
    if res is None:
        return None
    return extract_and_format_data_from_xml_for_web_app(res)


# Function-calling schema describing the fetch function to an LLM.
tools = [
    {
        "type": "function",
        "function": {
            "name": "fetch_population_for_locations_years_sex_age_via_sdmx",
            "description": "Fetches population data for specific locations, sex categories, age and time periods using the Istat SDMX web service. Supports multiple locations and sex categories.",
            "parameters": {
                "type": "object",
                "properties": {
                    "location_ids": {
                        "type": "string",
                        "description": "Geographical identifiers for the locations, concatenated by '+' if multiple, e.g., 'ITC+ITE2+ITF14'"
                    },
                    "sex": {
                        "type": "string",
                        "description": "The sex category for which data is requested. '1' for male, '2' for female, '9' for total. Can be combined with '+', e.g., '1+2+9'"
                    },
                    "age": {
                        "type": "string",
                        "description": "The age in years for which data is requested. Follow these rules to interpret the age definition: 1. Exact age (e.g., `X years`): Format: `YX`; Example: `0 year` → `Y0`, `1 year` → `Y1`, `10 years` → `Y10`. 2. Age and over (e.g., `X years and over`): Format: `Y_GEX`; Example: `14 years and over` → `Y_GE14`. 3. Until a certain age (e.g., `until X years` or `under X years`): Format: `Y_UNX`; Example: `until 15 years` → `Y_UN15`. 4. Age ranges (e.g., `X-Y years`): Format: `YX-Y`; Example: `14-15 years` → `Y14-15`. 5. TOTAL (representing all ages): Use the code `TOTAL`."
                    },
                    "start_period": {
                        "type": "string",
                        "description": "The start date of the period for which data is requested, formatted as 'YYYY-MM-DD', e.g., '2024-01-01'. Default is '2024-01-01'."
                    },
                    "end_period": {
                        "type": "string",
                        "description": "The end date of the period for which data is requested, formatted as 'YYYY-MM-DD', e.g., '2024-12-31'. Default is '2024-12-31'."
                    }
                },
                "required": ["location_ids", "sex", "age", "start_period", "end_period"]
            }
        }
    }
]

# Maps the tool name used in `tools` to the Python callable implementing it.
tool_functions_map = {
    'fetch_population_for_locations_years_sex_age_via_sdmx': fetch_population_for_locations_years_sex_age_via_sdmx
}