# Caporlingua Marina
# go back to Meta-Llama-3-8B-Instruct
# cfde4b0
import os
import sys
current_dir = os.getcwd()
sys.path.insert(0, current_dir)
from modules.shared import *
def assemble_locations() -> List[Dict]:
    """
    Load every location record from the ISTAT JSON Lines files into one list.

    Three files are read from the `ISTAT_DATA_PATH` directory, always in this
    order: `_geographic_areas.jsonl`, `_regions.jsonl`, `_provinces.jsonl`.
    Every line of every file is parsed as a standalone JSON document and
    appended to the result, so the output preserves file order first and
    line order second.

    Args:
        None. The directory is taken from the `ISTAT_DATA_PATH` constant,
        which is not defined in this function and must exist at module level.

    Returns:
        List[Dict]: One entry per line read. Each entry is whatever the line
        deserializes to (presumably a JSON object describing one location).

    Raises:
        FileNotFoundError: If one of the three expected files is missing.
        OSError: If a file cannot be opened or read.
        json.JSONDecodeError: If any line is not valid JSON (a blank line
            counts as invalid).
    """
    dataset_stems = ('_geographic_areas', '_regions', '_provinces')
    records = []
    for stem in dataset_stems:
        with open(f"{ISTAT_DATA_PATH}/{stem}.jsonl", 'r', encoding='utf-8') as handle:
            # One JSON document per line; parse lazily while the file is open.
            records.extend(json.loads(raw_line) for raw_line in handle)
    return records
def query_api(url: str, timeout: float = 60.0) -> Optional[str]:
    """
    Send a GET request to `url` and return the response body as text.

    The response body is always decoded as UTF-8, regardless of the charset
    the server advertises. Any request failure (connection problem, invalid
    URL, timeout, or a non-2xx status code) is reported on stdout and turned
    into a `None` return value instead of an exception.

    Args:
        url (str): The HTTP or HTTPS URL to fetch.
        timeout (float): Seconds to wait for the server to connect / send
            data before giving up. Without a timeout `requests` waits
            indefinitely, so a stalled server would block the caller forever.
            Defaults to 60 seconds.

    Returns:
        Optional[str]: The response body decoded as UTF-8, or `None` if the
        request failed for any reason covered by `requests.RequestException`.

    Notes:
        - No exception derived from `requests.RequestException` propagates to
          the caller; it is caught here and printed.
        - Requires the `requests` library to be available at module level.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turns 4xx/5xx responses into HTTPError
    except requests.RequestException as e:
        print(f"An error occurred: {e}")
        return None

    # Force UTF-8 so `.text` does not depend on the server-declared charset.
    response.encoding = 'utf-8'
    return response.text
def combine_ages(age_code: str) -> str:
    """
    Expand an age code into the "+"-joined list of single-year age codes.

    The result is joined with a bare "+" (no surrounding spaces).

    Args:
        age_code (str): The input code representing an age or age group.
            Supported formats:
            - `YX`: an exact age (e.g., "Y65").
            - `Y_GEX`: ages X and over, up to the open-ended "Y_GE100"
              bucket (e.g., "Y_GE18").
            - `Y_UNX`: all ages below X (e.g., "Y_UN18").
            - `YX-Z`: an inclusive range of ages from X to Z (e.g., "Y23-42").

    Returns:
        str: The expanded sequence of age codes joined by "+". Any code that
        matches none of the expandable formats is returned unchanged; this
        includes exact ages such as "Y65" and unrecognized "Y_..." codes.

    Raises:
        ValueError: If the numeric part of a `Y_GEX`, `Y_UNX` or `YX-Z` code
            is not a valid integer.

    Rules:
        1. `YX`    -> returned as is ("Y65" -> "Y65").
        2. `Y_GEX` -> "YX+Y(X+1)+...+Y99+Y_GE100". For X >= 100 the result is
                      just "Y_GE100".
        3. `Y_UNX` -> "Y0+Y1+...+Y(X-1)" (an empty string for X == 0).
        4. `YX-Z`  -> "YX+Y(X+1)+...+YZ".

    Example:
        combine_ages("Y_GE98") -> "Y98+Y99+Y_GE100"
        combine_ages("Y_UN3")  -> "Y0+Y1+Y2"
        combine_ages("Y23-25") -> "Y23+Y24+Y25"
        combine_ages("Y65")    -> "Y65"
    """
    if age_code.startswith("Y_GE"):
        # Rule 2: single years from X to 99, then the open-ended 100+ bucket.
        start_age = int(age_code[4:])
        age_list = [f"Y{age}" for age in range(start_age, 100)]
        age_list.append("Y_GE100")
        return "+".join(age_list)

    if age_code.startswith("Y_UN"):
        # Rule 3: every single year strictly below X.
        end_age = int(age_code[4:])
        return "+".join(f"Y{age}" for age in range(end_age))

    # Other "Y_..." codes are not ranges even if they contain a dash, so they
    # must not reach the range parser below.
    if "-" in age_code and not age_code.startswith("Y_"):
        # Rule 4: inclusive range X..Z.
        start_age, end_age = map(int, age_code[1:].split('-'))
        return "+".join(f"Y{age}" for age in range(start_age, end_age + 1))

    # Rule 1 and fallback: nothing to expand. Previously an unrecognized
    # "Y_..." code fell through without a return and produced None.
    return age_code
def transform_age_code(age_code: str) -> Optional[str]:
    """
    Turn a raw age code into the short label shown to users.

    Mapping:
        - "TOTAL"    -> "total"
        - "Y_GE100"  -> "100+"
        - "Y<digits>" (e.g., "Y0", "Y99") -> the number as a string, without
          leading zeros (e.g., "0", "99")
        - anything else -> `None`

    Args:
        age_code (str): The age code to translate. Matching is case-sensitive.

    Returns:
        Optional[str]: The display label, or `None` when the code is not one
        of the recognized formats.

    Examples:
        transform_age_code("TOTAL")   -> "total"
        transform_age_code("Y_GE100") -> "100+"
        transform_age_code("Y25")     -> "25"
        transform_age_code("INVALID") -> None
    """
    # Codes with a fixed label are checked first.
    for special_code, label in (('TOTAL', 'total'), ('Y_GE100', '100+')):
        if age_code == special_code:
            return label

    if not age_code.startswith('Y'):
        return None

    # Regular single-year codes: 'Y' followed only by digits.
    numeric_part = age_code[1:]
    return str(int(numeric_part)) if numeric_part.isdigit() else None
def age_str_to_int(age_str: str) -> int:
    """
    Map an age label to an integer rank usable as a sort key.

    Ranking:
        - A numeric label ("0", "1", "99", ...) ranks as its integer value.
        - "100+" ranks as 101, so it sorts after every numeric age.
        - "total" in any letter case ("TOTAL", "total", ...) ranks as 102, so
          it sorts after everything else.

    Args:
        age_str (str): The age label to rank.

    Returns:
        int: The sort rank described above.

    Raises:
        ValueError: If the label is neither "100+", a spelling of "total",
            nor something `int()` can parse.

    Examples:
        age_str_to_int("5")     -> 5
        age_str_to_int("100+")  -> 101
        age_str_to_int("TOTAL") -> 102
    """
    if age_str == '100+':
        return 101  # Open-ended bucket goes after all single years
    return 102 if age_str.upper() == 'TOTAL' else int(age_str)
def extract_and_format_data_from_xml_for_web_app(xml_content: str) -> List[Dict[str, str]]:
    """
    Extract observations from an SDMX-ML generic-data document as flat rows.

    Every `generic:Series` element contributes one row per `generic:Obs`
    child. The series-level REF_AREA, AGE and SEX values are translated into
    display labels and repeated on each row. Rows are returned sorted by
    time period (as an integer), then location name, then age.

    Args:
        xml_content (str): The XML document as a string.

    Returns:
        List[Dict[str, str]]: The sorted rows. Each row has these keys:
            - `location`: Location name looked up from the REF_AREA code, or
              "Unknown Location" if the code is not in the location files.
            - `sex`: "Male", "Female" or "Total" for codes '1', '2', '9';
              "Unknown Sex" for any other code.
            - `age (years)`: Label produced by `transform_age_code`; this is
              `None` when the AGE code is not recognized.
            - `time period`: The TIME_PERIOD value of the observation.
            - `population`: The observation value, as found in the XML.

    Raises:
        ET.ParseError: If the XML content cannot be parsed.
        AttributeError: If a series lacks a REF_AREA, AGE or SEX value, or an
            observation lacks its TIME_PERIOD dimension or its value element.
        ValueError: If a TIME_PERIOD value is not an integer (sorting
            converts it with `int`).
        Any exception raised by `assemble_locations` while loading locations.

    Notes:
        - Rows whose age label is `None` are sorted after all other ages of
          the same time period and location instead of aborting the sort.
        - Sex is not part of the sort key; rows that tie keep document order.

    Example:
        extract_and_format_data_from_xml_for_web_app(xml_data)
        -> [{'location': 'Italy', 'sex': 'Male', 'age (years)': '0', 'time period': '2020', 'population': '10000'}, ...]
    """
    # Parse the XML content
    root = ET.fromstring(xml_content)

    # Namespace prefixes used in the XPath expressions below
    ns = {
        'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic',
        'message': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message',
        'common': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common'
    }

    # Invert each location record: its first key is used as the display name
    # and that key's value as the lookup code (presumably the ISTAT area code).
    location_dict = {}
    for item in assemble_locations():
        name = next(iter(item))
        location_dict[item[name]] = name

    # Sex codes to descriptive strings (constant, so built once)
    sex_map = {'1': 'Male', '2': 'Female', '9': 'Total'}

    extracted_data = []
    for series in root.findall('.//generic:Series', ns):
        # Series-level dimensions shared by every observation of the series
        ref_area_code = series.find(".//generic:Value[@id='REF_AREA']", ns).get('value')
        ref_area_name = location_dict.get(ref_area_code, "Unknown Location")

        age_code = series.find(".//generic:Value[@id='AGE']", ns).get('value')
        age_description = transform_age_code(age_code)

        sex_code = series.find(".//generic:Value[@id='SEX']", ns).get('value')
        sex_description = sex_map.get(sex_code, "Unknown Sex")

        for obs in series.findall('.//generic:Obs', ns):
            time_period = obs.find(".//generic:ObsDimension[@id='TIME_PERIOD']", ns).get('value')
            obs_value = obs.find('.//generic:ObsValue', ns).get('value')
            extracted_data.append({
                'location': ref_area_name,
                'sex': sex_description,
                'age (years)': age_description,
                'time period': time_period,
                'population': obs_value
            })

    def sort_key(row):
        age_label = row['age (years)']
        # transform_age_code yields None for unrecognized codes; ranking None
        # explicitly avoids calling age_str_to_int on it (which would raise).
        age_rank = (1, 0) if age_label is None else (0, age_str_to_int(age_label))
        return (int(row['time period']), row['location'], age_rank)

    return sorted(extracted_data, key=sort_key)
def fetch_population_for_locations_years_sex_age_via_sdmx(
        location_ids: str = 'IT',
        sex: str = '9',
        age: str = 'TOTAL',
        start_period: str = '2024-01-01',
        end_period: str = '2024-12-31'
) -> Optional[List[Dict[str, str]]]:
    """
    Query the Istat SDMX web service for resident-population figures.

    The arguments are interpolated into a data-query URL, the URL is printed
    to stdout, and the XML response is converted into a list of row
    dictionaries by `extract_and_format_data_from_xml_for_web_app`.

    Args:
        location_ids (str): Geographical identifiers for the locations,
            concatenated by '+' if multiple. Default is 'IT'.
        sex (str): Sex code(s): '1' male, '2' female, '9' total. Multiple
            values can be combined with '+' (e.g., '1+2'). Default is '9'.
        age (str): Age selector. 'TOTAL' (any letter case) is sent as
            "TOTAL"; any other value is passed through `combine_ages` before
            being placed in the URL. Default is 'TOTAL'.
        start_period (str): Start of the requested period, formatted as
            'YYYY-MM-DD'. Default is '2024-01-01'.
        end_period (str): End of the requested period, formatted as
            'YYYY-MM-DD'. Default is '2024-12-31'.

    Returns:
        Optional[List[Dict[str, str]]]: The formatted rows, each with the keys
        `location`, `sex`, `age (years)`, `time period` and `population`.
        Returns `None` if the HTTP query fails (i.e. `query_api` returns
        `None`).

    Example:
        fetch_population_for_locations_years_sex_age_via_sdmx(
            'ITC+ITE2+ITF14', '9', 'TOTAL', '2024-01-01', '2024-12-31'
        )
    """
    normalized_age = age.upper()
    combined_age = normalized_age if normalized_age == "TOTAL" else combine_ages(age)

    url = f"https://esploradati.istat.it/SDMXWS/rest/data/IT1,22_289_DF_DCIS_POPRES1_1,1.0/A.{location_ids}.JAN.{sex}.{combined_age}.99/ALL/?detail=full&startPeriod={start_period}&endPeriod={end_period}&dimensionAtObservation=TIME_PERIOD"
    print(url)

    xml_payload = query_api(url)
    if xml_payload is None:
        # The request failed; query_api has already reported the error.
        return None
    return extract_and_format_data_from_xml_for_web_app(xml_payload)
# Tool schema describing the population-fetch function to a language model.
# It follows the OpenAI-style function-calling layout: a list of entries with
# "type": "function" and a JSON-Schema "parameters" object.
# NOTE(review): every parameter is listed as required even though the Python
# function has defaults for all of them -- confirm this is intentional.
tools = [
    {
        "type": "function",
        "function": {
            "name": "fetch_population_for_locations_years_sex_age_via_sdmx",
            "description": "Fetches population data for specific locations, sex categories, age and time periods using the Istat SDMX web service. Supports multiple locations and sex categories.",
            "parameters": {
                "type": "object",
                "properties": {
                    "location_ids": {
                        "type": "string",
                        "description": "Geographical identifiers for the locations, concatenated by '+' if multiple, e.g., 'ITC+ITE2+ITF14'"
                    },
                    "sex": {
                        "type": "string",
                        "description": "The sex category for which data is requested. '1' for male, '2' for female, '9' for total. Can be combined with '+', e.g., '1+2+9'"
                    },
                    "age": {
                        "type": "string",
                        # The arrows below were mis-encoded in the source text
                        # and have been restored to the intended character.
                        "description": "The age in years for which data is requested. Follow these rules to interpret the age definition: 1. Exact age (e.g., `X years`): Format: `YX`; Example: `0 year` → `Y0`, `1 year` → `Y1`, `10 years` → `Y10`. 2. Age and over (e.g., `X years and over`): Format: `Y_GEX`; Example: `14 years and over` → `Y_GE14`. 3. Until a certain age (e.g., `until X years` or `under X years`): Format: `Y_UNX`; Example: `until 15 years` → `Y_UN15`. 4. Age ranges (e.g., `X-Y years`): Format: `YX-Y`; Example: `14-15 years` → `Y14-15`. 5. TOTAL (representing all ages): Use the code `TOTAL`."
                    },
                    "start_period": {
                        "type": "string",
                        "description": "The start date of the period for which data is requested, formatted as 'YYYY-MM-DD', e.g., '2024-01-01'. Default is '2024-01-01'."
                    },
                    "end_period": {
                        "type": "string",
                        "description": "The end date of the period for which data is requested, formatted as 'YYYY-MM-DD', e.g., '2024-12-31'. Default is '2024-12-31'."
                    }
                },
                "required": ["location_ids", "sex", "age", "start_period", "end_period"]
            }
        }
    }
]
# Dispatch table: tool name (as it appears in a tool call) -> the Python
# callable that implements it.
tool_functions_map = {
    "fetch_population_for_locations_years_sex_age_via_sdmx":
        fetch_population_for_locations_years_sex_age_via_sdmx,
}