Spaces:

capmar00
/

census-newsroom-AIde-api

Sleeping

census-newsroom-AIde-api / modules /tools.py

Caporlingua Marina

go back to Meta-Llama-3-8B-Instruct

cfde4b0 10 months ago

17.3 kB

	import os
	import sys
	current_dir = os.getcwd()
	sys.path.insert(0, current_dir)
	from modules.shared import *


	def assemble_locations() -> List[Dict]:
	    """
	    Load every location record from the ISTAT JSON Lines files into one list.

	    Three files are read from the `ISTAT_DATA_PATH` directory, in this order:
	    `_geographic_areas.jsonl`, `_regions.jsonl` and `_provinces.jsonl`. Each
	    non-blank line is parsed with `json.loads` and appended to the result, so
	    the returned list keeps the file order and, within a file, the line order.

	    Args:
	        None: The function takes no arguments; it relies on the `ISTAT_DATA_PATH`
	            constant and on the fixed file names listed in `file_names`.

	    Returns:
	        List[Dict]: The parsed records of all three files, concatenated.

	    Raises:
	        FileNotFoundError: If any of the expected JSONL files is missing.
	        IOError: If a file cannot be read.
	        json.JSONDecodeError: If a non-blank line is not valid JSON.

	    Notes:
	        - The `ISTAT_DATA_PATH` constant must be defined elsewhere in the code.
	        - Blank or whitespace-only lines (for example a trailing empty line at
	          the end of a file) are skipped instead of being treated as bad JSON.
	    """
	    file_names = ['_geographic_areas', '_regions', '_provinces']
	    locations = []
	    for name in file_names:
	        file_path = f"{ISTAT_DATA_PATH}/{name}.jsonl"
	        with open(file_path, 'r', encoding='utf-8') as file:
	            for line in file:
	                # json.loads rejects empty input, so a stray blank line would
	                # otherwise abort the whole load.
	                if not line.strip():
	                    continue
	                locations.append(json.loads(line))
	    return locations



	def query_api(url: str, timeout: Optional[float] = 120.0) -> Optional[str]:
	    """
	    Send a GET request to `url` and return the response body as text.

	    On success the response is decoded as UTF-8 and returned. Any
	    `requests.RequestException` (connection failure, invalid URL, timeout, or a
	    non-2xx status raised by `raise_for_status`) is caught: an error message is
	    printed and `None` is returned instead of propagating the exception.

	    Args:
	        url (str): The URL to which the GET request is sent.
	        timeout (Optional[float]): Seconds to wait for the server (passed to
	            `requests.get`). Defaults to 120. Pass `None` to wait indefinitely,
	            which was the behaviour before this parameter existed.

	    Returns:
	        Optional[str]: The response text decoded as UTF-8, or `None` if the
	            request failed for any reason listed above.

	    Notes:
	        - The `requests` library must be available in the environment.
	        - Without a timeout a stalled server would block the caller forever;
	          a timeout surfaces as a `requests.RequestException` subclass and is
	          therefore reported through the same print-and-return-`None` path.
	    """
	    try:
	        response = requests.get(url, timeout=timeout)
	        response.raise_for_status()  # Raises an HTTPError for bad responses
	        # Force UTF-8 rather than trusting the encoding guessed from headers
	        response.encoding = 'utf-8'
	        return response.text
	    except requests.RequestException as e:
	        print(f"An error occurred: {e}")
	        return None



	def combine_ages(age_code: str) -> str:
	    """
	    Expand an age expression into a "+"-joined sequence of age codes.

	    The input may be a single age code or several codes joined by "+". Each
	    part is stripped of surrounding whitespace, expanded on its own, and the
	    expansions are joined with "+" (no spaces), which is the separator used
	    when the result is placed in the query URL.

	    Args:
	        age_code (str): One or more age codes separated by "+". Supported
	            formats for each part:
	            - `YX`: an exact age (e.g., "Y65").
	            - `Y_GEX`: ages X and over, up to 100+ (e.g., "Y_GE18").
	            - `Y_UNX`: all ages below X (e.g., "Y_UN18").
	            - `YX-Z`: a range of ages from X to Z inclusive (e.g., "Y23-42").

	    Returns:
	        str: The expanded age codes joined by "+". A part in any other format
	            (including an unrecognised `Y_...` code) is returned unchanged.

	    Raises:
	        ValueError: If the numeric portion of a `Y_GEX`, `Y_UNX` or `YX-Z`
	            part cannot be converted to an integer.

	    Example:
	        combine_ages("Y_GE98") -> "Y98+Y99+Y_GE100"
	        combine_ages("Y_UN3") -> "Y0+Y1+Y2"
	        combine_ages("Y23-25") -> "Y23+Y24+Y25"
	        combine_ages("Y65") -> "Y65"
	        combine_ages("Y1-2+Y_GE99") -> "Y1+Y2+Y99+Y_GE100"
	    """
	    return "+".join(_expand_age_code(part.strip()) for part in age_code.split("+"))


	def _expand_age_code(age_code: str) -> str:
	    """Expand one age code (containing no "+") into "+"-joined single-age codes."""
	    if age_code.startswith("Y_GE"):
	        # Y_GEX -> YX + Y(X+1) + ... + Y99 + Y_GE100
	        start_age = int(age_code[4:])
	        age_list = [f"Y{age}" for age in range(start_age, 100)]
	        age_list.append("Y_GE100")
	        return "+".join(age_list)
	    if age_code.startswith("Y_UN"):
	        # Y_UNX -> Y0 + Y1 + ... + Y(X-1)
	        end_age = int(age_code[4:])
	        return "+".join(f"Y{age}" for age in range(end_age))
	    if not age_code.startswith("Y_") and "-" in age_code:
	        # YX-Z -> YX + Y(X+1) + ... + YZ
	        start_age, end_age = map(int, age_code[1:].split('-'))
	        return "+".join(f"Y{age}" for age in range(start_age, end_age + 1))
	    # YX, or any format not handled above: pass through untouched so the
	    # function always returns a string.
	    return age_code


	def transform_age_code(age_code: str) -> Optional[str]:
	    """
	    Convert an age code into a short human-readable label.

	    Conversions performed:
	    - "TOTAL" becomes "total".
	    - "Y_GE100" becomes "100+".
	    - "Y<number>" (e.g., "Y0", "Y99") becomes the number without leading
	      zeros (e.g., "0", "99").
	    - Anything else yields `None`.

	    Args:
	        age_code (str): The age code to be transformed.

	    Returns:
	        Optional[str]: The transformed label, or `None` if the input is not
	            one of the recognised formats.

	    Examples:
	        transform_age_code("TOTAL") -> "total"
	        transform_age_code("Y_GE100") -> "100+"
	        transform_age_code("Y25") -> "25"
	        transform_age_code("INVALID") -> None
	    """
	    if age_code == 'TOTAL':
	        return 'total'
	    if age_code == 'Y_GE100':
	        return '100+'
	    digits = age_code[1:]
	    # isdecimal() rather than isdigit(): isdigit() also accepts characters
	    # such as superscript digits that int() cannot parse, which would raise
	    # instead of returning None for an unsupported code.
	    if age_code.startswith('Y') and digits.isdecimal():
	        return str(int(digits))  # 'Y1' -> '1', 'Y99' -> '99'
	    return None



	def age_str_to_int(age_str: str) -> int:
	    """
	    Map an age label to an integer usable as a sort key.

	    Args:
	        age_str (str): The age label. Either a numeric string (e.g., "0",
	            "99"), the exact string "100+", or "TOTAL" in any letter case.

	    Returns:
	        int: The sort rank:
	            - "100+" -> 101, so it sorts after every plain numeric age.
	            - "TOTAL" (case-insensitive) -> 102, so it sorts last.
	            - any other value -> `int(age_str)`.

	    Raises:
	        ValueError: If the label is none of the above and is not an integer
	            string.

	    Examples:
	        age_str_to_int("5") -> 5
	        age_str_to_int("100+") -> 101
	        age_str_to_int("TOTAL") -> 102
	    """
	    if age_str == '100+':
	        # Open-ended top bracket ranks just above the numeric ages
	        return 101
	    is_total = age_str.upper() == 'TOTAL'
	    return 102 if is_total else int(age_str)



	def extract_and_format_data_from_xml_for_web_app(xml_content: str) -> List[Dict[str, str]]:
	    """
	    Extract population observations from an SDMX-ML generic-data XML document.

	    Every `generic:Series` element is read for its `REF_AREA`, `AGE` and `SEX`
	    values, and every `generic:Obs` inside it yields one output row. Rows are
	    returned sorted by time period, then location name, then age.

	    Args:
	        xml_content (str): The XML document as a string.

	    Returns:
	        List[Dict[str, str]]: One dictionary per observation, with the keys:
	            - `location` (str): The location name looked up from the area
	              code, or "Unknown Location" if the code is not known.
	            - `sex` (str): "Male", "Female" or "Total" for codes '1', '2',
	              '9'; "Unknown Sex" otherwise.
	            - `age (years)` (str): The label produced by `transform_age_code`;
	              if that function does not recognise the code, the raw age code
	              is kept as-is.
	            - `time period` (str): The `TIME_PERIOD` value of the observation.
	            - `population` (str): The observation value.

	    Raises:
	        ET.ParseError: If the XML content cannot be parsed.
	        AttributeError: If a series lacks a `REF_AREA`, `AGE` or `SEX` value,
	            or an observation lacks its `TIME_PERIOD` dimension or its
	            `ObsValue` element (the lookup returns `None`).
	        ValueError: If a time period is not an integer string (it is
	            converted with `int` for sorting).

	    Notes:
	        - `assemble_locations` supplies the location records; any error it
	          raises propagates from here.
	        - Rows whose age label cannot be ranked by `age_str_to_int` are sorted
	          after all recognised ages rather than aborting the sort.

	    Example:
	        extract_and_format_data_from_xml_for_web_app(xml_data)
	        -> [{'location': 'Italy', 'sex': 'Male', 'age (years)': '0', 'time period': '2020', 'population': '10000'}, ...]
	    """
	    root = ET.fromstring(xml_content)
	    # Namespace prefixes used in the XPath expressions below
	    ns = {
	        'generic': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic',
	        'message': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message',
	        'common': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common'
	    }
	    # Build a lookup from the value stored under each record's first key to
	    # that key itself. NOTE(review): presumably each record is {name: code},
	    # making this a code -> name map; confirm against the JSONL files.
	    locations = assemble_locations()
	    location_dict = {item[next(iter(item))]: next(iter(item)) for item in locations}
	    # Constant mapping, built once instead of once per series
	    sex_map = {'1': 'Male', '2': 'Female', '9': 'Total'}
	    # Sort rank for age labels that age_str_to_int cannot convert; higher
	    # than the 102 it assigns to the total row.
	    unknown_age_rank = 103

	    def _age_rank(age_label: str) -> int:
	        """Return the sort rank of an age label, tolerating unrecognised labels."""
	        try:
	            return age_str_to_int(age_label)
	        except ValueError:
	            return unknown_age_rank

	    extracted_data = []
	    for series in root.findall('.//generic:Series', ns):
	        # Series-level dimensions shared by all observations of the series
	        ref_area_code = series.find(".//generic:Value[@id='REF_AREA']", ns).get('value')
	        ref_area_name = location_dict.get(ref_area_code, "Unknown Location")
	        age_code = series.find(".//generic:Value[@id='AGE']", ns).get('value')
	        age_description = transform_age_code(age_code)
	        if age_description is None:
	            # Keep the raw code so the row stays usable and sortable
	            age_description = age_code
	        sex_code = series.find(".//generic:Value[@id='SEX']", ns).get('value')
	        sex_description = sex_map.get(sex_code, "Unknown Sex")
	        for obs in series.findall('.//generic:Obs', ns):
	            time_period = obs.find(".//generic:ObsDimension[@id='TIME_PERIOD']", ns).get('value')
	            obs_value = obs.find('.//generic:ObsValue', ns).get('value')
	            extracted_data.append({
	                'location': ref_area_name,
	                'sex': sex_description,
	                'age (years)': age_description,
	                'time period': time_period,
	                'population': obs_value
	            })
	    # Order by time period (numerically), then location, then age rank
	    extracted_data_sorted = sorted(
	        extracted_data,
	        key=lambda x: (int(x['time period']), x['location'], _age_rank(x['age (years)']))
	    )
	    return extracted_data_sorted


	def fetch_population_for_locations_years_sex_age_via_sdmx(
	        location_ids: str = 'IT',
	        sex: str = '9',
	        age: str = 'TOTAL',
	        start_period: str = '2024-01-01',
	        end_period: str = '2024-12-31'
	) -> Optional[List[Dict[str, str]]]:
	    """
	    Query the Istat SDMX web service for resident population figures.

	    The arguments are placed into a fixed query URL, the URL is printed, the
	    XML response is fetched with `query_api` and then converted to rows by
	    `extract_and_format_data_from_xml_for_web_app`.

	    Args:
	        location_ids (str): Geographical identifiers for the locations,
	            concatenated by '+' if multiple. Default is 'IT'.
	        sex (str): The sex category. Options:
	            - '1': Male
	            - '2': Female
	            - '9': Total
	            Multiple values can be combined with '+' (e.g., '1+2').
	            Default is '9'.
	        age (str): The age selection. 'TOTAL' (any letter case) is sent as
	            'TOTAL'; any other value is expanded by `combine_ages` before
	            being placed in the URL. Default is 'TOTAL'.
	        start_period (str): Start date of the requested period, formatted as
	            'YYYY-MM-DD'. Default is '2024-01-01'.
	        end_period (str): End date of the requested period, formatted as
	            'YYYY-MM-DD'. Default is '2024-12-31'.

	    Returns:
	        Optional[List[Dict[str, str]]]: The rows produced by
	            `extract_and_format_data_from_xml_for_web_app`, each with the keys
	            `location`, `sex`, `age (years)`, `time period` and `population`.
	            Returns `None` if `query_api` returns `None` (the request failed).

	    Example (values are illustrative):
	        fetch_population_for_locations_years_sex_age_via_sdmx(
	            'ITC+ITE2+ITF14', '9', 'TOTAL', '2024-01-01', '2024-12-31'
	        )
	        -> [
	            {'location': 'Chieti', 'sex': 'Total', 'age (years)': 'total', 'time period': '2024', 'population': '372640'},
	            ...
	        ]
	    """
	    age_upper = age.upper()
	    combined_age = age_upper if age_upper == "TOTAL" else combine_ages(age)
	    url = (
	        "https://esploradati.istat.it/SDMXWS/rest/data/IT1,22_289_DF_DCIS_POPRES1_1,1.0/"
	        f"A.{location_ids}.JAN.{sex}.{combined_age}.99/ALL/"
	        f"?detail=full&startPeriod={start_period}&endPeriod={end_period}"
	        "&dimensionAtObservation=TIME_PERIOD"
	    )
	    print(url)
	    res = query_api(url)
	    if res is None:
	        # The request failed; query_api has already reported the error
	        return None
	    return extract_and_format_data_from_xml_for_web_app(res)


	# Tool definitions: a list with one entry per callable tool. Each entry
	# names a function and describes its parameters as a JSON-schema-style
	# object ("type", "properties", "required").
	# NOTE(review): presumably passed to a chat model's tool-calling API;
	# confirm against the caller.
	tools=[
	    {
	        "type": "function",
	        "function": {
	            # Must match a key of `tool_functions_map` so the call can be dispatched
	            "name": "fetch_population_for_locations_years_sex_age_via_sdmx",
	            "description": "Fetches population data for specific locations, sex categories, age and time periods using the Istat SDMX web service. Supports multiple locations and sex categories.",
	            "parameters": {
	                "type": "object",
	                "properties": {
	                    "location_ids": {
	                        "type": "string",
	                        "description": "Geographical identifiers for the locations, concatenated by '+' if multiple, e.g., 'ITC+ITE2+ITF14'"
	                    },
	                    "sex": {
	                        "type": "string",
	                        "description": "The sex category for which data is requested. '1' for male, '2' for female, '9' for total. Can be combined with '+', e.g., '1+2+9'"
	                    },
	                    # The age formats described here are the ones `combine_ages` expands
	                    "age": {
	                        "type": "string",
	                        "description": "The age in years for which data is requested. Follow these rules to interpret the age definition: 1. Exact age (e.g., `X years`): Format: `YX`; Example: `0 year` → `Y0`, `1 year` → `Y1`, `10 years` → `Y10`. 2. Age and over (e.g., `X years and over`): Format: `Y_GEX`; Example: `14 years and over` → `Y_GE14`. 3. Until a certain age (e.g., `until X years` or `under X years`): Format: `Y_UNX`; Example: `until 15 years` → `Y_UN15`. 4. Age ranges (e.g., `X-Y years`): Format: `YX-Y`; Example: `14-15 years` → `Y14-15`. 5. TOTAL (representing all ages): Use the code `TOTAL`."
	                    },
	                    "start_period": {
	                        "type": "string",
	                        "description": "The start date of the period for which data is requested, formatted as 'YYYY-MM-DD', e.g., '2024-01-01'. Default is '2024-01-01'."
	                    },
	                    "end_period": {
	                        "type": "string",
	                        "description": "The end date of the period for which data is requested, formatted as 'YYYY-MM-DD', e.g., '2024-12-31'. Default is '2024-12-31'."
	                    }
	                },
	                # All five parameters are marked required here even though the
	                # Python function itself declares a default for each of them.
	                "required": ["location_ids", "sex", "age", "start_period", "end_period"]
	            }
	        }
	    }
	]


	# Dispatch table from a tool name (the "name" field of an entry in `tools`)
	# to the Python callable that implements it.
	tool_functions_map = {
	    'fetch_population_for_locations_years_sex_age_via_sdmx': fetch_population_for_locations_years_sex_age_via_sdmx
	}