Spaces:

capmar00
/

census-newsroom-AIde-api

Sleeping

App Files Files Community

Caporlingua Marina committed on Jan 8

Commit

bc3dd01

1 Parent(s): b45e1ba

update docstrings

Browse files

Files changed (5) hide show

app.py +25 -9
init.py +24 -6
modules/query_api.py +98 -42
modules/tools.py +192 -60
modules/utils.py +43 -9

app.py CHANGED Viewed

@@ -17,20 +17,36 @@ app = FastAPI()
 def greet_json():
     return {"msg" : "Space under construction"}
 @app.post("/eurostat/fetch-dataflows")
-async def trigger_fetch_and_parse(api_key: str = Depends(authenticate)):
-    logging.info("Endpoint /eurostat/fetch-dataflows called.")
     output_file = f"{EUROSTAT_DATA_PATH}/dataflows.jsonl"
-    fetch_and_parse_dataflows(EUROSTAT_API_DATAFLOWS, output_file)
-    return {"message": "Eurostat dataflows fetched and saved successfully", "output_file": output_file}
 @app.post("/istat/fetch-dataflows")
-async def trigger_fetch_and_parse(api_key: str = Depends(authenticate)):
-    logging.info("Endpoint /istat/fetch-dataflows called.")
     output_file = f"{ISTAT_DATA_PATH}/dataflows.jsonl"
-    fetch_and_parse_dataflows(ISTAT_API_DATAFLOWS, output_file)
-    return {"message": "Istat dataflows fetched and saved successfully", "output_file": output_file}

 def greet_json():
     return {"msg" : "Space under construction"}
+async def handle_fetch_and_parse(api_key: str, api_url: str, output_path: str) -> dict:
+    """
+    Function to fetch and parse dataflows from a specified API and save to a JSONL file.
+    Args:
+        api_key (str): API key for authentication.
+        api_url (str): The URL to fetch dataflows from.
+        output_path (str): The file path where the dataflows should be saved.
+    Returns:
+        dict: A response dictionary containing a success message and the output file path.
+    """
+    logging.info(f"Fetching and parsing dataflows from {api_url}.")
+    fetch_and_parse_dataflows(api_url, output_path)
+    return {
+        "message": f"Dataflows fetched and saved successfully",
+        "output_file": output_path,
+    }
 @app.post("/eurostat/fetch-dataflows")
+async def fetch_eurostat_dataflows(api_key: str = Depends(authenticate)) -> dict:
     output_file = f"{EUROSTAT_DATA_PATH}/dataflows.jsonl"
+    return await handle_fetch_and_parse(api_key, EUROSTAT_API_DATAFLOWS, output_file)
 @app.post("/istat/fetch-dataflows")
+async def fetch_istat_dataflows(api_key: str = Depends(authenticate)) -> dict:
     output_file = f"{ISTAT_DATA_PATH}/dataflows.jsonl"
+    return await handle_fetch_and_parse(api_key, ISTAT_API_DATAFLOWS, output_file)

init.py CHANGED Viewed

@@ -1,12 +1,29 @@
 import os
-def create_project_structure(base_path, folders):
     """
-    Creates the folder structure if it doesn't exist.
-    Parameters:
-    base_path (str): The base path where the folders will be created.
-    folders (list): A list of folder paths to create within the base path.
     """
     for folder in folders:
         folder_path = os.path.join(base_path, folder)
@@ -23,7 +40,8 @@ folders = [
     "data/eurostat/",
     "data/istat/",
     "modules/",
-    "schemas/"
 ]
 create_project_structure(base_path, folders)

 import os
+from typing import List
+def create_project_structure(base_path: str, folders: List[str]) -> None:
     """
+    Creates a folder structure within the specified base path if it doesn't already exist.
+    This function iterates over a list of folder paths and creates the corresponding directories
+    within the specified base path. If a folder already exists, it skips creation without errors.
+    Args:
+        base_path (str): The base directory where the folders will be created.
+        folders (List[str]): A list of folder paths (relative to `base_path`) to be created.
+    Returns:
+        None: This function does not return a value.
+    Raises:
+        OSError: If there is an issue creating any of the directories, an error message is printed.
+    Example:
+        create_project_structure("/home/user/project", ["data", "logs", "output"])
+        This will create the following structure if it doesn't already exist:
+            /home/user/project/data
+            /home/user/project/logs
+            /home/user/project/output
     """
     for folder in folders:
         folder_path = os.path.join(base_path, folder)
     "data/eurostat/",
     "data/istat/",
     "modules/",
+    "schemas/",
+    "test"
 ]
 create_project_structure(base_path, folders)

modules/query_api.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import requests
 import xml.etree.ElementTree as ET
 import json
@@ -9,20 +10,21 @@ from modules.shared import *
 ####### json and jsonl functions  ###############
 #################################################
-def save_as_jsonl(data, output_file):
     """
     Saves a list of dictionaries to a file in JSON Lines (JSONL) format.
     Each dictionary in the provided list `data` is serialized to a JSON object
-    and written to the specified file, with one JSON object per line.
     Args:
         data (list): A list of dictionaries to be saved as JSON Lines.
         output_file (str): The path to the file where the data should be saved.
     Raises:
-        IOError: If there is an issue opening or writing to the file, an error
-                 message will be printed.
     Example:
         save_as_jsonl([{'key1': 'value1'}, {'key2': 'value2'}], 'output.jsonl')
@@ -45,25 +47,25 @@ def save_as_jsonl(data, output_file):
 ################# API functions  #######################################################
 ######## encapsulated functions with underscore to indicate private/internal use #######
 ########################################################################################
-def query_api(url):
     """
-        Sends a GET request to the specified URL and returns the response content as a string.
-        The function attempts to fetch the content from the provided URL using a GET request.
-        It handles HTTP errors and returns the content of the response if successful.
-        The response encoding is explicitly set to 'utf-8'.
-        Args:
-            url (str): The URL to which the GET request is sent.
-        Returns:
-            str: The content of the response as a string if the request is successful.
-            None: If an error occurs during the request, the function returns None and prints an error message.
-        Raises:
-            requests.RequestException: If there is an issue with the request, such as a network problem or
-                                       a non-2xx HTTP status code, an error message is printed.
-        """
     try:
         response = requests.get(url)
         response.raise_for_status()  # Raises an HTTPError for bad responses
@@ -75,7 +77,30 @@ def query_api(url):
-def _parse_dataflows(xml_data):
     try:
         root = ET.fromstring(xml_data)
         ns = {
@@ -109,35 +134,66 @@ def _parse_dataflows(xml_data):
         return None
-def _filter_and_save_dataflows(dataflows, filter_ids, output_file):
     filtered_dataflows = [df for df in dataflows if df['dataflow_id'] in filter_ids]
     save_as_jsonl(filtered_dataflows, output_file)
-def fetch_and_parse_dataflows(url, output_file):
     """
-        Parses XML data to extract information about dataflows.
-        The function processes the provided XML string to extract details about dataflows,
-        including their ID, version, English name, and associated data structure ID. It returns
-        a list of dictionaries containing this information.
-        Args:
-            xml_data (str): A string containing the XML data to be parsed.
-        Returns:
-            list: A list of dictionaries, where each dictionary represents a dataflow and
-                  contains the following keys:
-                  - 'dataflow_id': The ID of the dataflow.
-                  - 'version': The version of the dataflow.
-                  - 'name': The English name of the dataflow (or "No English name" if not found).
-                  - 'datastructure_id': The associated data structure ID (or "No Ref ID" if not found).
-            None: If an XML parsing error occurs, the function returns None and prints an error message.
-        Raises:
-            xml.etree.ElementTree.ParseError: If the XML data cannot be parsed, an error message is printed.
-        """
     xml_data = query_api(url)
     if xml_data:
         dataflows = _parse_dataflows(xml_data)

+from typing import List, Dict, Optional, Set
 import requests
 import xml.etree.ElementTree as ET
 import json
 ####### json and jsonl functions  ###############
 #################################################
+def save_as_jsonl(data: list[dict], output_file: str) -> None:
     """
     Saves a list of dictionaries to a file in JSON Lines (JSONL) format.
     Each dictionary in the provided list `data` is serialized to a JSON object
+    and written to the specified file, with one JSON object per line. Non-ASCII
+    characters are preserved.
     Args:
         data (list): A list of dictionaries to be saved as JSON Lines.
         output_file (str): The path to the file where the data should be saved.
     Raises:
+        IOError: If there is an issue opening or writing to the file. Instead of
+                 raising the exception, an error message is printed.
     Example:
         save_as_jsonl([{'key1': 'value1'}, {'key2': 'value2'}], 'output.jsonl')
 ################# API functions  #######################################################
 ######## encapsulated functions with underscore to indicate private/internal use #######
 ########################################################################################
+def query_api(url: str) -> Optional[str]:
     """
+    Sends a GET request to the specified URL and returns the response content as a string.
+    This function attempts to fetch content from the provided URL using a GET request.
+    If the request is successful, the content of the response is returned as a string with UTF-8 encoding.
+    If an error occurs (e.g., network issues, invalid URL, or non-2xx status code), the function returns `None`
+    and prints an error message.
+    Args:
+        url (str): The URL to which the GET request is sent. This should be a valid HTTP or HTTPS URL.
+    Returns:
+        Optional[str]: The content of the response as a string if the request is successful, or `None` if an error occurs.
+    Raises:
+        requests.RequestException: Raised internally for issues with the request (e.g., network problems,
+                                   non-2xx HTTP status codes). An error message is printed instead of propagating the exception.
+    """
     try:
         response = requests.get(url)
         response.raise_for_status()  # Raises an HTTPError for bad responses
+def _parse_dataflows(xml_data: str) -> Optional[List[Dict[str, Optional[str]]]]:
+    """
+    Parses dataflows from an SDMX-ML XML string and extracts relevant details.
+    This function processes an XML string containing SDMX-ML data, identifies `Dataflow` elements,
+    and extracts their attributes and names in English or Italian (if available). Each dataflow is
+    represented as a dictionary with keys: `dataflow_id`, `version`, `name`, and `datastructure_id`.
+    Args:
+        xml_data (str): The XML data as a string to be parsed.
+    Returns:
+        Optional[List[Dict[str, Optional[str]]]]:
+            A list of dictionaries representing dataflows. Each dictionary contains:
+                - `dataflow_id` (str or None): The ID of the dataflow.
+                - `version` (str or None): The version of the dataflow.
+                - `name` (str): The name of the dataflow in English, or fallback to Italian, or a default message.
+                - `datastructure_id` (str or None): The ID of the referenced datastructure.
+            Returns `None` if a parsing error occurs.
+    Raises:
+        ET.ParseError: If the XML data is malformed or cannot be parsed, the function catches this
+                       exception, prints an error message, and returns `None`.
+    """
     try:
         root = ET.fromstring(xml_data)
         ns = {
         return None
+def _filter_and_save_dataflows(
+    dataflows: List[Dict[str, str]],
+    filter_ids: Set[str],
+    output_file: str
+    ) -> None:
+    """
+    Filters a list of dataflows by their IDs and saves the filtered data to a JSON Lines (JSONL) file.
+    This function takes a list of dataflows, filters them based on the provided set of `filter_ids`,
+    and writes the resulting filtered dataflows to the specified output file in JSON Lines format.
+    Args:
+        dataflows (List[Dict[str, str]]): A list of dictionaries representing dataflows.
+            Each dictionary must include the key `dataflow_id`.
+        filter_ids (Set[str]): A set of dataflow IDs to filter for. Only dataflows with IDs in this set
+            are included in the output.
+        output_file (str): The path to the file where the filtered dataflows should be saved in JSONL format.
+    Returns:
+        None: This function does not return a value.
+    Raises:
+        KeyError: If any dataflow dictionary does not contain the `dataflow_id` key, an error will occur
+                  during filtering.
+        IOError: Not propagated from here: per its docstring, `save_as_jsonl` prints an error message
+                 instead of raising when the output file cannot be opened or written.
+    """
     filtered_dataflows = [df for df in dataflows if df['dataflow_id'] in filter_ids]
     save_as_jsonl(filtered_dataflows, output_file)
+def fetch_and_parse_dataflows(url: str, output_file: str) -> None:
+    """
+    Fetches XML data from a given URL, parses it to extract dataflow information, and saves the results.
+    This function performs the following steps:
+    1. Fetches XML data from the specified URL using `query_api`.
+    2. Parses the XML data to extract details about dataflows (e.g., ID, version, English name with
+       Italian fallback, and associated data structure ID) using `_parse_dataflows`.
+    3. Saves the parsed dataflows to the specified output file in JSON Lines format using `save_as_jsonl`.
+    4. Filters and saves dataflows with specific IDs to an additional file using `_filter_and_save_dataflows`.
+    Args:
+        url (str): The URL to fetch the XML data from.
+        output_file (str): The path to the file where parsed dataflows should be saved in JSON Lines format.
+    Returns:
+        None: This function does not return a value.
+    Raises:
+        ET.ParseError: If the XML data cannot be parsed, an error message is printed, and no data is saved.
+        IOError: If there is an issue saving the dataflows to the output file; `save_as_jsonl` prints an error message instead of raising.
+        KeyError: If a required key is missing during filtering in `_filter_and_save_dataflows`.
+    Notes:
+        - The `useful_dataflow_ids` variable must be defined externally and contain a set of dataflow IDs to filter.
+        - The `query_api`, `_parse_dataflows`, `save_as_jsonl`, and `_filter_and_save_dataflows` helpers are all
+          defined in this module.
     """
     xml_data = query_api(url)
     if xml_data:
         dataflows = _parse_dataflows(xml_data)

modules/tools.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from collections import defaultdict
 import requests
 import json
@@ -7,37 +8,68 @@ current_dir = os.getcwd()
 sys.path.insert(0, current_dir)
 from modules.shared import *
-def assemble_locations():
     file_names = ['_geographic_areas', '_regions', '_provinces']
-    locations = []  # Initialize the list to store results
     for name in file_names:
         file_path = f"{ISTAT_DATA_PATH}/{name}.jsonl"
-        with open(file_path, 'r', encoding='utf-8') as file:  # Ensure proper handling of the file opening
             for line in file:
                 data = json.loads(line)
                 locations.append(data)
     return locations
-def query_api(url):
     """
-        Sends a GET request to the specified URL and returns the response content as a string.
-        The function attempts to fetch the content from the provided URL using a GET request.
-        It handles HTTP errors and returns the content of the response if successful.
-        The response encoding is explicitly set to 'utf-8'.
-        Args:
-            url (str): The URL to which the GET request is sent.
-        Returns:
-            str: The content of the response as a string if the request is successful.
-            None: If an error occurs during the request, the function returns None and prints an error message.
-        Raises:
-            requests.RequestException: If there is an issue with the request, such as a network problem or
-                                       a non-2xx HTTP status code, an error message is printed.
-        """
     try:
         response = requests.get(url)
         response.raise_for_status()  # Raises an HTTPError for bad responses
@@ -49,25 +81,37 @@ def query_api(url):
-def combine_ages(age_code):
     """
-    Generate a string representing age codes based on the provided age_code.
     Args:
-    age_code (str): The input code representing an age or age group. The code can be one of the following formats:
-        - YX: Represents an exact age (e.g., "Y65").
-        - Y_GEX: Represents ages X and over up to 100+ (e.g., "Y_GE18").
-        - Y_UNX: Represents all ages until X (e.g., "Y_UN18").
-        - YX-Z: Represents a range of ages from X to Z (e.g., "Y23-42").
     Returns:
-    str: A string representing the sequence of age codes, joined by " + ".
     Rules:
-    1. YX -> Return the same string (e.g., "Y65" -> "Y65").
-    2. Y_GEX -> Generate a sequence starting from YX to Y_GE100 (e.g., "Y_GE18" -> "Y18 + Y19 + ... + Y_GE100").
-    3. Y_UNX -> Generate a sequence from Y0 to Y(X-1) (e.g., "Y_UN18" -> "Y0 + Y1 + ... + Y17").
-    4. YX-Z -> Generate a sequence from YX to YZ (e.g., "Y23-42" -> "Y23 + Y24 + ... + Y42").
     """
     if age_code.startswith("Y_"):
         if age_code.startswith("Y_GE"):
@@ -91,7 +135,28 @@ def combine_ages(age_code):
         return age_code
-def transform_age_code(age_code):
     if age_code == 'TOTAL':
         return 'total'
     elif age_code == 'Y_GE100':
@@ -103,33 +168,72 @@ def transform_age_code(age_code):
-def age_str_to_int(age_str):
     """
-    Custom sorting function for age strings.
     Args:
-    age_str (str): The age value as a string. This can be a numeric value (e.g., "0", "1", "99")
-                   or the special value "100+".
     Returns:
-    int: A numeric value used for sorting. For the special case "100+", it returns 101
-         to ensure that it is sorted after all other numeric ages.
-         For the special case "TOTAL", it returns 102. For numeric values,
-         it returns the integer equivalent of the age string.
-    Example usage:
-    age_key("5") -> 5
-    age_key("100+") -> 101
     """
     if age_str == '100+':
         return 101  # Assign a high value so it sorts last
     if age_str.upper() == 'TOTAL':
         return 102
-    return int(age_str)  # Convert numeric age strings to integers
-def extract_and_format_data_from_xml_for_streamlit_app(xml_content):
     # Parse the XML content
     root = ET.fromstring(xml_content)
     # Define namespaces for the XML structure
@@ -176,27 +280,55 @@ def extract_and_format_data_from_xml_for_streamlit_app(xml_content):
     return extracted_data_sorted
-def fetch_population_for_locations_years_sex_age_via_sdmx(location_ids='IT', sex='9', age='TOTAL', start_period='2024-01-01',
-                                                     end_period='2024-12-31'):
     """
-    Fetches population data for specific locations, time periods, and sex categories using the Istat SDMX web service.
-    Args:
-        location_ids (str): The geographical identifiers for the locations concatenated by '+' if multiple. Default is 'IT' for Italy.
-        sex (str): The sex category for which data is requested. '1' for male, '2' for female, '9' for total. Can be combined with '+'. Default is '9' for total
-        age (str): The age in years for which data is requested. From 'Y0' to 'Y99', 'Y_GE100' for 100 years and above, 'TOTAL' for total. Can be combined with '+'. Default is 'TOTAL' for total
-        start_period (str): The start date of the period for which data is requested, formatted as 'YYYY-MM-DD'. Default is '2024-01-01'.
-        end_period (str): The end date of the period for which data is requested, formatted as 'YYYY-MM-DD'. Default is '2024-12-31'.
     Returns:
-        list: A list of dictionaries containing the population data with reference area, time period, and observation value.
-    Example of use:
-        fetch_population_for_locations_years_sex_age_via_sdmx('ITC+ITE2+ITF14', '9', 'TOTAL', '2024-01-01', '2024-12-31')
-        [{'location': 'Nord-ovest', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '15858626'},
-         {'location': 'Umbria', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '856407'},
-         {'location': 'Chieti', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '372640'}]
     """
     if age.upper() == "TOTAL":
         combined_age = age.upper()

+from typing import List, Dict, Optional, Union
 from collections import defaultdict
 import requests
 import json
 sys.path.insert(0, current_dir)
 from modules.shared import *
+def assemble_locations() -> List[Dict]:
+    """
+    Reads JSON Lines (JSONL) files containing location data and combines their contents into a single list.
+    This function processes multiple JSONL files stored in the `ISTAT_DATA_PATH` directory,
+    each representing a type of location (e.g., geographic areas, regions, provinces).
+    The contents of these files are read, deserialized, and aggregated into a single list.
+    Args:
+        None: This function does not take any arguments. It relies on a predefined
+              `ISTAT_DATA_PATH` constant and file naming convention.
+    Returns:
+        List[Dict]: A list of dictionaries, where each dictionary represents a location
+                    extracted from the input JSONL files.
+    Raises:
+        FileNotFoundError: If any of the expected JSONL files are missing.
+        IOError: If there are issues reading the files.
+        json.JSONDecodeError: If a line in any of the files cannot be parsed as valid JSON.
+    Notes:
+        - The `ISTAT_DATA_PATH` constant must be defined elsewhere in the code.
+        - File names are expected to follow the naming convention specified in the `file_names` list.
+    """
     file_names = ['_geographic_areas', '_regions', '_provinces']
+    locations = []
     for name in file_names:
         file_path = f"{ISTAT_DATA_PATH}/{name}.jsonl"
+        with open(file_path, 'r', encoding='utf-8') as file:
             for line in file:
                 data = json.loads(line)
                 locations.append(data)
     return locations
+def query_api(url: str) -> Optional[str]:
     """
+    Sends a GET request to the specified URL and returns the response content as a string.
+    This function attempts to fetch the content from the provided URL using a GET request. If the request
+    is successful, the content of the response is returned as a UTF-8 encoded string. If an error occurs
+    (e.g., network issues, invalid URL, or a non-2xx status code), the function prints an error message
+    and returns `None`.
+    Args:
+        url (str): The URL to which the GET request is sent. This should be a valid HTTP or HTTPS URL.
+    Returns:
+        Optional[str]: The content of the response as a UTF-8 encoded string if the request is successful,
+                       or `None` if an error occurs.
+    Raises:
+        requests.RequestException: This exception is caught internally if there is an issue with the
+                                   request (e.g., network problem, invalid URL, or non-2xx HTTP status code).
+                                   Instead of propagating the exception, an error message is printed.
+    Notes:
+        - Ensure that the `requests` library is installed in your environment to use this function.
+    """
     try:
         response = requests.get(url)
         response.raise_for_status()  # Raises an HTTPError for bad responses
+def combine_ages(age_code: str) -> str:
     """
+    Generates a sequence of age codes based on the provided `age_code`.
+    This function interprets different formats of age codes and generates a sequence
+    of age codes as a string, joined by " + ".
     Args:
+        age_code (str): The input code representing an age or age group. Supported formats:
+            - `YX`: Represents an exact age (e.g., "Y65").
+            - `Y_GEX`: Represents ages X and over up to 100+ (e.g., "Y_GE18").
+            - `Y_UNX`: Represents all ages until X (e.g., "Y_UN18").
+            - `YX-Z`: Represents a range of ages from X to Z (e.g., "Y23-42").
     Returns:
+        str: A string representing the sequence of age codes, joined by " + ".
     Rules:
+        1. `YX` -> Return the same string (e.g., "Y65" -> "Y65").
+        2. `Y_GEX` -> Generate a sequence starting from `YX` to `Y_GE100`
+           (e.g., "Y_GE18" -> "Y18 + Y19 + ... + Y_GE100").
+        3. `Y_UNX` -> Generate a sequence from `Y0` to `Y(X-1)`
+           (e.g., "Y_UN18" -> "Y0 + Y1 + ... + Y17").
+        4. `YX-Z` -> Generate a sequence from `YX` to `YZ`
+           (e.g., "Y23-42" -> "Y23 + Y24 + ... + Y42").
+    Example:
+        combine_ages("Y_GE18") -> "Y18 + Y19 + ... + Y_GE100"
+        combine_ages("Y_UN18") -> "Y0 + Y1 + ... + Y17"
+        combine_ages("Y23-25") -> "Y23 + Y24 + Y25"
+        combine_ages("Y65") -> "Y65"
     """
     if age_code.startswith("Y_"):
         if age_code.startswith("Y_GE"):
         return age_code
+def transform_age_code(age_code: str) -> Optional[str]:
+    """
+    Transforms an age code into a simplified human-readable format.
+    This function converts various age codes into a more user-friendly format:
+    - "TOTAL" becomes "total".
+    - "Y_GE100" becomes "100+".
+    - Age codes in the format "Y<number>" (e.g., "Y0", "Y99") are converted to their numeric representation (e.g., "0", "99").
+    - For unsupported formats, the function returns `None`.
+    Args:
+        age_code (str): The age code to be transformed.
+    Returns:
+        Optional[str]: The transformed age code, or `None` if the input is not a recognized format.
+    Examples:
+        transform_age_code("TOTAL") -> "total"
+        transform_age_code("Y_GE100") -> "100+"
+        transform_age_code("Y25") -> "25"
+        transform_age_code("INVALID") -> None
+    """
     if age_code == 'TOTAL':
         return 'total'
     elif age_code == 'Y_GE100':
+def age_str_to_int(age_str: str) -> int:
     """
+    Converts an age string into an integer for custom sorting.
+    This function maps age strings to integer values for sorting purposes:
+    - Numeric strings (e.g., "0", "1", "99") are converted to their integer equivalents.
+    - The special value "100+" is mapped to 101 to ensure it sorts after other numeric ages.
+    - The special value "TOTAL" (matched case-insensitively) is mapped to 102 to ensure it sorts after all other values.
     Args:
+        age_str (str): The age value as a string. This can be:
+            - A numeric value (e.g., "0", "1", "99").
+            - The special value "100+".
+            - The special value "TOTAL".
     Returns:
+        int: A numeric value for sorting:
+            - Numeric strings are converted to integers.
+            - "100+" is mapped to 101.
+            - "TOTAL" is mapped to 102.
+    Examples:
+        age_str_to_int("5") -> 5
+        age_str_to_int("100+") -> 101
+        age_str_to_int("TOTAL") -> 102
     """
     if age_str == '100+':
         return 101  # Assign a high value so it sorts last
     if age_str.upper() == 'TOTAL':
         return 102
+    return int(age_str)
+def extract_and_format_data_from_xml_for_streamlit_app(xml_content: str) -> List[Dict[str, str]]:
+    """
+    Extracts and formats data from an SDMX-ML XML document for use in a Streamlit app.
+    This function parses XML content, extracts demographic data (e.g., location, sex, age, time period, and population),
+    and formats it into a list of dictionaries sorted by time period, location, and age.
+    Args:
+        xml_content (str): The XML content as a string to be parsed.
+    Returns:
+        List[Dict[str, str]]: A list of dictionaries representing the extracted and formatted data.
+        Each dictionary contains the following keys:
+            - `location` (str): The name of the location.
+            - `sex` (str): The descriptive sex (e.g., "Male", "Female", "Total").
+            - `age (years)` (str): The age group or exact age as a human-readable string.
+            - `time period` (str): The time period of the observation.
+            - `population` (str): The observed population value.
+    Raises:
+        ET.ParseError: If the XML content cannot be parsed.
+        KeyError: If required fields are missing in the XML structure.
+    Notes:
+        - The `assemble_locations` function must be defined to provide a location dictionary.
+        - The `transform_age_code` function is used to convert age codes into human-readable descriptions.
+        - The `age_str_to_int` function is used to ensure proper sorting of age strings.
+    Example:
+        extract_and_format_data_from_xml_for_streamlit_app(xml_data)
+        -> [{'location': 'Italy', 'sex': 'Male', 'age (years)': '0', 'time period': '2020', 'population': '10000'}, ...]
+    """
     # Parse the XML content
     root = ET.fromstring(xml_content)
     # Define namespaces for the XML structure
     return extracted_data_sorted
+def fetch_population_for_locations_years_sex_age_via_sdmx(
+    location_ids: str = 'IT',
+    sex: str = '9',
+    age: str = 'TOTAL',
+    start_period: str = '2024-01-01',
+    end_period: str = '2024-12-31'
+    ) -> Optional[List[Dict[str, str]]]:
     """
+    Fetches population data for specific locations, time periods, and demographics using the Istat SDMX web service.
+    This function constructs a query URL based on the provided parameters and retrieves population data
+    in XML format. The data is parsed, formatted, and returned as a list of dictionaries.
+    Args:
+        location_ids (str): Geographical identifiers for the locations, concatenated by '+' if multiple.
+                            Default is 'IT' for Italy.
+        sex (str): The sex category for which data is requested. Options:
+                   - '1': Male
+                   - '2': Female
+                   - '9': Total
+                   Multiple values can be combined with '+' (e.g., '1+2'). Default is '9'.
+        age (str): The age category for which data is requested. Options:
+                   - 'Y0' to 'Y99': Specific ages
+                   - 'Y_GE100': 100 years and above
+                   - 'TOTAL': Total (all ages)
+                   Multiple values can be combined with '+'. Default is 'TOTAL'.
+        start_period (str): The start date of the period for which data is requested, formatted as 'YYYY-MM-DD'.
+                            Default is '2024-01-01'.
+        end_period (str): The end date of the period for which data is requested, formatted as 'YYYY-MM-DD'.
+                          Default is '2024-12-31'.
     Returns:
+        Optional[List[Dict[str, str]]]: A list of dictionaries containing population data. Each dictionary includes:
+            - `location`: The name of the location.
+            - `sex`: The demographic category for sex (e.g., "Male", "Female", "Total").
+            - `age`: The age group or category.
+            - `time period`: The year of the observation.
+            - `population`: The observed population value.
+        Returns `None` if the query fails.
+    Example:
+        fetch_population_for_locations_years_sex_age_via_sdmx(
+            'ITC+ITE2+ITF14', '9', 'TOTAL', '2024-01-01', '2024-12-31'
+        )
+        -> [
+            {'location': 'Nord-ovest', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '15858626'},
+            {'location': 'Umbria', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '856407'},
+            {'location': 'Chieti', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '372640'}
+        ]
     """
     if age.upper() == "TOTAL":
         combined_age = age.upper()

modules/utils.py CHANGED Viewed

@@ -1,25 +1,59 @@
 import json
-def remove_newlines_and_spaces(s):
-    # Replace newlines with empty strings
     s = s.replace('\n', '')
-    # Remove all spaces
     s = s.replace(' ', '')
     return s
-def read_jsonl_file(file_path, type='str'):
     """
-    Fetches a list of ISTAT datasets from a JSONL file and returns the data as a formatted JSON string.
-    This function reads a JSON Lines (JSONL) file. It processes each line in the file,
-    which represents a JSON object, and compiles these objects into a list. The list is then
-    converted into a JSON string with pretty formatting (indented by 2 spaces) and returned.
     Returns:
-        str: A JSON string representing the list of datasets from the JSONL file.
     """
     data_list = []
     with open(file_path, 'r', encoding='utf-8') as file:

+from typing import Union, List, Dict
 import json
+def remove_newlines_and_spaces(s: str) -> str:
+    """
+    Removes all newlines and spaces from the input string.
+    This function replaces newline characters (`\n`) with an empty string
+    and removes all spaces from the given string.
+    Args:
+        s (str): The input string from which newlines and spaces will be removed.
+    Returns:
+        str: The processed string with all newlines and spaces removed.
+    Examples:
+        remove_newlines_and_spaces("Hello\nWorld") -> "HelloWorld"
+        remove_newlines_and_spaces(" Python  Programming ") -> "PythonProgramming"
+    """
     s = s.replace('\n', '')
     s = s.replace(' ', '')
     return s
+def read_jsonl_file(file_path: str, type: str = 'str') -> Union[str, List[Dict]]:
     """
+    Reads data from a JSON Lines (JSONL) file and returns it in the specified format.
+    This function processes a JSONL file, where each line represents a JSON object. It compiles
+    these objects into a list. Depending on the `type` parameter, the function either returns
+    a formatted JSON string (with all newlines and spaces removed) or a list of dictionaries.
+    Args:
+        file_path (str): The path to the JSONL file to be read.
+        type (str): Specifies the return format. Options:
+                    - `'str'`: Returns the data as a single JSON string (default).
+                    - `'list'`: Returns the data as a list of dictionaries.
     Returns:
+        Union[str, List[Dict]]: The processed data from the JSONL file:
+            - If `type='str'`, returns a JSON string with all newlines and spaces removed.
+            - If `type='list'`, returns a list of dictionaries.
+    Raises:
+        FileNotFoundError: If the file at `file_path` does not exist.
+        json.JSONDecodeError: If a line in the file cannot be parsed as valid JSON.
+    Examples:
+        read_jsonl_file("data.jsonl", type="str")
+        -> '[{"key1":"value1"},{"key2":"value2"}]'
+        read_jsonl_file("data.jsonl", type="list")
+        -> [{"key1": "value1"}, {"key2": "value2"}]
     """
     data_list = []
     with open(file_path, 'r', encoding='utf-8') as file: