Caporlingua Marina committed on
Commit
bc3dd01
·
1 Parent(s): b45e1ba

update docstrings

Browse files
Files changed (5) hide show
  1. app.py +25 -9
  2. init.py +24 -6
  3. modules/query_api.py +98 -42
  4. modules/tools.py +192 -60
  5. modules/utils.py +43 -9
app.py CHANGED
@@ -17,20 +17,36 @@ app = FastAPI()
17
  def greet_json():
18
  return {"msg" : "Space under construction"}
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  @app.post("/eurostat/fetch-dataflows")
21
- async def trigger_fetch_and_parse(api_key: str = Depends(authenticate)):
22
- logging.info("Endpoint /eurostat/fetch-dataflows called.")
23
  output_file = f"{EUROSTAT_DATA_PATH}/dataflows.jsonl"
24
- fetch_and_parse_dataflows(EUROSTAT_API_DATAFLOWS, output_file)
25
- return {"message": "Eurostat dataflows fetched and saved successfully", "output_file": output_file}
26
-
27
 
28
  @app.post("/istat/fetch-dataflows")
29
- async def trigger_fetch_and_parse(api_key: str = Depends(authenticate)):
30
- logging.info("Endpoint /istat/fetch-dataflows called.")
31
  output_file = f"{ISTAT_DATA_PATH}/dataflows.jsonl"
32
- fetch_and_parse_dataflows(ISTAT_API_DATAFLOWS, output_file)
33
- return {"message": "Istat dataflows fetched and saved successfully", "output_file": output_file}
34
 
35
 
36
 
 
17
def greet_json():
    """Return a placeholder payload while the Space is under construction."""
    return {"msg": "Space under construction"}
19
 
20
+
21
async def handle_fetch_and_parse(api_key: str, api_url: str, output_path: str) -> dict:
    """
    Fetch and parse dataflows from a specified API and save them to a JSONL file.

    Args:
        api_key (str): Credential of the authenticated caller. Not used in the
            body; kept so endpoint handlers can forward their auth dependency.
        api_url (str): The URL to fetch dataflows from.
        output_path (str): The file path where the dataflows should be saved.

    Returns:
        dict: A response dictionary containing a success message and the
        output file path.
    """
    logging.info(f"Fetching and parsing dataflows from {api_url}.")
    fetch_and_parse_dataflows(api_url, output_path)
    return {
        # Plain literal: the original used an f-string with no placeholders.
        "message": "Dataflows fetched and saved successfully",
        "output_file": output_path,
    }
39
+
40
+
41
@app.post("/eurostat/fetch-dataflows")
async def fetch_eurostat_dataflows(api_key: str = Depends(authenticate)) -> dict:
    """
    POST endpoint: fetch Eurostat dataflows and persist them as JSONL.

    Args:
        api_key (str): Credential injected by the `authenticate` dependency.

    Returns:
        dict: Success message plus the path of the written file.
    """
    destination = f"{EUROSTAT_DATA_PATH}/dataflows.jsonl"
    return await handle_fetch_and_parse(api_key, EUROSTAT_API_DATAFLOWS, destination)
 
 
45
 
46
@app.post("/istat/fetch-dataflows")
async def fetch_istat_dataflows(api_key: str = Depends(authenticate)) -> dict:
    """
    POST endpoint: fetch Istat dataflows and persist them as JSONL.

    Args:
        api_key (str): Credential injected by the `authenticate` dependency.

    Returns:
        dict: Success message plus the path of the written file.
    """
    destination = f"{ISTAT_DATA_PATH}/dataflows.jsonl"
    return await handle_fetch_and_parse(api_key, ISTAT_API_DATAFLOWS, destination)
 
50
 
51
 
52
 
init.py CHANGED
@@ -1,12 +1,29 @@
1
  import os
 
2
 
3
- def create_project_structure(base_path, folders):
4
  """
5
- Creates the folder structure if it doesn't exist.
6
 
7
- Parameters:
8
- base_path (str): The base path where the folders will be created.
9
- folders (list): A list of folder paths to create within the base path.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  """
11
  for folder in folders:
12
  folder_path = os.path.join(base_path, folder)
@@ -23,7 +40,8 @@ folders = [
23
  "data/eurostat/",
24
  "data/istat/",
25
  "modules/",
26
- "schemas/"
 
27
  ]
28
 
29
  create_project_structure(base_path, folders)
 
1
  import os
2
+ from typing import List
3
 
4
+ def create_project_structure(base_path: str, folders: List[str]) -> None:
5
  """
6
+ Creates a folder structure within the specified base path if it doesn't already exist.
7
 
8
+ This function iterates over a list of folder paths and creates the corresponding directories
9
+ within the specified base path. If a folder already exists, it skips creation without errors.
10
+
11
+ Args:
12
+ base_path (str): The base directory where the folders will be created.
13
+ folders (List[str]): A list of folder paths (relative to `base_path`) to be created.
14
+
15
+ Returns:
16
+ None: This function does not return a value.
17
+
18
+ Raises:
19
+ OSError: If there is an issue creating any of the directories, an error message is printed.
20
+
21
+ Example:
22
+ create_project_structure("/home/user/project", ["data", "logs", "output"])
23
+ This will create the following structure if it doesn't already exist:
24
+ /home/user/project/data
25
+ /home/user/project/logs
26
+ /home/user/project/output
27
  """
28
  for folder in folders:
29
  folder_path = os.path.join(base_path, folder)
 
40
  "data/eurostat/",
41
  "data/istat/",
42
  "modules/",
43
+ "schemas/",
44
+ "test"
45
  ]
46
 
47
  create_project_structure(base_path, folders)
modules/query_api.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import requests
2
  import xml.etree.ElementTree as ET
3
  import json
@@ -9,20 +10,21 @@ from modules.shared import *
9
  ####### json and jsonl functions ###############
10
  #################################################
11
 
12
- def save_as_jsonl(data, output_file):
13
  """
14
  Saves a list of dictionaries to a file in JSON Lines (JSONL) format.
15
 
16
  Each dictionary in the provided list `data` is serialized to a JSON object
17
- and written to the specified file, with one JSON object per line.
 
18
 
19
  Args:
20
  data (list): A list of dictionaries to be saved as JSON Lines.
21
  output_file (str): The path to the file where the data should be saved.
22
 
23
  Raises:
24
- IOError: If there is an issue opening or writing to the file, an error
25
- message will be printed.
26
 
27
  Example:
28
  save_as_jsonl([{'key1': 'value1'}, {'key2': 'value2'}], 'output.jsonl')
@@ -45,25 +47,25 @@ def save_as_jsonl(data, output_file):
45
  ################# API functions #######################################################
46
  ######## encapsulated functions with underscore to indicate private/internal use #######
47
  ########################################################################################
48
- def query_api(url):
49
  """
50
- Sends a GET request to the specified URL and returns the response content as a string.
51
 
52
- The function attempts to fetch the content from the provided URL using a GET request.
53
- It handles HTTP errors and returns the content of the response if successful.
54
- The response encoding is explicitly set to 'utf-8'.
 
55
 
56
- Args:
57
- url (str): The URL to which the GET request is sent.
58
 
59
- Returns:
60
- str: The content of the response as a string if the request is successful.
61
- None: If an error occurs during the request, the function returns None and prints an error message.
62
 
63
- Raises:
64
- requests.RequestException: If there is an issue with the request, such as a network problem or
65
- a non-2xx HTTP status code, an error message is printed.
66
- """
67
  try:
68
  response = requests.get(url)
69
  response.raise_for_status() # Raises an HTTPError for bad responses
@@ -75,7 +77,30 @@ def query_api(url):
75
 
76
 
77
 
78
- def _parse_dataflows(xml_data):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  try:
80
  root = ET.fromstring(xml_data)
81
  ns = {
@@ -109,35 +134,66 @@ def _parse_dataflows(xml_data):
109
  return None
110
 
111
 
112
- def _filter_and_save_dataflows(dataflows, filter_ids, output_file):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  filtered_dataflows = [df for df in dataflows if df['dataflow_id'] in filter_ids]
114
  save_as_jsonl(filtered_dataflows, output_file)
115
 
116
 
117
 
118
- def fetch_and_parse_dataflows(url, output_file):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  """
120
- Parses XML data to extract information about dataflows.
121
-
122
- The function processes the provided XML string to extract details about dataflows,
123
- including their ID, version, English name, and associated data structure ID. It returns
124
- a list of dictionaries containing this information.
125
-
126
- Args:
127
- xml_data (str): A string containing the XML data to be parsed.
128
-
129
- Returns:
130
- list: A list of dictionaries, where each dictionary represents a dataflow and
131
- contains the following keys:
132
- - 'dataflow_id': The ID of the dataflow.
133
- - 'version': The version of the dataflow.
134
- - 'name': The English name of the dataflow (or "No English name" if not found).
135
- - 'datastructure_id': The associated data structure ID (or "No Ref ID" if not found).
136
- None: If an XML parsing error occurs, the function returns None and prints an error message.
137
-
138
- Raises:
139
- xml.etree.ElementTree.ParseError: If the XML data cannot be parsed, an error message is printed.
140
- """
141
  xml_data = query_api(url)
142
  if xml_data:
143
  dataflows = _parse_dataflows(xml_data)
 
1
+ from typing import List, Dict, Optional, Set
2
  import requests
3
  import xml.etree.ElementTree as ET
4
  import json
 
10
  ####### json and jsonl functions ###############
11
  #################################################
12
 
13
+ def save_as_jsonl(data: list[dict], output_file: str) -> None:
14
  """
15
  Saves a list of dictionaries to a file in JSON Lines (JSONL) format.
16
 
17
  Each dictionary in the provided list `data` is serialized to a JSON object
18
+ and written to the specified file, with one JSON object per line. Non-ASCII
19
+ characters are preserved.
20
 
21
  Args:
22
  data (list): A list of dictionaries to be saved as JSON Lines.
23
  output_file (str): The path to the file where the data should be saved.
24
 
25
  Raises:
26
+ IOError: If there is an issue opening or writing to the file. Instead of
27
+ raising the exception, an error message is printed.
28
 
29
  Example:
30
  save_as_jsonl([{'key1': 'value1'}, {'key2': 'value2'}], 'output.jsonl')
 
47
  ################# API functions #######################################################
48
  ######## encapsulated functions with underscore to indicate private/internal use #######
49
  ########################################################################################
50
+ def query_api(url: str) -> Optional[str]:
51
  """
52
+ Sends a GET request to the specified URL and returns the response content as a string.
53
 
54
+ This function attempts to fetch content from the provided URL using a GET request.
55
+ If the request is successful, the content of the response is returned as a string with UTF-8 encoding.
56
+ If an error occurs (e.g., network issues, invalid URL, or non-2xx status code), the function returns `None`
57
+ and prints an error message.
58
 
59
+ Args:
60
+ url (str): The URL to which the GET request is sent. This should be a valid HTTP or HTTPS URL.
61
 
62
+ Returns:
63
+ Optional[str]: The content of the response as a string if the request is successful, or `None` if an error occurs.
 
64
 
65
+ Raises:
66
+ requests.RequestException: Raised internally for issues with the request (e.g., network problems,
67
+ non-2xx HTTP status codes). An error message is printed instead of propagating the exception.
68
+ """
69
  try:
70
  response = requests.get(url)
71
  response.raise_for_status() # Raises an HTTPError for bad responses
 
77
 
78
 
79
 
80
+ def _parse_dataflows(xml_data: str) -> Optional[List[Dict[str, Optional[str]]]]:
81
+ """
82
+ Parses dataflows from an SDMX-ML XML string and extracts relevant details.
83
+
84
+ This function processes an XML string containing SDMX-ML data, identifies `Dataflow` elements,
85
+ and extracts their attributes and names in English or Italian (if available). Each dataflow is
86
+ represented as a dictionary with keys: `dataflow_id`, `version`, `name`, and `datastructure_id`.
87
+
88
+ Args:
89
+ xml_data (str): The XML data as a string to be parsed.
90
+
91
+ Returns:
92
+ Optional[List[Dict[str, Optional[str]]]]:
93
+ A list of dictionaries representing dataflows. Each dictionary contains:
94
+ - `dataflow_id` (str or None): The ID of the dataflow.
95
+ - `version` (str or None): The version of the dataflow.
96
+ - `name` (str): The name of the dataflow in English, or fallback to Italian, or a default message.
97
+ - `datastructure_id` (str or None): The ID of the referenced datastructure.
98
+ Returns `None` if a parsing error occurs.
99
+
100
+ Raises:
101
+ ET.ParseError: If the XML data is malformed or cannot be parsed, the function catches this
102
+ exception, prints an error message, and returns `None`.
103
+ """
104
  try:
105
  root = ET.fromstring(xml_data)
106
  ns = {
 
134
  return None
135
 
136
 
137
def _filter_and_save_dataflows(
    dataflows: List[Dict[str, str]],
    filter_ids: Set[str],
    output_file: str
) -> None:
    """
    Keep only the dataflows whose ID appears in `filter_ids` and write them
    to `output_file` in JSON Lines format via `save_as_jsonl`.

    Args:
        dataflows (List[Dict[str, str]]): Dataflow dictionaries; each must
            carry a `dataflow_id` key.
        filter_ids (Set[str]): Dataflow IDs to retain in the output.
        output_file (str): Destination path for the filtered JSONL file.

    Returns:
        None

    Raises:
        KeyError: If a dataflow dictionary lacks the `dataflow_id` key.
        IOError: Propagated by `save_as_jsonl` on write problems.
    """
    selected = [flow for flow in dataflows if flow['dataflow_id'] in filter_ids]
    save_as_jsonl(selected, output_file)
166
 
167
 
168
 
169
+ def fetch_and_parse_dataflows(url: str, output_file: str) -> None:
170
+ """
171
+ Fetches XML data from a given URL, parses it to extract dataflow information, and saves the results.
172
+
173
+ This function performs the following steps:
174
+ 1. Fetches XML data from the specified URL using `query_api`.
175
+ 2. Parses the XML data to extract details about dataflows (e.g., ID, version, English name,
176
+ and associated data structure ID) using `_parse_dataflows`.
177
+ 3. Saves the parsed dataflows to the specified output file in JSON Lines format using `save_as_jsonl`.
178
+ 4. Filters and saves dataflows with specific IDs to an additional file using `_filter_and_save_dataflows`.
179
+
180
+ Args:
181
+ url (str): The URL to fetch the XML data from.
182
+ output_file (str): The path to the file where parsed dataflows should be saved in JSON Lines format.
183
+
184
+ Returns:
185
+ None: This function does not return a value.
186
+
187
+ Raises:
188
+ ET.ParseError: If the XML data cannot be parsed, an error message is printed, and no data is saved.
189
+ IOError: If there is an issue saving the dataflows to the output file.
190
+ KeyError: If a required key is missing during filtering in `_filter_and_save_dataflows`.
191
+
192
+ Notes:
193
+ - The `useful_dataflow_ids` variable must be defined externally and contain a set of dataflow IDs to filter.
194
+ - The `query_api`, `_parse_dataflows`, `save_as_jsonl`, and `_filter_and_save_dataflows` functions are assumed
195
+ to be implemented elsewhere in the module.
196
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
197
  xml_data = query_api(url)
198
  if xml_data:
199
  dataflows = _parse_dataflows(xml_data)
modules/tools.py CHANGED
@@ -1,3 +1,4 @@
 
1
  from collections import defaultdict
2
  import requests
3
  import json
@@ -7,37 +8,68 @@ current_dir = os.getcwd()
7
  sys.path.insert(0, current_dir)
8
  from modules.shared import *
9
 
10
- def assemble_locations():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
  file_names = ['_geographic_areas', '_regions', '_provinces']
12
- locations = [] # Initialize the list to store results
13
  for name in file_names:
14
  file_path = f"{ISTAT_DATA_PATH}/{name}.jsonl"
15
- with open(file_path, 'r', encoding='utf-8') as file: # Ensure proper handling of the file opening
16
  for line in file:
17
  data = json.loads(line)
18
  locations.append(data)
19
  return locations
20
 
21
 
22
- def query_api(url):
 
23
  """
24
- Sends a GET request to the specified URL and returns the response content as a string.
25
 
26
- The function attempts to fetch the content from the provided URL using a GET request.
27
- It handles HTTP errors and returns the content of the response if successful.
28
- The response encoding is explicitly set to 'utf-8'.
 
29
 
30
- Args:
31
- url (str): The URL to which the GET request is sent.
 
 
 
 
32
 
33
- Returns:
34
- str: The content of the response as a string if the request is successful.
35
- None: If an error occurs during the request, the function returns None and prints an error message.
 
36
 
37
- Raises:
38
- requests.RequestException: If there is an issue with the request, such as a network problem or
39
- a non-2xx HTTP status code, an error message is printed.
40
- """
41
  try:
42
  response = requests.get(url)
43
  response.raise_for_status() # Raises an HTTPError for bad responses
@@ -49,25 +81,37 @@ def query_api(url):
49
 
50
 
51
 
52
- def combine_ages(age_code):
53
  """
54
- Generate a string representing age codes based on the provided age_code.
 
 
 
55
 
56
  Args:
57
- age_code (str): The input code representing an age or age group. The code can be one of the following formats:
58
- - YX: Represents an exact age (e.g., "Y65").
59
- - Y_GEX: Represents ages X and over up to 100+ (e.g., "Y_GE18").
60
- - Y_UNX: Represents all ages until X (e.g., "Y_UN18").
61
- - YX-Z: Represents a range of ages from X to Z (e.g., "Y23-42").
62
 
63
  Returns:
64
- str: A string representing the sequence of age codes, joined by " + ".
65
 
66
  Rules:
67
- 1. YX -> Return the same string (e.g., "Y65" -> "Y65").
68
- 2. Y_GEX -> Generate a sequence starting from YX to Y_GE100 (e.g., "Y_GE18" -> "Y18 + Y19 + ... + Y_GE100").
69
- 3. Y_UNX -> Generate a sequence from Y0 to Y(X-1) (e.g., "Y_UN18" -> "Y0 + Y1 + ... + Y17").
70
- 4. YX-Z -> Generate a sequence from YX to YZ (e.g., "Y23-42" -> "Y23 + Y24 + ... + Y42").
 
 
 
 
 
 
 
 
 
71
  """
72
  if age_code.startswith("Y_"):
73
  if age_code.startswith("Y_GE"):
@@ -91,7 +135,28 @@ def combine_ages(age_code):
91
  return age_code
92
 
93
 
94
- def transform_age_code(age_code):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  if age_code == 'TOTAL':
96
  return 'total'
97
  elif age_code == 'Y_GE100':
@@ -103,33 +168,72 @@ def transform_age_code(age_code):
103
 
104
 
105
 
106
- def age_str_to_int(age_str):
107
  """
108
- Custom sorting function for age strings.
 
 
 
 
 
109
 
110
  Args:
111
- age_str (str): The age value as a string. This can be a numeric value (e.g., "0", "1", "99")
112
- or the special value "100+".
 
 
113
 
114
  Returns:
115
- int: A numeric value used for sorting. For the special case "100+", it returns 101
116
- to ensure that it is sorted after all other numeric ages.
117
- For the special case "TOTAL", it returns 102. For numeric values,
118
- it returns the integer equivalent of the age string.
119
-
120
- Example usage:
121
- age_key("5") -> 5
122
- age_key("100+") -> 101
 
123
  """
124
  if age_str == '100+':
125
  return 101 # Assign a high value so it sorts last
126
  if age_str.upper() == 'TOTAL':
127
  return 102
128
- return int(age_str) # Convert numeric age strings to integers
129
 
130
 
131
 
132
- def extract_and_format_data_from_xml_for_streamlit_app(xml_content):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  # Parse the XML content
134
  root = ET.fromstring(xml_content)
135
  # Define namespaces for the XML structure
@@ -176,27 +280,55 @@ def extract_and_format_data_from_xml_for_streamlit_app(xml_content):
176
  return extracted_data_sorted
177
 
178
 
179
- def fetch_population_for_locations_years_sex_age_via_sdmx(location_ids='IT', sex='9', age='TOTAL', start_period='2024-01-01',
180
- end_period='2024-12-31'):
 
 
 
 
 
181
  """
182
- Fetches population data for specific locations, time periods, and sex categories using the Istat SDMX web service.
183
 
184
- Args:
185
- location_ids (str): The geographical identifiers for the locations concatenated by '+' if multiple. Default is 'IT' for Italy.
186
- sex (str): The sex category for which data is requested. '1' for male, '2' for female, '9' for total. Can be combined with '+'. Default is '9' for total
187
- age (str): The age in years for which data is requested. From 'Y0' to 'Y99', 'Y_GE100' for 100 years and above, 'TOTAL' for total. Can be combined with '+'. Default is 'TOTAL' for total
188
 
189
- start_period (str): The start date of the period for which data is requested, formatted as 'YYYY-MM-DD'. Default is '2024-01-01'.
190
- end_period (str): The end date of the period for which data is requested, formatted as 'YYYY-MM-DD'. Default is '2024-12-31'.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
191
 
192
  Returns:
193
- list: A list of dictionaries containing the population data with reference area, time period, and observation value.
194
-
195
- Example of use:
196
- fetch_population_for_locations_years_sex_age_via_sdmx('ITC+ITE2+ITF14', '9', 'TOTAL', '2024-01-01', '2024-12-31')
197
- [{'location': 'Nord-ovest', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '15858626'},
198
- {'location': 'Umbria', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '856407'},
199
- {'location': 'Chieti', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '372640'}]
 
 
 
 
 
 
 
 
 
 
200
  """
201
  if age.upper() == "TOTAL":
202
  combined_age = age.upper()
 
1
+ from typing import List, Dict, Optional, Union
2
  from collections import defaultdict
3
  import requests
4
  import json
 
8
  sys.path.insert(0, current_dir)
9
  from modules.shared import *
10
 
11
+
12
def assemble_locations() -> List[Dict]:
    """
    Read the geographic-area, region and province JSONL files and merge
    their records into one list.

    Each file under `ISTAT_DATA_PATH` is read line by line; every line is
    deserialized as one JSON object and appended to the combined result.

    Returns:
        List[Dict]: All location records from the three input files, in
        file order.

    Raises:
        FileNotFoundError: If an expected JSONL file is missing.
        json.JSONDecodeError: If a line is not valid JSON.

    Notes:
        - `ISTAT_DATA_PATH` is a constant defined elsewhere in the project
          (imported via `modules.shared`) — TODO confirm its value at runtime.
    """
    suffixes = ('_geographic_areas', '_regions', '_provinces')
    records: List[Dict] = []
    for suffix in suffixes:
        path = f"{ISTAT_DATA_PATH}/{suffix}.jsonl"
        with open(path, 'r', encoding='utf-8') as handle:
            records.extend(json.loads(line) for line in handle)
    return records
46
 
47
 
48
+
49
+ def query_api(url: str) -> Optional[str]:
50
  """
51
+ Sends a GET request to the specified URL and returns the response content as a string.
52
 
53
+ This function attempts to fetch the content from the provided URL using a GET request. If the request
54
+ is successful, the content of the response is returned as a UTF-8 encoded string. If an error occurs
55
+ (e.g., network issues, invalid URL, or a non-2xx status code), the function prints an error message
56
+ and returns `None`.
57
 
58
+ Args:
59
+ url (str): The URL to which the GET request is sent. This should be a valid HTTP or HTTPS URL.
60
+
61
+ Returns:
62
+ Optional[str]: The content of the response as a UTF-8 encoded string if the request is successful,
63
+ or `None` if an error occurs.
64
 
65
+ Raises:
66
+ requests.RequestException: This exception is caught internally if there is an issue with the
67
+ request (e.g., network problem, invalid URL, or non-2xx HTTP status code).
68
+ Instead of propagating the exception, an error message is printed.
69
 
70
+ Notes:
71
+ - Ensure that the `requests` library is installed in your environment to use this function.
72
+ """
 
73
  try:
74
  response = requests.get(url)
75
  response.raise_for_status() # Raises an HTTPError for bad responses
 
81
 
82
 
83
 
84
+ def combine_ages(age_code: str) -> str:
85
  """
86
+ Generates a sequence of age codes based on the provided `age_code`.
87
+
88
+ This function interprets different formats of age codes and generates a sequence
89
+ of age codes as a string, joined by " + ".
90
 
91
  Args:
92
+ age_code (str): The input code representing an age or age group. Supported formats:
93
+ - `YX`: Represents an exact age (e.g., "Y65").
94
+ - `Y_GEX`: Represents ages X and over up to 100+ (e.g., "Y_GE18").
95
+ - `Y_UNX`: Represents all ages until X (e.g., "Y_UN18").
96
+ - `YX-Z`: Represents a range of ages from X to Z (e.g., "Y23-42").
97
 
98
  Returns:
99
+ str: A string representing the sequence of age codes, joined by " + ".
100
 
101
  Rules:
102
+ 1. `YX` -> Return the same string (e.g., "Y65" -> "Y65").
103
+ 2. `Y_GEX` -> Generate a sequence starting from `YX` to `Y_GE100`
104
+ (e.g., "Y_GE18" -> "Y18 + Y19 + ... + Y_GE100").
105
+ 3. `Y_UNX` -> Generate a sequence from `Y0` to `Y(X-1)`
106
+ (e.g., "Y_UN18" -> "Y0 + Y1 + ... + Y17").
107
+ 4. `YX-Z` -> Generate a sequence from `YX` to `YZ`
108
+ (e.g., "Y23-42" -> "Y23 + Y24 + ... + Y42").
109
+
110
+ Example:
111
+ combine_ages("Y_GE18") -> "Y18 + Y19 + ... + Y_GE100"
112
+ combine_ages("Y_UN18") -> "Y0 + Y1 + ... + Y17"
113
+ combine_ages("Y23-25") -> "Y23 + Y24 + Y25"
114
+ combine_ages("Y65") -> "Y65"
115
  """
116
  if age_code.startswith("Y_"):
117
  if age_code.startswith("Y_GE"):
 
135
  return age_code
136
 
137
 
138
+ def transform_age_code(age_code: str) -> Optional[str]:
139
+ """
140
+ Transforms an age code into a simplified human-readable format.
141
+
142
+ This function converts various age codes into a more user-friendly format:
143
+ - "TOTAL" becomes "total".
144
+ - "Y_GE100" becomes "100+".
145
+ - Age codes in the format "Y<number>" (e.g., "Y0", "Y99") are converted to their numeric representation (e.g., "0", "99").
146
+ - For unsupported formats, the function returns `None`.
147
+
148
+ Args:
149
+ age_code (str): The age code to be transformed.
150
+
151
+ Returns:
152
+ Optional[str]: The transformed age code, or `None` if the input is not a recognized format.
153
+
154
+ Examples:
155
+ transform_age_code("TOTAL") -> "total"
156
+ transform_age_code("Y_GE100") -> "100+"
157
+ transform_age_code("Y25") -> "25"
158
+ transform_age_code("INVALID") -> None
159
+ """
160
  if age_code == 'TOTAL':
161
  return 'total'
162
  elif age_code == 'Y_GE100':
 
168
 
169
 
170
 
171
def age_str_to_int(age_str: str) -> int:
    """
    Convert an age string into an integer sort key.

    Mapping:
        - Numeric strings (e.g. "0", "99") -> their integer value.
        - "100+" -> 101, so it sorts after every numeric age.
        - "TOTAL" (any letter case) -> 102, so it sorts last of all.

    Args:
        age_str (str): A numeric age, "100+", or "TOTAL".

    Returns:
        int: The sort key described above.

    Examples:
        age_str_to_int("5") -> 5
        age_str_to_int("100+") -> 101
        age_str_to_int("TOTAL") -> 102
    """
    # Sentinel values get fixed high ranks; everything else must be numeric.
    special_ranks = {'100+': 101, 'TOTAL': 102}
    rank = special_ranks.get(age_str.upper())
    return rank if rank is not None else int(age_str)
202
 
203
 
204
 
205
+ def extract_and_format_data_from_xml_for_streamlit_app(xml_content: str) -> List[Dict[str, str]]:
206
+ """
207
+ Extracts and formats data from an SDMX-ML XML document for use in a Streamlit app.
208
+
209
+ This function parses XML content, extracts demographic data (e.g., location, sex, age, time period, and population),
210
+ and formats it into a list of dictionaries sorted by time period, location, and age.
211
+
212
+ Args:
213
+ xml_content (str): The XML content as a string to be parsed.
214
+
215
+ Returns:
216
+ List[Dict[str, str]]: A list of dictionaries representing the extracted and formatted data.
217
+ Each dictionary contains the following keys:
218
+ - `location` (str): The name of the location.
219
+ - `sex` (str): The descriptive sex (e.g., "Male", "Female", "Total").
220
+ - `age (years)` (str): The age group or exact age as a human-readable string.
221
+ - `time period` (str): The time period of the observation.
222
+ - `population` (str): The observed population value.
223
+
224
+ Raises:
225
+ ET.ParseError: If the XML content cannot be parsed.
226
+ KeyError: If required fields are missing in the XML structure.
227
+
228
+ Notes:
229
+ - The `assemble_locations` function must be defined to provide a location dictionary.
230
+ - The `transform_age_code` function is used to convert age codes into human-readable descriptions.
231
+ - The `age_str_to_int` function is used to ensure proper sorting of age strings.
232
+
233
+ Example:
234
+ extract_and_format_data_from_xml_for_streamlit_app(xml_data)
235
+ -> [{'location': 'Italy', 'sex': 'Male', 'age (years)': '0', 'time period': '2020', 'population': '10000'}, ...]
236
+ """
237
  # Parse the XML content
238
  root = ET.fromstring(xml_content)
239
  # Define namespaces for the XML structure
 
280
  return extracted_data_sorted
281
 
282
 
283
+ def fetch_population_for_locations_years_sex_age_via_sdmx(
284
+ location_ids: str = 'IT',
285
+ sex: str = '9',
286
+ age: str = 'TOTAL',
287
+ start_period: str = '2024-01-01',
288
+ end_period: str = '2024-12-31'
289
+ ) -> Optional[List[Dict[str, str]]]:
290
  """
291
+ Fetches population data for specific locations, time periods, and demographics using the Istat SDMX web service.
292
 
293
+ This function constructs a query URL based on the provided parameters and retrieves population data
294
+ in XML format. The data is parsed, formatted, and returned as a list of dictionaries.
 
 
295
 
296
+ Args:
297
+ location_ids (str): Geographical identifiers for the locations, concatenated by '+' if multiple.
298
+ Default is 'IT' for Italy.
299
+ sex (str): The sex category for which data is requested. Options:
300
+ - '1': Male
301
+ - '2': Female
302
+ - '9': Total
303
+ Multiple values can be combined with '+' (e.g., '1+2'). Default is '9'.
304
+ age (str): The age category for which data is requested. Options:
305
+ - 'Y0' to 'Y99': Specific ages
306
+ - 'Y_GE100': 100 years and above
307
+ - 'TOTAL': Total (all ages)
308
+ Multiple values can be combined with '+'. Default is 'TOTAL'.
309
+ start_period (str): The start date of the period for which data is requested, formatted as 'YYYY-MM-DD'.
310
+ Default is '2024-01-01'.
311
+ end_period (str): The end date of the period for which data is requested, formatted as 'YYYY-MM-DD'.
312
+ Default is '2024-12-31'.
313
 
314
  Returns:
315
+ Optional[List[Dict[str, str]]]: A list of dictionaries containing population data. Each dictionary includes:
316
+ - `location`: The name of the location.
317
+ - `sex`: The demographic category for sex (e.g., "Male", "Female", "Total").
318
+ - `age`: The age group or category.
319
+ - `time period`: The year of the observation.
320
+ - `population`: The observed population value.
321
+ Returns `None` if the query fails.
322
+
323
+ Example:
324
+ fetch_population_for_locations_years_sex_age_via_sdmx(
325
+ 'ITC+ITE2+ITF14', '9', 'TOTAL', '2024-01-01', '2024-12-31'
326
+ )
327
+ -> [
328
+ {'location': 'Nord-ovest', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '15858626'},
329
+ {'location': 'Umbria', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '856407'},
330
+ {'location': 'Chieti', 'sex': 'Total', 'age': 'Total', 'time period': '2024', 'population': '372640'}
331
+ ]
332
  """
333
  if age.upper() == "TOTAL":
334
  combined_age = age.upper()
modules/utils.py CHANGED
@@ -1,25 +1,59 @@
 
1
  import json
2
 
3
 
4
 
5
- def remove_newlines_and_spaces(s):
6
- # Replace newlines with empty strings
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  s = s.replace('\n', '')
8
- # Remove all spaces
9
  s = s.replace(' ', '')
10
  return s
11
 
12
 
13
- def read_jsonl_file(file_path, type='str'):
14
  """
15
- Fetches a list of ISTAT datasets from a JSONL file and returns the data as a formatted JSON string.
 
 
 
 
16
 
17
- This function reads a JSON Lines (JSONL) file. It processes each line in the file,
18
- which represents a JSON object, and compiles these objects into a list. The list is then
19
- converted into a JSON string with pretty formatting (indented by 2 spaces) and returned.
 
 
20
 
21
  Returns:
22
- str: A JSON string representing the list of datasets from the JSONL file.
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  """
24
  data_list = []
25
  with open(file_path, 'r', encoding='utf-8') as file:
 
1
+ from typing import Union, List, Dict
2
  import json
3
 
4
 
5
 
6
def remove_newlines_and_spaces(s: str) -> str:
    """
    Remove every newline and space character from the input string.

    Args:
        s (str): The string to clean.

    Returns:
        str: `s` with all `'\n'` and `' '` characters deleted; other
        whitespace (tabs, `'\r'`) is left untouched.

    Examples:
        remove_newlines_and_spaces("Hello\nWorld") -> "HelloWorld"
        remove_newlines_and_spaces(" Python Programming ") -> "PythonProgramming"
    """
    # One C-level pass instead of two chained .replace() calls.
    return s.translate(str.maketrans('', '', '\n '))
26
 
27
 
28
+ def read_jsonl_file(file_path: str, type: str = 'str') -> Union[str, List[Dict]]:
29
  """
30
+ Reads data from a JSON Lines (JSONL) file and returns it in the specified format.
31
+
32
+ This function processes a JSONL file, where each line represents a JSON object. It compiles
33
+ these objects into a list. Depending on the `type` parameter, the function either returns
34
+ a formatted JSON string (with all newlines and spaces removed) or a list of dictionaries.
35
 
36
+ Args:
37
+ file_path (str): The path to the JSONL file to be read.
38
+ type (str): Specifies the return format. Options:
39
+ - `'str'`: Returns the data as a single JSON string (default).
40
+ - `'list'`: Returns the data as a list of dictionaries.
41
 
42
  Returns:
43
+ Union[str, List[Dict]]: The processed data from the JSONL file:
44
+ - If `type='str'`, returns a JSON string with all newlines and spaces removed.
45
+ - If `type='list'`, returns a list of dictionaries.
46
+
47
+ Raises:
48
+ FileNotFoundError: If the file at `file_path` does not exist.
49
+ json.JSONDecodeError: If a line in the file cannot be parsed as valid JSON.
50
+
51
+ Examples:
52
+ read_jsonl_file("data.jsonl", type="str")
53
+ -> '{"key1":"value1","key2":"value2"}'
54
+
55
+ read_jsonl_file("data.jsonl", type="list")
56
+ -> [{"key1": "value1"}, {"key2": "value2"}]
57
  """
58
  data_list = []
59
  with open(file_path, 'r', encoding='utf-8') as file: