{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# Data Cleaning Workflow\n",
        "\n",
        "This Jupyter Notebook demonstrates the process of cleaning a dataset. The steps are: loading the dataset from a CSV file, removing garbage rows (rows whose `Unnamed: 0.1` value is not numeric, and rows whose `nct_id` is not `NCT` followed by digits), and saving the cleaned dataset to a new CSV file."
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Step 1: Import necessary libraries"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {
        "id": "jyDxPzc_zI5U"
      },
      "outputs": [],
      "source": [
        "import pandas as pd\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Step 2: Load the dataset\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 335
        },
        "id": "9Lh3U2P5zJZp",
        "outputId": "d390e6e4-364d-4140-fd13-4795f189e26a"
      },
      "outputs": [],
      "source": [
        "# Read the merged CSV (path is relative to the working directory) into a DataFrame\n",
        "file_path = \"./usecase_1_merged.csv\"\n",
        "df = pd.read_csv(filepath_or_buffer=file_path)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Step 3: Remove garbage value rows based on 'Unnamed: 0.1'"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 146
        },
        "id": "pHkNZFlmzMIo",
        "outputId": "0c28cdb8-75f5-49ec-b37d-328867a3d00d"
      },
      "outputs": [],
      "source": [
        "# Keep only the rows whose 'Unnamed: 0.1' value parses as a number.\n",
        "# errors='coerce' turns anything unparseable into NaN, so the mask is False for\n",
        "# garbage values (and for values that were already missing).\n",
        "is_numeric = pd.to_numeric(df['Unnamed: 0.1'], errors='coerce').notna()\n",
        "df = df[is_numeric]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Step 4: Remove garbage value rows based on 'nct_id'"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 146
        },
        "id": "MBdaKv2gzUWA",
        "outputId": "b9923f80-01e2-461b-fd7f-a36db9d36332"
      },
      "outputs": [],
      "source": [
        "# Keep only the rows whose 'nct_id' is exactly \"NCT\" followed by one or more digits.\n",
        "# fullmatch is used rather than match with '^...$' because '$' also matches just\n",
        "# before a trailing newline, which would let values ending in a line break through.\n",
        "# na=False treats missing / non-string values as non-matching, so they are dropped.\n",
        "df = df[df['nct_id'].str.fullmatch(r'NCT\\d+', na=False)]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "## Step 5: Save the cleaned dataset"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "b3QaaKdhzU2O"
      },
      "outputs": [],
      "source": [
        "# Write the cleaned rows to a new CSV file; index=False keeps the DataFrame\n",
        "# index out of the output.\n",
        "cleaned_file_path = \"usecase_1_merged_cleaned.csv\"\n",
        "df.to_csv(path_or_buf=cleaned_file_path, index=False)\n",
        "\n",
        "print(f\"Cleaned data saved to {cleaned_file_path}\")"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}